ปัญหาหลักของวิธี `shlex` ที่เป็นคำตอบที่ได้รับการยอมรับ คือมันไม่ละเว้นอักขระ escape ที่อยู่นอกสตริงย่อยในเครื่องหมายคำพูด และให้ผลลัพธ์ที่ไม่คาดคิดเล็กน้อยในบางกรณี
ฉันมีกรณีการใช้งานดังต่อไปนี้ ซึ่งต้องการฟังก์ชันที่แยกสตริงอินพุตโดยคงสตริงย่อยที่อยู่ในเครื่องหมายคำพูดเดี่ยวหรือเครื่องหมายคำพูดคู่ไว้เป็นชิ้นเดียว และสามารถ escape เครื่องหมายคำพูดภายในสตริงย่อยนั้นได้ ส่วนเครื่องหมายคำพูดที่อยู่ในสตริงที่ไม่ได้ครอบด้วยเครื่องหมายคำพูด ไม่ควรได้รับการปฏิบัติแตกต่างจากอักขระอื่น ๆ ตัวอย่างกรณีทดสอบพร้อมผลลัพธ์ที่คาดหวัง:
สตริงอินพุต          | ผลลัพธ์ที่คาดหวัง
===============================================
'abc def'            | ['abc', 'def']
"abc \\s def"        | ['abc', '\\s', 'def']
'"abc def" ghi'      | ['abc def', 'ghi']
"'abc def' ghi"      | ['abc def', 'ghi']
'"abc \\" def" ghi'  | ['abc " def', 'ghi']
"'abc \\' def' ghi"  | ["abc ' def", 'ghi']
"'abc \\s def' ghi"  | ['abc \\s def', 'ghi']
'"abc \\s def" ghi'  | ['abc \\s def', 'ghi']
'"" test'            | ['', 'test']
"'' test"            | ['', 'test']
"abc'def"            | ["abc'def"]
"abc'def'"           | ["abc'def'"]
"abc'def' ghi"       | ["abc'def'", 'ghi']
"abc'def'ghi"        | ["abc'def'ghi"]
'abc"def'            | ['abc"def']
'abc"def"'           | ['abc"def"']
'abc"def" ghi'       | ['abc"def"', 'ghi']
'abc"def"ghi'        | ['abc"def"ghi']
"r'AA' r'.*_xyz$'"   | ["r'AA'", "r'.*_xyz$'"]
สุดท้ายฉันได้ฟังก์ชันต่อไปนี้สำหรับแยกสตริง ซึ่งให้ผลลัพธ์ตามที่คาดหวังสำหรับสตริงอินพุตทุกตัว:
import re
def quoted_split(s):
    """Split *s* on whitespace, keeping quoted substrings together.

    A token is either a double-quoted run, a single-quoted run (both may
    contain backslash-escaped characters), or a run of non-whitespace
    characters. The enclosing quotes of a quoted token are removed, and
    backslash-escaped quote characters are replaced by the bare quote.

    Returns a list of token strings (empty list for an empty input).
    """
    def strip_quotes(token):
        # Require at least two characters: a lone quote character is both
        # the first and the last character of itself, and stripping it
        # would wrongly turn the token into an empty string.
        if len(token) >= 2 and token[0] in ('"', "'") and token[0] == token[-1]:
            return token[1:-1]
        return token

    tokens = re.findall(r'"(?:\\.|[^"])*"|\'(?:\\.|[^\'])*\'|[^\s]+', s)
    return [strip_quotes(t).replace('\\"', '"').replace("\\'", "'")
            for t in tokens]
โปรแกรมทดสอบต่อไปนี้ตรวจสอบผลลัพธ์ของวิธีอื่น ๆ (ตอนนี้มี `shlex` และ `csv`) และของฟังก์ชันแยกสตริงที่เขียนขึ้นเอง:
#!/bin/python2.7
import csv
import re
import shlex
from timeit import timeit
def test_case(fn, s, expected):
try:
if fn(s) == expected:
print '[ OK ] %s -> %s' % (s, fn(s))
else:
print '[FAIL] %s -> %s' % (s, fn(s))
except Exception as e:
print '[FAIL] %s -> exception: %s' % (s, e)
def test_case_no_output(fn, s, expected):
try:
fn(s)
except:
pass
def test_split(fn, test_case_fn=test_case):
    """Feed every split test case to *test_case_fn* for the splitter *fn*.

    Each case is an (input string, expected token list) pair; the cases
    are passed to ``test_case_fn(fn, input, expected)`` in the order
    listed below.
    """
    cases = (
        ('abc def', ['abc', 'def']),
        ("abc \\s def", ['abc', '\\s', 'def']),
        ('"abc def" ghi', ['abc def', 'ghi']),
        ("'abc def' ghi", ['abc def', 'ghi']),
        ('"abc \\" def" ghi', ['abc " def', 'ghi']),
        ("'abc \\' def' ghi", ["abc ' def", 'ghi']),
        ("'abc \\s def' ghi", ['abc \\s def', 'ghi']),
        ('"abc \\s def" ghi', ['abc \\s def', 'ghi']),
        ('"" test', ['', 'test']),
        ("'' test", ['', 'test']),
        ("abc'def", ["abc'def"]),
        ("abc'def'", ["abc'def'"]),
        ("abc'def' ghi", ["abc'def'", 'ghi']),
        ("abc'def'ghi", ["abc'def'ghi"]),
        ('abc"def', ['abc"def']),
        ('abc"def"', ['abc"def"']),
        ('abc"def" ghi', ['abc"def"', 'ghi']),
        ('abc"def"ghi', ['abc"def"ghi']),
        ("r'AA' r'.*_xyz$'", ["r'AA'", "r'.*_xyz$'"]),
    )
    for text, wanted in cases:
        test_case_fn(fn, text, wanted)
def csv_split(s):
    """Split *s* with the ``csv`` module, using a space as the delimiter.

    *s* is handed to ``csv.reader`` as a single input line, and the first
    parsed row is returned.
    """
    reader = csv.reader([s], delimiter=' ')
    rows = list(reader)
    return rows[0]
def re_split(s):
    """Split *s* on whitespace with a regex, keeping quoted substrings whole.

    A token is either a double-quoted run, a single-quoted run (both may
    contain backslash-escaped characters), or a run of non-whitespace
    characters. The enclosing quotes of a quoted token are removed, and
    backslash-escaped quote characters are replaced by the bare quote.

    Returns a list of token strings (empty list for an empty input).
    """
    def strip_quotes(token):
        # Require at least two characters: a lone quote character is both
        # the first and the last character of itself, and stripping it
        # would wrongly turn the token into an empty string.
        if len(token) >= 2 and token[0] in ('"', "'") and token[0] == token[-1]:
            return token[1:-1]
        return token

    tokens = re.findall(r'"(?:\\.|[^"])*"|\'(?:\\.|[^\'])*\'|[^\s]+', s)
    return [strip_quotes(t).replace('\\"', '"').replace("\\'", "'")
            for t in tokens]
if __name__ == '__main__':
    # Correctness pass: print the per-case result for each splitter.
    # print(...) with a single string argument produces identical output
    # on Python 2 and Python 3; print('') replaces the bare Python 2
    # `print` statement that emitted an empty line.
    print('shlex\n')
    test_split(shlex.split)
    print('')

    print('csv\n')
    test_split(csv_split)
    print('')

    print('re\n')
    test_split(re_split)
    print('')

    # Timing pass: run the whole test suite `iterations` times per
    # splitter, using the silent test-case function.
    iterations = 100
    setup = 'from __main__ import test_split, test_case_no_output, csv_split, re_split\nimport shlex, re'

    def benchmark(method, code):
        """Time *code* with timeit and print the mean time per iteration in ms."""
        elapsed = timeit(code, setup=setup, number=iterations)
        print('%s: %.3fms per iteration' % (method, (1000 * elapsed / iterations)))

    benchmark('shlex', 'test_split(shlex.split, test_case_no_output)')
    benchmark('csv', 'test_split(csv_split, test_case_no_output)')
    benchmark('re', 'test_split(re_split, test_case_no_output)')
เอาท์พุท:
shlex
[ OK ] abc def -> ['abc', 'def']
[FAIL] abc \s def -> ['abc', 's', 'def']
[ OK ] "abc def" ghi -> ['abc def', 'ghi']
[ OK ] 'abc def' ghi -> ['abc def', 'ghi']
[ OK ] "abc \" def" ghi -> ['abc " def', 'ghi']
[FAIL] 'abc \' def' ghi -> exception: No closing quotation
[ OK ] 'abc \s def' ghi -> ['abc \\s def', 'ghi']
[ OK ] "abc \s def" ghi -> ['abc \\s def', 'ghi']
[ OK ] "" test -> ['', 'test']
[ OK ] '' test -> ['', 'test']
[FAIL] abc'def -> exception: No closing quotation
[FAIL] abc'def' -> ['abcdef']
[FAIL] abc'def' ghi -> ['abcdef', 'ghi']
[FAIL] abc'def'ghi -> ['abcdefghi']
[FAIL] abc"def -> exception: No closing quotation
[FAIL] abc"def" -> ['abcdef']
[FAIL] abc"def" ghi -> ['abcdef', 'ghi']
[FAIL] abc"def"ghi -> ['abcdefghi']
[FAIL] r'AA' r'.*_xyz$' -> ['rAA', 'r.*_xyz$']
csv
[ OK ] abc def -> ['abc', 'def']
[ OK ] abc \s def -> ['abc', '\\s', 'def']
[ OK ] "abc def" ghi -> ['abc def', 'ghi']
[FAIL] 'abc def' ghi -> ["'abc", "def'", 'ghi']
[FAIL] "abc \" def" ghi -> ['abc \\', 'def"', 'ghi']
[FAIL] 'abc \' def' ghi -> ["'abc", "\\'", "def'", 'ghi']
[FAIL] 'abc \s def' ghi -> ["'abc", '\\s', "def'", 'ghi']
[ OK ] "abc \s def" ghi -> ['abc \\s def', 'ghi']
[ OK ] "" test -> ['', 'test']
[FAIL] '' test -> ["''", 'test']
[ OK ] abc'def -> ["abc'def"]
[ OK ] abc'def' -> ["abc'def'"]
[ OK ] abc'def' ghi -> ["abc'def'", 'ghi']
[ OK ] abc'def'ghi -> ["abc'def'ghi"]
[ OK ] abc"def -> ['abc"def']
[ OK ] abc"def" -> ['abc"def"']
[ OK ] abc"def" ghi -> ['abc"def"', 'ghi']
[ OK ] abc"def"ghi -> ['abc"def"ghi']
[ OK ] r'AA' r'.*_xyz$' -> ["r'AA'", "r'.*_xyz$'"]
re
[ OK ] abc def -> ['abc', 'def']
[ OK ] abc \s def -> ['abc', '\\s', 'def']
[ OK ] "abc def" ghi -> ['abc def', 'ghi']
[ OK ] 'abc def' ghi -> ['abc def', 'ghi']
[ OK ] "abc \" def" ghi -> ['abc " def', 'ghi']
[ OK ] 'abc \' def' ghi -> ["abc ' def", 'ghi']
[ OK ] 'abc \s def' ghi -> ['abc \\s def', 'ghi']
[ OK ] "abc \s def" ghi -> ['abc \\s def', 'ghi']
[ OK ] "" test -> ['', 'test']
[ OK ] '' test -> ['', 'test']
[ OK ] abc'def -> ["abc'def"]
[ OK ] abc'def' -> ["abc'def'"]
[ OK ] abc'def' ghi -> ["abc'def'", 'ghi']
[ OK ] abc'def'ghi -> ["abc'def'ghi"]
[ OK ] abc"def -> ['abc"def']
[ OK ] abc"def" -> ['abc"def"']
[ OK ] abc"def" ghi -> ['abc"def"', 'ghi']
[ OK ] abc"def"ghi -> ['abc"def"ghi']
[ OK ] r'AA' r'.*_xyz$' -> ["r'AA'", "r'.*_xyz$'"]
shlex: 0.281ms per iteration
csv: 0.030ms per iteration
re: 0.049ms per iteration
ดังนั้นประสิทธิภาพจึงดีกว่า `shlex` และยังปรับปรุงได้อีกด้วยการคอมไพล์นิพจน์ทั่วไป (regular expression) ไว้ล่วงหน้า ซึ่งในกรณีนั้นจะเร็วกว่าวิธี `csv`