-
-
Notifications
You must be signed in to change notification settings - Fork 31.8k
gh-69753: Add Bytes Object Support to Shlex #22657
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d5242f7
f90f33b
ab48481
a00ce01
571d7c9
bfb76be
473c81d
f8f2c06
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -22,6 +22,10 @@ def __init__(self, instream=None, infile=None, posix=False, | |||
punctuation_chars=False): | ||||
if isinstance(instream, str): | ||||
instream = StringIO(instream) | ||||
elif isinstance(instream, bytes): | ||||
# convert byte instreams to string | ||||
instream = StringIO(instream.decode("ascii", "surrogateescape")) | ||||
|
||||
if instream is not None: | ||||
self.instream = instream | ||||
self.infile = infile | ||||
|
@@ -312,26 +316,78 @@ def split(s, comments=False, posix=True): | |||
lex.whitespace_split = True | ||||
if not comments: | ||||
lex.commenters = '' | ||||
return list(lex) | ||||
|
||||
if isinstance(s, bytes): | ||||
return [i.encode("ascii") for i in lex] | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, this should not be restricted to ASCII characters only. |
||||
else: | ||||
return list(lex) | ||||
|
||||
|
||||
def join(split_command): | ||||
"""Return a shell-escaped string from *split_command*.""" | ||||
return ' '.join(quote(arg) for arg in split_command) | ||||
if len(split_command) == 0: | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This behaviour should be documented, saying that a string will be returned instead of bytes for an empty sequence of bytes. Or maybe add an additional a separate function which only accepts bytes inputs (less code and no warnings and more efficient). |
||||
return "" | ||||
|
||||
# Ensure all objects are the same type as the first one, | ||||
# otherwise convert and raise a warning | ||||
cleaned = [] | ||||
warned = False | ||||
|
||||
for command in split_command: | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it is too complicated. The str-version assumes that the objects are all strings, namely that |
||||
if not isinstance(command, type(split_command[0])): | ||||
# Check if user was warned not to mix types, | ||||
# warn otherwise. | ||||
if not warned: | ||||
import warnings | ||||
warnings.warn("All objects passed to join must be of the same type." | ||||
" Converting all to {}.".format(type(split_command[0]).__name__), | ||||
BytesWarning, stacklevel=2) | ||||
|
||||
warned = True | ||||
|
||||
_find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search | ||||
# Convert object to opposite type | ||||
if isinstance(command, bytes): | ||||
command = command.decode("ascii", "surrogateescape") | ||||
else: | ||||
command = command.encode("ascii") | ||||
|
||||
cleaned.append(command) | ||||
|
||||
# Return a joined string or bytes object | ||||
if isinstance(cleaned[0], bytes): | ||||
return b' '.join(quote(arg) for arg in cleaned) | ||||
else: | ||||
return ' '.join(quote(arg) for arg in cleaned) | ||||
|
||||
|
||||
_find_unsafe_string = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search | ||||
_find_unsafe_bytes = re.compile(rb'[^\w@%+=:,./-]').search | ||||
|
||||
def quote(s): | ||||
"""Return a shell-escaped version of the string *s*.""" | ||||
if not s: | ||||
return "''" | ||||
if _find_unsafe(s) is None: | ||||
return s | ||||
|
||||
# use single quotes, and put single quotes into double quotes | ||||
# the string $'b is then quoted as '$'"'"'b' | ||||
return "'" + s.replace("'", "'\"'\"'") + "'" | ||||
# determine if s is bytes or string object, | ||||
# and check for unsafe characters | ||||
if isinstance(s, bytes): | ||||
if not s: | ||||
return b"''" | ||||
|
||||
if _find_unsafe_bytes(s) is None: | ||||
return s | ||||
|
||||
# use single quotes, and put single quotes into double quotes | ||||
# the string $'b is then quoted as '$'"'"'b' | ||||
return b"'" + s.replace(b"'", b"'\"'\"'") + b"'" | ||||
|
||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||
else: | ||||
if not s: | ||||
return "''" | ||||
|
||||
if _find_unsafe_string(s) is None: | ||||
return s | ||||
|
||||
# use single quotes, and put single quotes into double quotes | ||||
# the string $'b is then quoted as '$'"'"'b' | ||||
return "'" + s.replace("'", "'\"'\"'") + "'" | ||||
|
||||
|
||||
def _print_tokens(lexer): | ||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -171,6 +171,10 @@ def testSplitPosix(self): | |
"""Test data splitting with posix parser""" | ||
self.splitTest(self.posix_data, comments=True) | ||
|
||
def testSplitBytes(self): | ||
"""Test byte objects splitting""" | ||
self.assertEqual(shlex.split(b"split words"), [b"split", b"words"]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The coverage is insufficient. Use the In addition, use more than just ASCII characters but also those that are supported by the POSIX platforms. |
||
|
||
def testCompat(self): | ||
"""Test compatibility interface""" | ||
for i in range(len(self.data)): | ||
|
@@ -339,6 +343,23 @@ def testQuote(self): | |
self.assertEqual(shlex.quote("test%s'name'" % u), | ||
"'test%s'\"'\"'name'\"'\"''" % u) | ||
|
||
def testQuoteBytes(self): | ||
"""Test quoting of byte objects""" | ||
# Copied from testQuote | ||
safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./' | ||
unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s | ||
unsafe = '"`$\\!' + unicode_sample | ||
|
||
self.assertEqual(shlex.quote(b''), b"''") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would be good to test some bytestrings with non-ASCII characters, to make sure that they're handled correctly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added the missing unicode characters from the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add all characters that are supported on POSIX (not just three of them): unicode_chars = ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ') (I'll let you CC the chars since I am not sure about my own CC) |
||
self.assertEqual(shlex.quote(safeunquoted.encode("ascii")), safeunquoted.encode("ascii")) | ||
self.assertEqual(shlex.quote(b'test file name'), b"'test file name'") | ||
for u in unsafe: | ||
self.assertEqual(shlex.quote(('test%sname' % u).encode("utf-8")), | ||
("'test%sname'" % u).encode("utf-8")) | ||
for u in unsafe: | ||
self.assertEqual(shlex.quote(("test%s'name'" % u).encode("utf-8")), | ||
("'test%s'\"'\"'name'\"'\"''" % u).encode("utf-8")) | ||
|
||
def testJoin(self): | ||
for split_command, command in [ | ||
(['a ', 'b'], "'a ' b"), | ||
|
@@ -350,6 +371,9 @@ def testJoin(self): | |
joined = shlex.join(split_command) | ||
self.assertEqual(joined, command) | ||
|
||
def testEmptyJoin(self): | ||
self.assertEqual(shlex.join([]), "") | ||
|
||
def testJoinRoundtrip(self): | ||
all_data = self.data + self.posix_data | ||
for command, *split_command in all_data: | ||
|
@@ -358,6 +382,46 @@ def testJoinRoundtrip(self): | |
resplit = shlex.split(joined) | ||
self.assertEqual(split_command, resplit) | ||
|
||
def testJoinBytes(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use the same dataset as for those used in strings, especially with quotation symbols or unsafe characters. |
||
self.assertEqual(shlex.join([b"Join", b"me"]), b"Join me") | ||
self.assertEqual(shlex.join([b"Just_me"]), b"Just_me") | ||
|
||
def testJoinBytesAndStrings(self): | ||
"""Test join can handle combinations of string and byte objects""" | ||
# String then bytes | ||
with self.assertWarns(BytesWarning, | ||
msg="All objects passed to join must be of the same type." | ||
"Converting all to str."): | ||
self.assertEqual(shlex.join(["str_object", b"byte_object"]), "str_object byte_object") | ||
|
||
# Bytes then string | ||
with self.assertWarns(BytesWarning, | ||
msg="All objects passed to join must be of the same type." | ||
"Converting all to bytes."): | ||
self.assertEqual(shlex.join([b"byte_object", "str_object"]), b"byte_object str_object") | ||
|
||
# Random combination | ||
with self.assertWarns(BytesWarning): | ||
import random | ||
|
||
words = "This is a fully formed sentence, to test the join functionality." | ||
|
||
new_words = [] | ||
for word in words.split(" "): | ||
if bool(random.randint(0, 1)): | ||
new_words.append(word.encode("ascii")) | ||
else: | ||
new_words.append(word) | ||
|
||
# ensure at least one bytes object to raise warning | ||
if isinstance(new_words[2], str): | ||
new_words[2] = new_words[2].encode("ascii") | ||
|
||
if isinstance(new_words[0], bytes): | ||
self.assertEqual(shlex.join(new_words), words.encode("ascii")) | ||
else: | ||
self.assertEqual(shlex.join(new_words), words) | ||
|
||
def testPunctuationCharsReadOnly(self): | ||
punctuation_chars = "/|$%^" | ||
shlex_instance = shlex.shlex(punctuation_chars=punctuation_chars) | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1 @@ | ||||||
Add support for bytes objects in the shlex module. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
You should probably explain that |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why would you only support ASCII strings? for POSIX platforms, you should support also additional characters (see wordchars being extended).