Skip to content

gh-69753: Add Bytes Object Support to Shlex #22657

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 67 additions & 11 deletions Lib/shlex.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ def __init__(self, instream=None, infile=None, posix=False,
punctuation_chars=False):
if isinstance(instream, str):
instream = StringIO(instream)
elif isinstance(instream, bytes):
# convert byte instreams to string
instream = StringIO(instream.decode("ascii", "surrogateescape"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would you only support ASCII strings? for POSIX platforms, you should support also additional characters (see wordchars being extended).


if instream is not None:
self.instream = instream
self.infile = infile
Expand Down Expand Up @@ -312,26 +316,78 @@ def split(s, comments=False, posix=True):
lex.whitespace_split = True
if not comments:
lex.commenters = ''
return list(lex)

if isinstance(s, bytes):
return [i.encode("ascii") for i in lex]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, this should not be restricted to ASCII characters only.

else:
return list(lex)


def join(split_command):
"""Return a shell-escaped string from *split_command*."""
return ' '.join(quote(arg) for arg in split_command)
if len(split_command) == 0:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This behaviour should be documented, saying that a string will be returned instead of bytes for an empty sequence of bytes. Or maybe add an additional a separate function which only accepts bytes inputs (less code and no warnings and more efficient).

return ""

# Ensure all objects are the same type as the first one,
# otherwise convert and raise a warning
cleaned = []
warned = False

for command in split_command:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is too complicated. The str-version assumes that the objects are all strings, namely that split_command is an iterable of strings.

if not isinstance(command, type(split_command[0])):
# Check if user was warned not to mix types,
# warn otherwise.
if not warned:
import warnings
warnings.warn("All objects passed to join must be of the same type."
" Converting all to {}.".format(type(split_command[0]).__name__),
BytesWarning, stacklevel=2)

warned = True

_find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search
# Convert object to opposite type
if isinstance(command, bytes):
command = command.decode("ascii", "surrogateescape")
else:
command = command.encode("ascii")

cleaned.append(command)

# Return a joined string or bytes object
if isinstance(cleaned[0], bytes):
return b' '.join(quote(arg) for arg in cleaned)
else:
return ' '.join(quote(arg) for arg in cleaned)


_find_unsafe_string = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search
_find_unsafe_bytes = re.compile(rb'[^\w@%+=:,./-]').search

def quote(s):
"""Return a shell-escaped version of the string *s*."""
if not s:
return "''"
if _find_unsafe(s) is None:
return s

# use single quotes, and put single quotes into double quotes
# the string $'b is then quoted as '$'"'"'b'
return "'" + s.replace("'", "'\"'\"'") + "'"
# determine if s is bytes or string object,
# and check for unsafe characters
if isinstance(s, bytes):
if not s:
return b"''"

if _find_unsafe_bytes(s) is None:
return s

# use single quotes, and put single quotes into double quotes
# the string $'b is then quoted as '$'"'"'b'
return b"'" + s.replace(b"'", b"'\"'\"'") + b"'"

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

else:
if not s:
return "''"

if _find_unsafe_string(s) is None:
return s

# use single quotes, and put single quotes into double quotes
# the string $'b is then quoted as '$'"'"'b'
return "'" + s.replace("'", "'\"'\"'") + "'"


def _print_tokens(lexer):
Expand Down
64 changes: 64 additions & 0 deletions Lib/test/test_shlex.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ def testSplitPosix(self):
"""Test data splitting with posix parser"""
self.splitTest(self.posix_data, comments=True)

def testSplitBytes(self):
"""Test byte objects splitting"""
self.assertEqual(shlex.split(b"split words"), [b"split", b"words"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The coverage is insufficient. Use the self.splitTest as for the string case but do it for bytes inputs instead.

In addition, use more than just ASCII characters but also those that are supported by the POSIX platforms.


def testCompat(self):
"""Test compatibility interface"""
for i in range(len(self.data)):
Expand Down Expand Up @@ -339,6 +343,23 @@ def testQuote(self):
self.assertEqual(shlex.quote("test%s'name'" % u),
"'test%s'\"'\"'name'\"'\"''" % u)

def testQuoteBytes(self):
"""Test quoting of byte objects"""
# Copied from testQuote
safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s
unsafe = '"`$\\!' + unicode_sample

self.assertEqual(shlex.quote(b''), b"''")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to test some bytestrings with non-ASCII characters, to make sure that they're handled correctly.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the missing unicode characters from the testQuote test, and changes the encoding to utf-8 to account for the changes. Added in bfb76be.

Copy link
Member

@picnixz picnixz Jul 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add all characters that are supported on POSIX (not just three of them):

unicode_chars = ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
 				 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')

(I'll let you CC the chars since I am not sure about my own CC)

self.assertEqual(shlex.quote(safeunquoted.encode("ascii")), safeunquoted.encode("ascii"))
self.assertEqual(shlex.quote(b'test file name'), b"'test file name'")
for u in unsafe:
self.assertEqual(shlex.quote(('test%sname' % u).encode("utf-8")),
("'test%sname'" % u).encode("utf-8"))
for u in unsafe:
self.assertEqual(shlex.quote(("test%s'name'" % u).encode("utf-8")),
("'test%s'\"'\"'name'\"'\"''" % u).encode("utf-8"))

def testJoin(self):
for split_command, command in [
(['a ', 'b'], "'a ' b"),
Expand All @@ -350,6 +371,9 @@ def testJoin(self):
joined = shlex.join(split_command)
self.assertEqual(joined, command)

def testEmptyJoin(self):
self.assertEqual(shlex.join([]), "")

def testJoinRoundtrip(self):
all_data = self.data + self.posix_data
for command, *split_command in all_data:
Expand All @@ -358,6 +382,46 @@ def testJoinRoundtrip(self):
resplit = shlex.split(joined)
self.assertEqual(split_command, resplit)

def testJoinBytes(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use the same dataset as for those used in strings, especially with quotation symbols or unsafe characters.

self.assertEqual(shlex.join([b"Join", b"me"]), b"Join me")
self.assertEqual(shlex.join([b"Just_me"]), b"Just_me")

def testJoinBytesAndStrings(self):
"""Test join can handle combinations of string and byte objects"""
# String then bytes
with self.assertWarns(BytesWarning,
msg="All objects passed to join must be of the same type."
"Converting all to str."):
self.assertEqual(shlex.join(["str_object", b"byte_object"]), "str_object byte_object")

# Bytes then string
with self.assertWarns(BytesWarning,
msg="All objects passed to join must be of the same type."
"Converting all to bytes."):
self.assertEqual(shlex.join([b"byte_object", "str_object"]), b"byte_object str_object")

# Random combination
with self.assertWarns(BytesWarning):
import random

words = "This is a fully formed sentence, to test the join functionality."

new_words = []
for word in words.split(" "):
if bool(random.randint(0, 1)):
new_words.append(word.encode("ascii"))
else:
new_words.append(word)

# ensure at least one bytes object to raise warning
if isinstance(new_words[2], str):
new_words[2] = new_words[2].encode("ascii")

if isinstance(new_words[0], bytes):
self.assertEqual(shlex.join(new_words), words.encode("ascii"))
else:
self.assertEqual(shlex.join(new_words), words)

def testPunctuationCharsReadOnly(self):
punctuation_chars = "/|$%^"
shlex_instance = shlex.shlex(punctuation_chars=punctuation_chars)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for bytes objects in the shlex module.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Add support for bytes objects in the shlex module.
Add support for :class:`bytes` objects to the :mod:`shlex` module.

You should probably explain that join() returns an empty string as well in the docs. This is important. Or introduce a separate function for bytes objects (which I think would be preferrable because you wouldn't have all those warnings to handle).