Skip to content

Commit 31e320d

Browse files
authored
Merge pull request package-url#11 from package-url/improve-unicode-handling
Improve unicode handling
2 parents 5dbe3b1 + 9e5cc27 commit 31e320d

File tree

4 files changed

+282
-101
lines changed

4 files changed

+282
-101
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
setup(
1414
name='packageurl-python',
15-
version='0.6.0',
15+
version='0.7.0',
1616
license='MIT',
1717
description='A "purl" aka. package URL parser and builder',
1818
long_description='Python library to parse and build "purl" aka. package URLs. '

src/packageurl.py

Lines changed: 153 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# -*- coding: utf-8 -*-
2+
#
13
# Copyright (c) the purl authors
24
#
35
# Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -21,31 +23,33 @@
2123
# Visit https://github.com/package-url/packageurl-python for support and
2224
# download.
2325

26+
2427
from __future__ import absolute_import
2528
from __future__ import print_function
2629
from __future__ import unicode_literals
2730

2831
from collections import namedtuple
29-
from collections import OrderedDict
32+
import string
3033

3134
# Python 2 and 3 support
3235
try:
3336
# Python 2
34-
from urlparse import urlsplit
35-
from urllib import quote as percent_quote
36-
from urllib import unquote as percent_unquote
37+
from urlparse import urlsplit as _urlsplit
38+
from urllib import quote as _percent_quote
39+
from urllib import unquote as _percent_unquote
3740
except ImportError:
3841
# Python 3
39-
from urllib.parse import urlsplit
40-
from urllib.parse import quote as percent_quote
41-
from urllib.parse import unquote as percent_unquote
42+
from urllib.parse import urlsplit as _urlsplit
43+
from urllib.parse import quote as _percent_quote
44+
from urllib.parse import unquote as _percent_unquote
4245

4346
# Python 2 and 3 support
4447
try:
4548
# Python 2
4649
unicode
47-
str = unicode # NOQA
4850
basestring = basestring # NOQA
51+
bytes = str # NOQA
52+
str = unicode # NOQA
4953
except NameError:
5054
# Python 3
5155
unicode = str # NOQA
@@ -59,10 +63,27 @@
5963

6064
def quote(s):
    """
    Percent-encode `s` and return the result as a unicode string.

    `s` may be a byte string or a unicode string: unicode input is encoded to
    UTF-8 bytes before being quoted. The colon ":" is deliberately left
    unencoded in the returned value.
    """
    raw = s.encode('utf-8') if isinstance(s, unicode) else s
    encoded = _percent_quote(raw)
    if not isinstance(encoded, unicode):
        # the quoting function may hand back a byte string: always return text
        encoded = encoded.decode('utf-8')
    return encoded.replace('%3A', ':')
76+
77+
78+
def unquote(s):
    """
    Return a percent-decoded unicode string, given an `s` byte or unicode
    string.

    Byte strings are accepted on every supported Python version. If the
    underlying unquoting function rejects a byte string with a TypeError, `s`
    is decoded as UTF-8 and unquoted again as text. A TypeError raised for any
    other kind of input is propagated unchanged.
    """
    try:
        unquoted = _percent_unquote(s)
    except TypeError:
        # Python 3 releases older than 3.9 only accept text here and raise a
        # TypeError on bytes. Anything that is not a byte string is a genuine
        # error and is re-raised untouched.
        if not isinstance(s, bytes):
            raise
        unquoted = _percent_unquote(s.decode('utf-8'))
    if not isinstance(unquoted, unicode):
        # the unquoting function may hand back a byte string: always return text
        unquoted = unquoted.decode('utf-8')
    return unquoted
6687

6788

6889
def get_quoter(encode=True):
@@ -72,96 +93,146 @@ def get_quoter(encode=True):
7293
if encode is True:
7394
return quote
7495
elif encode is False:
75-
return percent_unquote
96+
return unquote
7697
elif encode is None:
7798
return lambda x: x
7899

79100

80-
def normalize_qualifiers(qualifiers, encode=True):
81-
"""
82-
Return normalized qualifiers.
101+
def normalize_type(type, encode=True):  # NOQA
    """
    Return the normalized purl `type` as a stripped, lowercased unicode
    string, or None if `type` is empty or blank.

    `type` may be a byte string (decoded as UTF-8) or a unicode string. The
    `encode` flag selects the quoter returned by get_quoter().
    """
    if not type:
        return None
    if not isinstance(type, unicode):
        type = type.decode('utf-8')  # NOQA

    # Strip surrounding whitespace BEFORE quoting, as normalize_version() does:
    # once percent-encoded, a space is "%20" and strip() can no longer remove it.
    type = get_quoter(encode)(type.strip())  # NOQA
    # Strip again so that whitespace revealed by percent-decoding is dropped too.
    return type.strip().lower() or None
86110

87-
If `qualifiers` is a string of qualfiers, formatted to the purl specifications, and `encode`
88-
is false, the string is then converted to a dictionary of qualifiers and their values.
89-
"""
90-
quoting = get_quoter(encode)
91-
92-
if qualifiers:
93-
if isinstance(qualifiers, basestring):
94-
# decode string to dict
95-
qualifiers = qualifiers.split('&')
96-
qualifiers = [kv.partition('=') for kv in qualifiers]
97-
if qualifiers:
98-
qualifiers = [(k, v) for k, _, v in qualifiers]
99-
else:
100-
qualifiers = []
101-
elif isinstance(qualifiers, (dict, OrderedDict,)):
102-
qualifiers = qualifiers.items()
103-
else:
104-
raise ValueError(
105-
'Invalid qualifier. '
106-
'Must be a string or dict:{}'.format(repr(qualifiers)))
107111

108-
if qualifiers:
109-
qualifiers = {
110-
k.strip().lower(): quoting(v)
111-
for k, v in qualifiers
112-
if k and k.strip() and v and v.strip()
113-
}
112+
def normalize_namespace(namespace, ptype, encode=True):  # NOQA
    """
    Return the normalized purl `namespace` as a unicode string of "/"-joined
    segments, or None if nothing is left after normalization.

    `namespace` may be a byte string (decoded as UTF-8) or a unicode string.
    Surrounding whitespace and slashes are removed, blank segments are
    dropped, and the namespace is lowercased when `ptype` is one of
    "bitbucket", "github" or "pypi". Each remaining segment goes through the
    quoter selected by `encode`.
    """
    if not namespace:
        return None
    if not isinstance(namespace, unicode):
        namespace = namespace.decode('utf-8')

    cleaned = namespace.strip().strip('/')
    if ptype in ('bitbucket', 'github', 'pypi'):
        cleaned = cleaned.lower()

    quoter = get_quoter(encode)
    parts = [quoter(part) for part in cleaned.split('/') if part.strip()]
    return '/'.join(parts) or None
120124

121-
return qualifiers or None
122125

126+
def normalize_name(name, ptype, encode=True):  # NOQA
    """
    Return the normalized purl `name` as a unicode string, or None if nothing
    is left after normalization.

    `name` may be a byte string (decoded as UTF-8) or a unicode string.
    Surrounding whitespace and slashes are removed. The name is lowercased
    when `ptype` is one of "bitbucket", "github" or "pypi", and for "pypi"
    underscores are also replaced by dashes. The `encode` flag selects the
    quoter returned by get_quoter().
    """
    if not name:
        return None
    if not isinstance(name, unicode):
        name = name.decode('utf-8')

    # Strip surrounding whitespace BEFORE quoting, as normalize_version() does:
    # once percent-encoded, a space is "%20" and strip() can no longer remove it.
    name = get_quoter(encode)(name.strip())
    # Strip again so that whitespace revealed by percent-decoding is dropped too.
    name = name.strip().strip('/')
    if ptype in ('bitbucket', 'github', 'pypi'):
        name = name.lower()
    if ptype == 'pypi':
        name = name.replace('_', '-')
    return name or None
140+
141+
142+
def normalize_version(version, encode=True):  # NOQA
    """
    Return the normalized purl `version` as a unicode string, or None if
    `version` is empty or blank.

    `version` may be a byte string (decoded as UTF-8) or a unicode string.
    Surrounding whitespace is stripped before the value goes through the
    quoter selected by `encode`.
    """
    if not version:
        return None
    text = version if isinstance(version, unicode) else version.decode('utf-8')
    normalized = get_quoter(encode)(text.strip())
    return normalized or None
151+
152+
153+
def normalize_qualifiers(qualifiers, encode=True):  # NOQA
    """
    Return normalized `qualifiers`.

    The `qualifiers` arg is either a mapping of keys to values or a string
    formatted as "key=value&key=value" (byte strings are decoded as UTF-8).
    Keys are stripped and lowercased and values go through the quoter selected
    by `encode`. Pairs whose key or value is empty or blank are dropped.

    If `encode` is true, return a string of "key=value" pairs sorted by key
    and joined with "&", or None if there is no qualifier left. Otherwise
    always return a mapping, possibly empty and never None.

    Raise ValueError if `qualifiers` is neither a string nor a dict, or if a
    key is percent-encoded, contains a space or any character other than
    ASCII letters, digits, period, dash and underscore, or starts with a digit.
    """
    if not qualifiers:
        return None if encode else {}

    if isinstance(qualifiers, basestring):
        if not isinstance(qualifiers, unicode):
            qualifiers = qualifiers.decode('utf-8')
        # decode string to list of (key, value) tuples
        qualifiers = qualifiers.split('&')
        qualifiers = [kv.partition('=') for kv in qualifiers]
        qualifiers = [(k, v) for k, _, v in qualifiers]
    elif isinstance(qualifiers, dict):
        qualifiers = qualifiers.items()
    else:
        raise ValueError(
            'Invalid qualifier. '
            'Must be a string or dict:{}'.format(repr(qualifiers)))

    quoter = get_quoter(encode)
    qualifiers = {k.strip().lower(): quoter(v)
                  for k, v in qualifiers if k and k.strip() and v and v.strip()}

    valid_chars = string.ascii_letters + string.digits + '.-_'
    for key in qualifiers:
        # defensive only: blank keys were already filtered out above
        if not key:
            raise ValueError('A qualifier key cannot be empty')

        if '%' in key:
            raise ValueError(
                "A qualifier key cannot be percent encoded: {}".format(repr(key)))

        if ' ' in key:
            raise ValueError(
                "A qualifier key cannot contain spaces: {}".format(repr(key)))

        if not all(c in valid_chars for c in key):
            # the two literals are concatenated: keep the separator between them
            raise ValueError(
                "A qualifier key must be composed only of ASCII letters and numbers, "
                "period, dash and underscore: {}".format(repr(key)))

        if key[0] in string.digits:
            raise ValueError(
                "A qualifier key cannot start with a number: {}".format(repr(key)))

    if encode:
        # encode the mapping as a canonical string, sorted by key
        qualifiers = sorted(qualifiers.items())
        qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
        qualifiers = '&'.join(qualifiers)
        return qualifiers or None
    else:
        return qualifiers or {}
149210

150-
name = name or None
151211

152-
if version:
153-
version = quoting(version.strip())
212+
def normalize_subpath(subpath, encode=True):  # NOQA
    """
    Return the normalized purl `subpath` as a unicode string of "/"-joined
    segments, or None if nothing is left after normalization.

    `subpath` may be a byte string (decoded as UTF-8) or a unicode string.
    Blank segments and the "." and ".." segments are dropped. Every remaining
    segment goes through the quoter selected by `encode`.
    """
    if not subpath:
        return None
    if not isinstance(subpath, unicode):
        subpath = subpath.decode('utf-8')

    quoter = get_quoter(encode)
    kept = []
    for segment in subpath.split('/'):
        if not segment.strip() or segment in ('.', '..'):
            continue
        kept.append(quoter(segment))
    return '/'.join(kept) or None
156223

157-
if subpath:
158-
segments = subpath.split('/')
159-
segments = [quoting(s) for s in segments if s and s.strip()
160-
and s not in ('.', '..')]
161-
subpath = '/'.join(segments)
162224

163-
return (type or None, namespace or None, name or None, version or None,
164-
qualifiers or None, subpath or None)
225+
def normalize(type, namespace, name, version, qualifiers, subpath, encode=True):  # NOQA
    """
    Return a tuple of normalized purl components, ordered as
    (type, namespace, name, version, qualifiers, subpath).

    Each component is delegated to its own normalize_* function. The
    normalized type is what gets passed to the namespace and name
    normalizers.
    """
    ptype = normalize_type(type, encode)
    return (
        ptype,
        normalize_namespace(namespace, ptype, encode),
        normalize_name(name, ptype, encode),
        normalize_version(version, encode),
        normalize_qualifiers(qualifiers, encode),
        normalize_subpath(subpath, encode),
    )
165236

166237

167238
_components = ['type', 'namespace', 'name', 'version', 'qualifiers', 'subpath']
@@ -191,7 +262,7 @@ def __new__(self, type=None, namespace=None, name=None, # NOQA
191262
raise ValueError('Invalid purl: {} argument must be a string: {}.'
192263
.format(key, repr(value)))
193264

194-
if qualifiers and not isinstance(qualifiers, (basestring, dict, OrderedDict,)):
265+
if qualifiers and not isinstance(qualifiers, (basestring, dict,)):
195266
raise ValueError('Invalid purl: {} argument must be a dict or a string: {}.'
196267
.format('qualifiers', repr(qualifiers)))
197268

@@ -268,7 +339,7 @@ def from_string(cls, purl):
268339
'purl is missing the required '
269340
'type component: {}.'.format(repr(purl)))
270341

271-
scheme, authority, path, qualifiers, subpath = urlsplit(
342+
scheme, authority, path, qualifiers, subpath = _urlsplit(
272343
url=remainder, scheme='', allow_fragments=True)
273344

274345
if scheme or authority:

test-suite-data.json

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,5 +262,41 @@
262262
"qualifiers": null,
263263
"subpath": null,
264264
"is_invalid": false
265+
},
266+
{
267+
"description": "valid maven purl with case sensitive namespace and name",
268+
"purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3",
269+
"canonical_purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3",
270+
"type": "maven",
271+
"namespace": "HTTPClient",
272+
"name": "HTTPClient",
273+
"version": "0.3-3",
274+
"qualifiers": null,
275+
"subpath": null,
276+
"is_invalid": false
277+
},
278+
{
279+
"description": "valid maven purl containing a space in the version and qualifier",
280+
"purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value",
281+
"canonical_purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value",
282+
"type": "maven",
283+
"namespace": "mygroup",
284+
"name": "myartifact",
285+
"version": "1.0.0 Final",
286+
"qualifiers": {"mykey": "my value"},
287+
"subpath": null,
288+
"is_invalid": false
289+
},
290+
{
291+
"description": "checks for invalid qualifier keys",
292+
"purl": "pkg:npm/myartifact@1.0.0?in%20production=true",
293+
"canonical_purl": null,
294+
"type": "npm",
295+
"namespace": null,
296+
"name": "myartifact",
297+
"version": "1.0.0",
298+
"qualifiers": {"in production": "true"},
299+
"subpath": null,
300+
"is_invalid": true
265301
}
266302
]

0 commit comments

Comments
 (0)