From d7be0209d00fefd819d27804b1ee536765e6509e Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sat, 6 Oct 2018 23:37:59 +0200 Subject: [PATCH 1/3] Improve unicode handling * always use unicode internally and encode and decode at the boundaries * when calling URL quoting functions, use bytes. Signed-off-by: Philippe Ombredanne --- setup.py | 2 +- src/packageurl.py | 211 ++++++++++++++++++++++++++++------------------ test_purl.py | 110 ++++++++++++++++++++---- 3 files changed, 221 insertions(+), 102 deletions(-) diff --git a/setup.py b/setup.py index 2a485c5..332e7c3 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name='packageurl-python', - version='0.6.0', + version='0.7.0', license='MIT', description='A "purl" aka. package URL parser and builder', long_description='Python library to parse and build "purl" aka. package URLs. ' diff --git a/src/packageurl.py b/src/packageurl.py index 5916f6e..ea2be2a 100644 --- a/src/packageurl.py +++ b/src/packageurl.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +# # Copyright (c) the purl authors # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -31,21 +33,22 @@ # Python 2 and 3 support try: # Python 2 - from urlparse import urlsplit - from urllib import quote as percent_quote - from urllib import unquote as percent_unquote + from urlparse import urlsplit as _urlsplit + from urllib import quote as _percent_quote + from urllib import unquote as _percent_unquote except ImportError: # Python 3 - from urllib.parse import urlsplit - from urllib.parse import quote as percent_quote - from urllib.parse import unquote as percent_unquote + from urllib.parse import urlsplit as _urlsplit + from urllib.parse import quote as _percent_quote + from urllib.parse import unquote as _percent_unquote # Python 2 and 3 support try: # Python 2 unicode - str = unicode # NOQA basestring = basestring # NOQA + bytes = str # NOQA + str = unicode # NOQA except NameError: # Python 3 unicode = str # NOQA @@ -59,10 +62,27 @@ def quote(s): """ - Percent-encode a string, except for colon : + Return a percent-encoded unicode string, except for colon :, given an `s` + byte or unicode string. """ - quoted = percent_quote(s) - return quoted.replace('%3A', ':') + if isinstance(s, unicode): + s = s.encode('utf-8') + quoted = _percent_quote(s) + if not isinstance(quoted, unicode): + quoted = quoted.decode('utf-8') + quoted = quoted.replace('%3A', ':') + return quoted + + +def unquote(s): + """ + Return a percent-decoded unicode string, given an `s` byte or unicode + string. + """ + unquoted = _percent_unquote(s) + if not isinstance(unquoted, unicode): + unquoted = unquoted .decode('utf-8') + return unquoted def get_quoter(encode=True): @@ -72,96 +92,121 @@ def get_quoter(encode=True): if encode is True: return quote elif encode is False: - return percent_unquote + return unquote elif encode is None: return lambda x: x -def normalize_qualifiers(qualifiers, encode=True): - """ - Return normalized qualifiers. - - If `qualifiers` is a dictionary of qualifiers and values and `encode` is true, - the dictionary is then converted to a string of qualifiers, formatted to the purl specifications. +def normalize_type(type, encode=True): # NOQA + if not type: + return + if not isinstance(type, unicode): + type = type.decode('utf-8') # NOQA - If `qualifiers` is a string of qualfiers, formatted to the purl specifications, and `encode` - is false, the string is then converted to a dictionary of qualifiers and their values. - """ - quoting = get_quoter(encode) - - if qualifiers: - if isinstance(qualifiers, basestring): - # decode string to dict - qualifiers = qualifiers.split('&') - qualifiers = [kv.partition('=') for kv in qualifiers] - if qualifiers: - qualifiers = [(k, v) for k, _, v in qualifiers] - else: - qualifiers = [] - elif isinstance(qualifiers, (dict, OrderedDict,)): - qualifiers = qualifiers.items() - else: - raise ValueError( - 'Invalid qualifier. ' - 'Must be a string or dict:{}'.format(repr(qualifiers))) + quoter = get_quoter(encode) + type = quoter(type) # NOQA + return type.strip().lower() or None - if qualifiers: - qualifiers = { - k.strip().lower(): quoting(v) - for k, v in qualifiers - if k and k.strip() and v and v.strip() - } - if qualifiers and encode is True: - # encode dict as a string - qualifiers = sorted(qualifiers.items()) - qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers] - qualifiers = '&'.join(qualifiers) +def normalize_namespace(namespace, ptype, encode=True): # NOQA + if not namespace: + return + if not isinstance(namespace, unicode): + namespace = namespace.decode('utf-8') - return qualifiers or None + namespace = namespace.strip().strip('/') + if ptype in ('bitbucket', 'github', 'pypi'): + namespace = namespace.lower() + segments = [seg for seg in namespace.split('/') if seg.strip()] + segments = map(get_quoter(encode), segments) + return '/'.join(segments) or None -def normalize(type, namespace, name, version, qualifiers, subpath, encode=True): # NOQA - """ - Return normalized purl components. - """ - quoting = get_quoter(encode) +def normalize_name(name, ptype, encode=True): # NOQA + if not name: + return + if not isinstance(name, unicode): + name = name.decode('utf-8') - if type: - type = type.strip().lower() # NOQA + quoter = get_quoter(encode) + name = quoter(name) + name = name.strip().strip('/') + if ptype in ('bitbucket', 'github', 'pypi',): + name = name.lower() + if ptype in ('pypi',): + name = name.replace('_', '-') + return name or None - if namespace: - namespace = namespace.strip().strip('/') - if type in ('bitbucket', 'github', 'pypi'): - namespace = namespace.lower() - segments = namespace.split('/') - segments = [seg for seg in segments if seg and seg.strip()] - segments = map(quoting, segments) - namespace = '/'.join(segments) - if name: - name = name.strip().strip('/') - if type in ('bitbucket', 'github', 'pypi',): - name = name.lower() - if type in ('pypi',): - name = name.replace('_', '-') - name = quoting(name) +def normalize_version(version, encode=True): # NOQA + if not version: + return + if not isinstance(version, unicode): + version = version.decode('utf-8') - name = name or None + quoter = get_quoter(encode) + version = quoter(version.strip()) + return version or None - if version: - version = quoting(version.strip()) - qualifiers = normalize_qualifiers(qualifiers, encode) +def normalize_qualifiers(qualifiers, encode=True): # NOQA + """ + Return normalized `qualifiers` as a mapping (or as a string if `encode` is + True). The `qualifiers` arg is either a mapping or a string. + """ + if not qualifiers: + return + + if isinstance(qualifiers, basestring): + if not isinstance(qualifiers, unicode): + qualifiers = qualifiers.decode('utf-8') + # decode string to list of tuples + qualifiers = qualifiers.split('&') + qualifiers = [kv.partition('=') for kv in qualifiers] + qualifiers = [(k, v) for k, _, v in qualifiers] + elif isinstance(qualifiers, dict): + qualifiers = qualifiers.items() + else: + raise ValueError( + 'Invalid qualifier. ' + 'Must be a string or dict:{}'.format(repr(qualifiers))) + + quoter = get_quoter(encode) + qualifiers = {quoter(k.strip().lower()): quoter(v) + for k, v in qualifiers if k and k.strip() and v and v.strip()} + + if encode: + qualifiers = sorted(qualifiers.items()) + qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers] + qualifiers = '&'.join(qualifiers) + + return qualifiers or None + + +def normalize_subpath(subpath, encode=True): # NOQA + if not subpath: + return None + if not isinstance(subpath, unicode): + subpath = subpath.decode('utf-8') + + quoter = get_quoter(encode) + segments = subpath.split('/') + segments = [quoter(s) for s in segments if s.strip() and s not in ('.', '..')] + subpath = '/'.join(segments) + return subpath or None - if subpath: - segments = subpath.split('/') - segments = [quoting(s) for s in segments if s and s.strip() - and s not in ('.', '..')] - subpath = '/'.join(segments) - return (type or None, namespace or None, name or None, version or None, - qualifiers or None, subpath or None) +def normalize(type, namespace, name, version, qualifiers, subpath, encode=True): # NOQA + """ + Return normalized purl components + """ + type = normalize_type(type, encode) # NOQA + namespace = normalize_namespace(namespace, type, encode) + name = normalize_name(name, type, encode) + version = normalize_version(version, encode) + qualifiers = normalize_qualifiers(qualifiers, encode) + subpath = normalize_subpath(subpath, encode) + return type, namespace, name, version, qualifiers, subpath _components = ['type', 'namespace', 'name', 'version', 'qualifiers', 'subpath'] @@ -268,7 +313,7 @@ def from_string(cls, purl): 'purl is missing the required ' 'type component: {}.'.format(repr(purl))) - scheme, authority, path, qualifiers, subpath = urlsplit( + scheme, authority, path, qualifiers, subpath = _urlsplit( url=remainder, scheme='', allow_fragments=True) if scheme or authority: diff --git a/test_purl.py b/test_purl.py index 5871d68..1010665 100644 --- a/test_purl.py +++ b/test_purl.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- +# # Copyright (c) the purl authors # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -30,8 +32,10 @@ import unittest from packageurl import normalize_qualifiers +from packageurl import normalize from packageurl import PackageURL + # Python 2 and 3 support try: # Python 2 @@ -144,29 +148,99 @@ def build_tests(clazz=PurlTest, test_file='test-suite-data.json'): build_tests() -class NormalizePurlQualifiersTest(unittest.TestCase): - canonical_purl = 'pkg:maven/org.apache.xmlgraphics/batik-anim@1.9.1?classifier=sources&repository_url=repo.spring.io/release' - type = 'maven' - namespace = 'org.apache.xmlgraphics' - name = 'batik-anim' - version = '1.9.1' - qualifiers_as_dict = { - 'classifier': 'sources', - 'repository_url': 'repo.spring.io/release' - } - qualifiers_as_string = 'classifier=sources&repository_url=repo.spring.io/release' - subpath = None +class NormalizePurlTest(unittest.TestCase): def test_normalize_qualifiers_as_string(self): - assert self.qualifiers_as_string == normalize_qualifiers(self.qualifiers_as_dict, encode=True) + qualifiers_as_dict = { + 'classifier': 'sources', + 'repository_url': 'repo.spring.io/release' + } + qualifiers_as_string = 'classifier=sources&repository_url=repo.spring.io/release' + assert qualifiers_as_string == normalize_qualifiers( + qualifiers_as_dict, encode=True) def test_normalize_qualifiers_as_dict(self): - assert self.qualifiers_as_dict == normalize_qualifiers(self.qualifiers_as_string, encode=False) + qualifiers_as_dict = { + 'classifier': 'sources', + 'repository_url': 'repo.spring.io/release' + } + qualifiers_as_string = 'classifier=sources&repository_url=repo.spring.io/release' + assert qualifiers_as_dict == normalize_qualifiers( + qualifiers_as_string, encode=False) def test_create_PackageURL_from_qualifiers_string(self): - assert self.canonical_purl == PackageURL(self.type, self.namespace, self.name, self.version, - self.qualifiers_as_string, self.subpath).to_string() + canonical_purl = 'pkg:maven/org.apache.xmlgraphics/batik-anim@1.9.1?classifier=sources&repository_url=repo.spring.io/release' + type = 'maven' # NOQA + namespace = 'org.apache.xmlgraphics' + name = 'batik-anim' + version = '1.9.1' + qualifiers_as_string = 'classifier=sources&repository_url=repo.spring.io/release' + subpath = None + + purl = PackageURL(type, namespace, name, version, + qualifiers_as_string, + subpath) + assert canonical_purl == purl.to_string() def test_create_PackageURL_from_qualifiers_dict(self): - assert self.canonical_purl == PackageURL(self.type, self.namespace, self.name, self.version, - self.qualifiers_as_dict, self.subpath).to_string() + canonical_purl = 'pkg:maven/org.apache.xmlgraphics/batik-anim@1.9.1?classifier=sources&repository_url=repo.spring.io/release' + type = 'maven' # NOQA + namespace = 'org.apache.xmlgraphics' + name = 'batik-anim' + version = '1.9.1' + qualifiers_as_dict = { + 'classifier': 'sources', + 'repository_url': 'repo.spring.io/release' + } + subpath = None + + purl = PackageURL(type, namespace, name, version, + qualifiers_as_dict, + subpath) + assert canonical_purl == purl.to_string() + + + def test_normalize_encode_can_take_unicode_with_non_ascii_with_slash(self): + uncd = u'núcleo/núcleo' + normal = normalize( + type=uncd, namespace=uncd, name=uncd, version=uncd, + qualifiers='a=' + uncd, subpath=uncd, encode=True) + expected = ( + 'n%c3%bacleo/n%c3%bacleo', + 'n%C3%BAcleo/n%C3%BAcleo', + 'n%C3%BAcleo/n%C3%BAcleo', + 'n%C3%BAcleo/n%C3%BAcleo', + 'a=n%C3%BAcleo/n%C3%BAcleo', + 'n%C3%BAcleo/n%C3%BAcleo' + ) + assert expected == normal + + def test_normalize_decode_can_take_unicode_with_non_ascii_with_slash(self): + uncd = u'núcleo/núcleo' + normal = normalize( + type=uncd, namespace=uncd, name=uncd, version=uncd, + qualifiers='a=' + uncd, subpath=uncd, encode=False) + expected = ( + 'núcleo/núcleo', + 'núcleo/núcleo', + 'núcleo/núcleo', + 'núcleo/núcleo', + {'a': 'núcleo/núcleo'}, + 'núcleo/núcleo', + ) + assert expected == normal + + def test_normalize_encode_always_reencodes(self): + uncd = u'n%c3%bacleo/n%c3%bacleo' + normal = normalize( + type=uncd, namespace=uncd, name=uncd, version=uncd, + qualifiers='a=' + uncd, subpath=uncd, encode=True) + expected = ( + u'n%25c3%25bacleo/n%25c3%25bacleo', + u'n%25c3%25bacleo/n%25c3%25bacleo', + u'n%25c3%25bacleo/n%25c3%25bacleo', + u'n%25c3%25bacleo/n%25c3%25bacleo', + u'a=n%25c3%25bacleo/n%25c3%25bacleo', + u'n%25c3%25bacleo/n%25c3%25bacleo' + ) + assert expected == normal From 635f9cfdfb02a543ece38548028bb03cb0e92910 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sat, 6 Oct 2018 23:53:35 +0200 Subject: [PATCH 2/3] Use latest test suite data Signed-off-by: Philippe Ombredanne --- src/packageurl.py | 26 +++++++++++++++++++++++++- test-suite-data.json | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/packageurl.py b/src/packageurl.py index ea2be2a..0867427 100644 --- a/src/packageurl.py +++ b/src/packageurl.py @@ -23,12 +23,14 @@ # Visit https://github.com/package-url/packageurl-python for support and # download. + from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals from collections import namedtuple from collections import OrderedDict +import string # Python 2 and 3 support try: @@ -172,9 +174,31 @@ def normalize_qualifiers(qualifiers, encode=True): # NOQA 'Must be a string or dict:{}'.format(repr(qualifiers))) quoter = get_quoter(encode) - qualifiers = {quoter(k.strip().lower()): quoter(v) + qualifiers = {k.strip().lower(): quoter(v) for k, v in qualifiers if k and k.strip() and v and v.strip()} + valid_chars = string.ascii_letters + string.digits + '.-_' + for key in qualifiers: + if not key: + raise ValueError('A qualifier key cannot be empty') + + if '%' in key: + raise ValueError( + "A qualifier key cannot be percent encoded: {}".format(repr(key))) + + if ' ' in key: + raise ValueError( + "A qualifier key cannot contain spaces: {}".format(repr(key))) + + if not all(c in valid_chars for c in key): + raise ValueError( + "A qualifier key must be composed only of ASCII letters and numbers" + "period, dash and underscore: {}".format(repr(key))) + + if key[0] in string.digits: + raise ValueError( + "A qualifier key cannot start with a number: {}".format(repr(key))) + if encode: qualifiers = sorted(qualifiers.items()) qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers] diff --git a/test-suite-data.json b/test-suite-data.json index 3f222a8..b4641cc 100644 --- a/test-suite-data.json +++ b/test-suite-data.json @@ -262,5 +262,41 @@ "qualifiers": null, "subpath": null, "is_invalid": false + }, + { + "description": "valid maven purl with case sensitive namespace and name", + "purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3", + "canonical_purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3", + "type": "maven", + "namespace": "HTTPClient", + "name": "HTTPClient", + "version": "0.3-3", + "qualifiers": null, + "subpath": null, + "is_invalid": false + }, + { + "description": "valid maven purl containing a space in the version and qualifier", + "purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value", + "canonical_purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value", + "type": "maven", + "namespace": "mygroup", + "name": "myartifact", + "version": "1.0.0 Final", + "qualifiers": {"mykey": "my value"}, + "subpath": null, + "is_invalid": false + }, + { + "description": "checks for invalid qualifier keys", + "purl": "pkg:npm/myartifact@1.0.0?in%20production=true", + "canonical_purl": null, + "type": "npm", + "namespace": null, + "name": "myartifact", + "version": "1.0.0", + "qualifiers": {"in production": "true"}, + "subpath": null, + "is_invalid": true } ] From 9e5cc27b3b3629771800af5bd3b5fa4a61758040 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sun, 7 Oct 2018 18:53:08 +0200 Subject: [PATCH 3/3] Ensure a decoded qualifier is always a dict Signed-off-by: Philippe Ombredanne --- src/packageurl.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/packageurl.py b/src/packageurl.py index 0867427..4f73f75 100644 --- a/src/packageurl.py +++ b/src/packageurl.py @@ -29,7 +29,6 @@ from __future__ import unicode_literals from collections import namedtuple -from collections import OrderedDict import string # Python 2 and 3 support @@ -155,9 +154,11 @@ def normalize_qualifiers(qualifiers, encode=True): # NOQA """ Return normalized `qualifiers` as a mapping (or as a string if `encode` is True). The `qualifiers` arg is either a mapping or a string. + Always return a mapping if decode is True (and never None). + Raise ValueError on errors. """ if not qualifiers: - return + return None if encode else {} if isinstance(qualifiers, basestring): if not isinstance(qualifiers, unicode): @@ -203,8 +204,9 @@ def normalize_qualifiers(qualifiers, encode=True): # NOQA qualifiers = sorted(qualifiers.items()) qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers] qualifiers = '&'.join(qualifiers) - - return qualifiers or None + return qualifiers or None + else: + return qualifiers or {} def normalize_subpath(subpath, encode=True): # NOQA @@ -260,7 +262,7 @@ def __new__(self, type=None, namespace=None, name=None, # NOQA raise ValueError('Invalid purl: {} argument must be a string: {}.' .format(key, repr(value))) - if qualifiers and not isinstance(qualifiers, (basestring, dict, OrderedDict,)): + if qualifiers and not isinstance(qualifiers, (basestring, dict,)): raise ValueError('Invalid purl: {} argument must be a dict or a string: {}.' .format('qualifiers', repr(qualifiers)))