From d7be0209d00fefd819d27804b1ee536765e6509e Mon Sep 17 00:00:00 2001
From: Philippe Ombredanne <pombredanne@nexb.com>
Date: Sat, 6 Oct 2018 23:37:59 +0200
Subject: [PATCH 1/3] Improve unicode handling

* always use unicode internally and encode and decode
  at the boundaries
* when calling URL quoting functions, use bytes.

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
---
 setup.py          |   2 +-
 src/packageurl.py | 211 ++++++++++++++++++++++++++++------------------
 test_purl.py      | 110 ++++++++++++++++++++----
 3 files changed, 221 insertions(+), 102 deletions(-)

diff --git a/setup.py b/setup.py
index 2a485c5..332e7c3 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
 
 setup(
     name='packageurl-python',
-    version='0.6.0',
+    version='0.7.0',
     license='MIT',
     description='A "purl" aka. package URL parser and builder',
     long_description='Python library to parse and build "purl" aka. package URLs. '
diff --git a/src/packageurl.py b/src/packageurl.py
index 5916f6e..ea2be2a 100644
--- a/src/packageurl.py
+++ b/src/packageurl.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+#
 # Copyright (c) the purl authors
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -31,21 +33,22 @@
 # Python 2 and 3 support
 try:
     # Python 2
-    from urlparse import urlsplit
-    from urllib import quote as percent_quote
-    from urllib import unquote as percent_unquote
+    from urlparse import urlsplit as _urlsplit
+    from urllib import quote as _percent_quote
+    from urllib import unquote as _percent_unquote
 except ImportError:
     # Python 3
-    from urllib.parse import urlsplit
-    from urllib.parse import quote as percent_quote
-    from urllib.parse import unquote as percent_unquote
+    from urllib.parse import urlsplit as _urlsplit
+    from urllib.parse import quote as _percent_quote
+    from urllib.parse import unquote as _percent_unquote
 
 # Python 2 and 3 support
 try:
     # Python 2
     unicode
-    str = unicode  # NOQA
     basestring = basestring  # NOQA
+    bytes = str  # NOQA
+    str = unicode  # NOQA
 except NameError:
     # Python 3
     unicode = str  # NOQA
@@ -59,10 +62,27 @@
 
 def quote(s):
     """
-    Percent-encode a string, except for colon :
+    Return a percent-encoded unicode string, except for colon :, given an `s`
+    byte or unicode string.
     """
-    quoted = percent_quote(s)
-    return quoted.replace('%3A', ':')
+    if isinstance(s, unicode):
+        s = s.encode('utf-8')
+    quoted = _percent_quote(s)
+    if not isinstance(quoted, unicode):
+        quoted = quoted.decode('utf-8')
+    quoted = quoted.replace('%3A', ':')
+    return quoted
+
+
+def unquote(s):
+    """
+    Return a percent-decoded unicode string, given an `s` byte or unicode
+    string. A byte string result is decoded as UTF-8.
+    """
+    unquoted = _percent_unquote(s)
+    if not isinstance(unquoted, unicode):
+        unquoted = unquoted.decode('utf-8')
+    return unquoted
 
 
 def get_quoter(encode=True):
@@ -72,96 +92,121 @@ def get_quoter(encode=True):
     if encode is True:
         return quote
     elif encode is False:
-        return percent_unquote
+        return unquote
     elif encode is None:
         return lambda x: x
 
 
-def normalize_qualifiers(qualifiers, encode=True):
-    """
-    Return normalized qualifiers.
-
-    If `qualifiers` is a dictionary of qualifiers and values and `encode` is true,
-    the dictionary is then converted to a string of qualifiers, formatted to the purl specifications.
+def normalize_type(type, encode=True):  # NOQA
+    if not type:
+        return
+    if not isinstance(type, unicode):
+        type = type.decode('utf-8')  # NOQA
 
-    If `qualifiers` is a string of qualfiers, formatted to the purl specifications, and `encode`
-    is false, the string is then converted to a dictionary of qualifiers and their values.
-    """
-    quoting = get_quoter(encode)
-
-    if qualifiers:
-        if isinstance(qualifiers, basestring):
-            # decode string to dict
-            qualifiers = qualifiers.split('&')
-            qualifiers = [kv.partition('=') for kv in qualifiers]
-            if qualifiers:
-                qualifiers = [(k, v) for k, _, v in qualifiers]
-            else:
-                qualifiers = []
-        elif isinstance(qualifiers, (dict, OrderedDict,)):
-            qualifiers = qualifiers.items()
-        else:
-            raise ValueError(
-                'Invalid qualifier. '
-                'Must be a string or dict:{}'.format(repr(qualifiers)))
+    quoter = get_quoter(encode)
+    type = quoter(type)  # NOQA
+    return type.strip().lower() or None
 
-        if qualifiers:
-            qualifiers = {
-                k.strip().lower(): quoting(v)
-                for k, v in qualifiers
-                if k and k.strip() and v and v.strip()
-            }
 
-            if qualifiers and encode is True:
-                # encode dict as a string
-                qualifiers = sorted(qualifiers.items())
-                qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
-                qualifiers = '&'.join(qualifiers)
+def normalize_namespace(namespace, ptype, encode=True):  # NOQA
+    if not namespace:
+        return
+    if not isinstance(namespace, unicode):
+        namespace = namespace.decode('utf-8')
 
-            return qualifiers or None
+    namespace = namespace.strip().strip('/')
+    if ptype in ('bitbucket', 'github', 'pypi'):
+        namespace = namespace.lower()
+    segments = [seg for seg in namespace.split('/') if seg.strip()]
+    segments = map(get_quoter(encode), segments)
+    return '/'.join(segments) or None
 
 
-def normalize(type, namespace, name, version, qualifiers, subpath, encode=True):  # NOQA
-    """
-    Return normalized purl components.
-    """
-    quoting = get_quoter(encode)
+def normalize_name(name, ptype, encode=True):  # NOQA
+    if not name:
+        return
+    if not isinstance(name, unicode):
+        name = name.decode('utf-8')
 
-    if type:
-        type = type.strip().lower()  # NOQA
+    quoter = get_quoter(encode)
+    name = quoter(name)
+    name = name.strip().strip('/')
+    if ptype in ('bitbucket', 'github', 'pypi',):
+        name = name.lower()
+    if ptype in ('pypi',):
+        name = name.replace('_', '-')
+    return name or None
 
-    if namespace:
-        namespace = namespace.strip().strip('/')
-        if type in ('bitbucket', 'github', 'pypi'):
-            namespace = namespace.lower()
-        segments = namespace.split('/')
-        segments = [seg for seg in segments if seg and seg.strip()]
-        segments = map(quoting, segments)
-        namespace = '/'.join(segments)
 
-    if name:
-        name = name.strip().strip('/')
-        if type in ('bitbucket', 'github', 'pypi',):
-            name = name.lower()
-        if type in ('pypi',):
-            name = name.replace('_', '-')
-        name = quoting(name)
+def normalize_version(version, encode=True):  # NOQA
+    if not version:
+        return
+    if not isinstance(version, unicode):
+        version = version.decode('utf-8')
 
-    name = name or None
+    quoter = get_quoter(encode)
+    version = quoter(version.strip())
+    return version or None
 
-    if version:
-        version = quoting(version.strip())
 
-    qualifiers = normalize_qualifiers(qualifiers, encode)
+def normalize_qualifiers(qualifiers, encode=True):  # NOQA
+    """
+    Return normalized `qualifiers` as a mapping (or as a string if `encode` is
+    True). The `qualifiers` arg is either a mapping or a string.
+    """
+    if not qualifiers:
+        return
+
+    if isinstance(qualifiers, basestring):
+        if not isinstance(qualifiers, unicode):
+            qualifiers = qualifiers.decode('utf-8')
+        # decode string to list of tuples
+        qualifiers = qualifiers.split('&')
+        qualifiers = [kv.partition('=') for kv in qualifiers]
+        qualifiers = [(k, v) for k, _, v in qualifiers]
+    elif isinstance(qualifiers, dict):
+        qualifiers = qualifiers.items()
+    else:
+        raise ValueError(
+            'Invalid qualifier. '
+            'Must be a string or dict:{}'.format(repr(qualifiers)))
+
+    quoter = get_quoter(encode)
+    qualifiers = {quoter(k.strip().lower()): quoter(v)
+        for k, v in qualifiers if k and k.strip() and v and v.strip()}
+
+    if encode:
+        qualifiers = sorted(qualifiers.items())
+        qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
+        qualifiers = '&'.join(qualifiers)
+
+    return qualifiers or None
+
+
+def normalize_subpath(subpath, encode=True):  # NOQA
+    if not subpath:
+        return None
+    if not isinstance(subpath, unicode):
+        subpath = subpath.decode('utf-8')
+
+    quoter = get_quoter(encode)
+    segments = subpath.split('/')
+    segments = [quoter(s) for s in segments if s.strip() and s not in ('.', '..')]
+    subpath = '/'.join(segments)
+    return subpath or None
 
-    if subpath:
-        segments = subpath.split('/')
-        segments = [quoting(s) for s in segments if s and s.strip()
-                    and s not in ('.', '..')]
-        subpath = '/'.join(segments)
 
-    return (type or None, namespace or None, name or None, version or None,
-            qualifiers or None, subpath or None)
+def normalize(type, namespace, name, version, qualifiers, subpath, encode=True):  # NOQA
+    """
+    Return normalized purl components
+    """
+    type = normalize_type(type, encode)  # NOQA
+    namespace = normalize_namespace(namespace, type, encode)
+    name = normalize_name(name, type, encode)
+    version = normalize_version(version, encode)
+    qualifiers = normalize_qualifiers(qualifiers, encode)
+    subpath = normalize_subpath(subpath, encode)
+    return type, namespace, name, version, qualifiers, subpath
 
 
 _components = ['type', 'namespace', 'name', 'version', 'qualifiers', 'subpath']
@@ -268,7 +313,7 @@ def from_string(cls, purl):
                 'purl is missing the required '
                 'type component: {}.'.format(repr(purl)))
 
-        scheme, authority, path, qualifiers, subpath = urlsplit(
+        scheme, authority, path, qualifiers, subpath = _urlsplit(
             url=remainder, scheme='', allow_fragments=True)
 
         if scheme or authority:
diff --git a/test_purl.py b/test_purl.py
index 5871d68..1010665 100644
--- a/test_purl.py
+++ b/test_purl.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+#
 # Copyright (c) the purl authors
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -30,8 +32,10 @@
 import unittest
 
 from packageurl import normalize_qualifiers
+from packageurl import normalize
 from packageurl import PackageURL
 
+
 # Python 2 and 3 support
 try:
     # Python 2
@@ -144,29 +148,99 @@ def build_tests(clazz=PurlTest, test_file='test-suite-data.json'):
 build_tests()
 
 
-class NormalizePurlQualifiersTest(unittest.TestCase):
-    canonical_purl = 'pkg:maven/org.apache.xmlgraphics/batik-anim@1.9.1?classifier=sources&repository_url=repo.spring.io/release'
-    type = 'maven'
-    namespace = 'org.apache.xmlgraphics'
-    name = 'batik-anim'
-    version = '1.9.1'
-    qualifiers_as_dict = {
-        'classifier': 'sources',
-        'repository_url': 'repo.spring.io/release'
-    }
-    qualifiers_as_string = 'classifier=sources&repository_url=repo.spring.io/release'
-    subpath = None
+class NormalizePurlTest(unittest.TestCase):
 
     def test_normalize_qualifiers_as_string(self):
-        assert self.qualifiers_as_string == normalize_qualifiers(self.qualifiers_as_dict, encode=True)
+        qualifiers_as_dict = {
+            'classifier': 'sources',
+            'repository_url': 'repo.spring.io/release'
+        }
+        qualifiers_as_string = 'classifier=sources&repository_url=repo.spring.io/release'
+        assert qualifiers_as_string == normalize_qualifiers(
+            qualifiers_as_dict, encode=True)
 
     def test_normalize_qualifiers_as_dict(self):
-        assert self.qualifiers_as_dict == normalize_qualifiers(self.qualifiers_as_string, encode=False)
+        qualifiers_as_dict = {
+            'classifier': 'sources',
+            'repository_url': 'repo.spring.io/release'
+        }
+        qualifiers_as_string = 'classifier=sources&repository_url=repo.spring.io/release'
+        assert qualifiers_as_dict == normalize_qualifiers(
+            qualifiers_as_string, encode=False)
 
     def test_create_PackageURL_from_qualifiers_string(self):
-        assert self.canonical_purl == PackageURL(self.type, self.namespace, self.name, self.version,
-                                                    self.qualifiers_as_string, self.subpath).to_string()
+        canonical_purl = 'pkg:maven/org.apache.xmlgraphics/batik-anim@1.9.1?classifier=sources&repository_url=repo.spring.io/release'
+        type = 'maven'  # NOQA
+        namespace = 'org.apache.xmlgraphics'
+        name = 'batik-anim'
+        version = '1.9.1'
+        qualifiers_as_string = 'classifier=sources&repository_url=repo.spring.io/release'
+        subpath = None
+
+        purl = PackageURL(type, namespace, name, version,
+            qualifiers_as_string,
+            subpath)
+        assert canonical_purl == purl.to_string()
 
     def test_create_PackageURL_from_qualifiers_dict(self):
-        assert self.canonical_purl == PackageURL(self.type, self.namespace, self.name, self.version,
-                                                    self.qualifiers_as_dict, self.subpath).to_string()
+        canonical_purl = 'pkg:maven/org.apache.xmlgraphics/batik-anim@1.9.1?classifier=sources&repository_url=repo.spring.io/release'
+        type = 'maven'  # NOQA
+        namespace = 'org.apache.xmlgraphics'
+        name = 'batik-anim'
+        version = '1.9.1'
+        qualifiers_as_dict = {
+            'classifier': 'sources',
+            'repository_url': 'repo.spring.io/release'
+        }
+        subpath = None
+
+        purl = PackageURL(type, namespace, name, version,
+            qualifiers_as_dict,
+            subpath)
+        assert canonical_purl == purl.to_string()
+
+
+    def test_normalize_encode_can_take_unicode_with_non_ascii_with_slash(self):
+        uncd = u'núcleo/núcleo'
+        normal = normalize(
+            type=uncd, namespace=uncd, name=uncd, version=uncd,
+            qualifiers='a=' + uncd, subpath=uncd, encode=True)
+        expected = (
+            'n%c3%bacleo/n%c3%bacleo',
+            'n%C3%BAcleo/n%C3%BAcleo',
+            'n%C3%BAcleo/n%C3%BAcleo',
+            'n%C3%BAcleo/n%C3%BAcleo',
+            'a=n%C3%BAcleo/n%C3%BAcleo',
+            'n%C3%BAcleo/n%C3%BAcleo'
+        )
+        assert expected == normal
+
+    def test_normalize_decode_can_take_unicode_with_non_ascii_with_slash(self):
+        uncd = u'núcleo/núcleo'
+        normal = normalize(
+            type=uncd, namespace=uncd, name=uncd, version=uncd,
+            qualifiers='a=' + uncd, subpath=uncd, encode=False)
+        expected = (
+            'núcleo/núcleo',
+            'núcleo/núcleo',
+            'núcleo/núcleo',
+            'núcleo/núcleo',
+            {'a': 'núcleo/núcleo'},
+            'núcleo/núcleo',
+        )
+        assert expected == normal
+
+    def test_normalize_encode_always_reencodes(self):
+        uncd = u'n%c3%bacleo/n%c3%bacleo'
+        normal = normalize(
+            type=uncd, namespace=uncd, name=uncd, version=uncd,
+            qualifiers='a=' + uncd, subpath=uncd, encode=True)
+        expected = (
+            u'n%25c3%25bacleo/n%25c3%25bacleo',
+            u'n%25c3%25bacleo/n%25c3%25bacleo',
+            u'n%25c3%25bacleo/n%25c3%25bacleo',
+            u'n%25c3%25bacleo/n%25c3%25bacleo',
+            u'a=n%25c3%25bacleo/n%25c3%25bacleo',
+            u'n%25c3%25bacleo/n%25c3%25bacleo'
+        )
+        assert expected == normal

From 635f9cfdfb02a543ece38548028bb03cb0e92910 Mon Sep 17 00:00:00 2001
From: Philippe Ombredanne <pombredanne@nexb.com>
Date: Sat, 6 Oct 2018 23:53:35 +0200
Subject: [PATCH 2/3] Use latest test suite data

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
---
 src/packageurl.py    | 26 +++++++++++++++++++++++++-
 test-suite-data.json | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/src/packageurl.py b/src/packageurl.py
index ea2be2a..0867427 100644
--- a/src/packageurl.py
+++ b/src/packageurl.py
@@ -23,12 +23,14 @@
 # Visit https://github.com/package-url/packageurl-python for support and
 # download.
 
+
 from __future__ import absolute_import
 from __future__ import print_function
 from __future__ import unicode_literals
 
 from collections import namedtuple
 from collections import OrderedDict
+import string
 
 # Python 2 and 3 support
 try:
@@ -172,9 +174,31 @@ def normalize_qualifiers(qualifiers, encode=True):  # NOQA
             'Must be a string or dict:{}'.format(repr(qualifiers)))
 
     quoter = get_quoter(encode)
-    qualifiers = {quoter(k.strip().lower()): quoter(v)
+    qualifiers = {k.strip().lower(): quoter(v)
         for k, v in qualifiers if k and k.strip() and v and v.strip()}
 
+    valid_chars = string.ascii_letters + string.digits + '.-_'
+    for key in qualifiers:
+        if not key:
+            raise ValueError('A qualifier key cannot be empty')
+
+        if '%' in key:
+            raise ValueError(
+                "A qualifier key cannot be percent encoded: {}".format(repr(key)))
+
+        if ' ' in key:
+            raise ValueError(
+                "A qualifier key cannot contain spaces: {}".format(repr(key)))
+
+        if not all(c in valid_chars for c in key):  # see valid_chars above
+            raise ValueError(
+                "A qualifier key must be composed only of ASCII letters and numbers, "
+                "period, dash and underscore: {}".format(repr(key)))
+
+        if key[0] in string.digits:
+            raise ValueError(
+                "A qualifier key cannot start with a number: {}".format(repr(key)))
+
     if encode:
         qualifiers = sorted(qualifiers.items())
         qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
diff --git a/test-suite-data.json b/test-suite-data.json
index 3f222a8..b4641cc 100644
--- a/test-suite-data.json
+++ b/test-suite-data.json
@@ -262,5 +262,41 @@
     "qualifiers": null,
     "subpath": null,
     "is_invalid": false
+  },
+  {
+    "description": "valid maven purl with case sensitive namespace and name",
+    "purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3",
+    "canonical_purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3",
+    "type": "maven",
+    "namespace": "HTTPClient",
+    "name": "HTTPClient",
+    "version": "0.3-3",
+    "qualifiers": null,
+    "subpath": null,
+    "is_invalid": false
+  },
+  {
+    "description": "valid maven purl containing a space in the version and qualifier",
+    "purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value",
+    "canonical_purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value",
+    "type": "maven",
+    "namespace": "mygroup",
+    "name": "myartifact",
+    "version": "1.0.0 Final",
+    "qualifiers": {"mykey": "my value"},
+    "subpath": null,
+    "is_invalid": false
+  },
+  {
+    "description": "checks for invalid qualifier keys",
+    "purl": "pkg:npm/myartifact@1.0.0?in%20production=true",
+    "canonical_purl": null,
+    "type": "npm",
+    "namespace": null,
+    "name": "myartifact",
+    "version": "1.0.0",
+    "qualifiers": {"in production": "true"},
+    "subpath": null,
+    "is_invalid": true
   }
 ]

From 9e5cc27b3b3629771800af5bd3b5fa4a61758040 Mon Sep 17 00:00:00 2001
From: Philippe Ombredanne <pombredanne@nexb.com>
Date: Sun, 7 Oct 2018 18:53:08 +0200
Subject: [PATCH 3/3] Ensure a decoded qualifier is always a dict

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
---
 src/packageurl.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/packageurl.py b/src/packageurl.py
index 0867427..4f73f75 100644
--- a/src/packageurl.py
+++ b/src/packageurl.py
@@ -29,7 +29,6 @@
 from __future__ import unicode_literals
 
 from collections import namedtuple
-from collections import OrderedDict
 import string
 
 # Python 2 and 3 support
@@ -155,9 +154,11 @@ def normalize_qualifiers(qualifiers, encode=True):  # NOQA
     """
     Return normalized `qualifiers` as a mapping (or as a string if `encode` is
     True). The `qualifiers` arg is either a mapping or a string.
+    Always return a mapping (and never None) when `encode` is false.
+    Raise ValueError on errors.
     """
     if not qualifiers:
-        return
+        return None if encode else {}
 
     if isinstance(qualifiers, basestring):
         if not isinstance(qualifiers, unicode):
@@ -203,8 +204,9 @@ def normalize_qualifiers(qualifiers, encode=True):  # NOQA
         qualifiers = sorted(qualifiers.items())
         qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
         qualifiers = '&'.join(qualifiers)
-
-    return qualifiers or None
+        return qualifiers or None
+    else:
+        return qualifiers or {}
 
 
 def normalize_subpath(subpath, encode=True):  # NOQA
@@ -260,7 +262,7 @@ def __new__(self, type=None, namespace=None, name=None,  # NOQA
             raise ValueError('Invalid purl: {} argument must be a string: {}.'
                              .format(key, repr(value)))
 
-        if qualifiers and not isinstance(qualifiers, (basestring, dict, OrderedDict,)):
+        if qualifiers and not isinstance(qualifiers, (basestring, dict,)):
             raise ValueError('Invalid purl: {} argument must be a dict or a string: {}.'
                              .format('qualifiers', repr(qualifiers)))