Merge pull request package-url#11 from package-url/improve-unicode-handling

pombredanne · web-flow · commit 31e320d8141d · 2018-10-07T19:46:39.000+02:00
Improve unicode handling
diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
 
 setup(
     name='packageurl-python',
-    version='0.6.0',
+    version='0.7.0',
     license='MIT',
     description='A "purl" aka. package URL parser and builder',
     long_description='Python library to parse and build "purl" aka. package URLs. '
diff --git a/src/packageurl.py b/src/packageurl.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+#
 # Copyright (c) the purl authors
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -21,31 +23,33 @@
 # Visit https://github.com/package-url/packageurl-python for support and
 # download.
 
+
 from __future__ import absolute_import
 from __future__ import print_function
 from __future__ import unicode_literals
 
 from collections import namedtuple
-from collections import OrderedDict
+import string
 
 # Python 2 and 3 support
 try:
     # Python 2
-    from urlparse import urlsplit
-    from urllib import quote as percent_quote
-    from urllib import unquote as percent_unquote
+    from urlparse import urlsplit as _urlsplit
+    from urllib import quote as _percent_quote
+    from urllib import unquote as _percent_unquote
 except ImportError:
     # Python 3
-    from urllib.parse import urlsplit
-    from urllib.parse import quote as percent_quote
-    from urllib.parse import unquote as percent_unquote
+    from urllib.parse import urlsplit as _urlsplit
+    from urllib.parse import quote as _percent_quote
+    from urllib.parse import unquote as _percent_unquote
 
 # Python 2 and 3 support
 try:
     # Python 2
     unicode
-    str = unicode  # NOQA
     basestring = basestring  # NOQA
+    bytes = str  # NOQA
+    str = unicode  # NOQA
 except NameError:
     # Python 3
     unicode = str  # NOQA
@@ -59,10 +63,27 @@
 
 def quote(s):
     """
-    Percent-encode a string, except for colon :
+    Return a percent-encoded unicode string, except for colon :, given an `s`
+    byte or unicode string.
+    """
+    if isinstance(s, unicode):
+        s = s.encode('utf-8')
+    quoted = _percent_quote(s)
+    if not isinstance(quoted, unicode):
+        quoted = quoted.decode('utf-8')
+    quoted = quoted.replace('%3A', ':')
+    return quoted
+
+
+def unquote(s):
+    """
+    Return a percent-decoded unicode string, given an `s` byte or unicode
+    string.
     """
-    quoted = percent_quote(s)
-    return quoted.replace('%3A', ':')
+    unquoted = _percent_unquote(s)
+    if not isinstance(unquoted, unicode):
+        unquoted = unquoted .decode('utf-8')
+    return unquoted
 
 
 def get_quoter(encode=True):
@@ -72,96 +93,146 @@ def get_quoter(encode=True):
     if encode is True:
         return quote
     elif encode is False:
-        return percent_unquote
+        return unquote
     elif encode is None:
         return lambda x: x
 
 
-def normalize_qualifiers(qualifiers, encode=True):
-    """
-    Return normalized qualifiers.
+def normalize_type(type, encode=True):  # NOQA
+    if not type:
+        return
+    if not isinstance(type, unicode):
+        type = type.decode('utf-8')  # NOQA
 
-    If `qualifiers` is a dictionary of qualifiers and values and `encode` is true,
-    the dictionary is then converted to a string of qualifiers, formatted to the purl specifications.
+    quoter = get_quoter(encode)
+    type = quoter(type)  # NOQA
+    return type.strip().lower() or None
 
-    If `qualifiers` is a string of qualfiers, formatted to the purl specifications, and `encode`
-    is false, the string is then converted to a dictionary of qualifiers and their values.
-    """
-    quoting = get_quoter(encode)
-
-    if qualifiers:
-        if isinstance(qualifiers, basestring):
-            # decode string to dict
-            qualifiers = qualifiers.split('&')
-            qualifiers = [kv.partition('=') for kv in qualifiers]
-            if qualifiers:
-                qualifiers = [(k, v) for k, _, v in qualifiers]
-            else:
-                qualifiers = []
-        elif isinstance(qualifiers, (dict, OrderedDict,)):
-            qualifiers = qualifiers.items()
-        else:
-            raise ValueError(
-                'Invalid qualifier. '
-                'Must be a string or dict:{}'.format(repr(qualifiers)))
 
-        if qualifiers:
-            qualifiers = {
-                k.strip().lower(): quoting(v)
-                for k, v in qualifiers
-                if k and k.strip() and v and v.strip()
-            }
+def normalize_namespace(namespace, ptype, encode=True):  # NOQA
+    if not namespace:
+        return
+    if not isinstance(namespace, unicode):
+        namespace = namespace.decode('utf-8')
 
-            if qualifiers and encode is True:
-                # encode dict as a string
-                qualifiers = sorted(qualifiers.items())
-                qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
-                qualifiers = '&'.join(qualifiers)
+    namespace = namespace.strip().strip('/')
+    if ptype in ('bitbucket', 'github', 'pypi'):
+        namespace = namespace.lower()
+    segments = [seg for seg in namespace.split('/') if seg.strip()]
+    segments = map(get_quoter(encode), segments)
+    return '/'.join(segments) or None
 
-            return qualifiers or None
 
+def normalize_name(name, ptype, encode=True):  # NOQA
+    if not name:
+        return
+    if not isinstance(name, unicode):
+        name = name.decode('utf-8')
 
-def normalize(type, namespace, name, version, qualifiers, subpath, encode=True):  # NOQA
+    quoter = get_quoter(encode)
+    name = quoter(name)
+    name = name.strip().strip('/')
+    if ptype in ('bitbucket', 'github', 'pypi',):
+        name = name.lower()
+    if ptype in ('pypi',):
+        name = name.replace('_', '-')
+    return name or None
+
+
+def normalize_version(version, encode=True):  # NOQA
+    if not version:
+        return
+    if not isinstance(version, unicode):
+        version = version.decode('utf-8')
+
+    quoter = get_quoter(encode)
+    version = quoter(version.strip())
+    return version or None
+
+
+def normalize_qualifiers(qualifiers, encode=True):  # NOQA
     """
-    Return normalized purl components.
+    Return normalized `qualifiers` as a mapping (or as a string if `encode` is
+    True). The `qualifiers` arg is either a mapping or a string.
+    Always return a mapping if `encode` is False or None (and never None).
+    Raise ValueError on errors.
     """
-    quoting = get_quoter(encode)
+    if not qualifiers:
+        return None if encode else {}
+
+    if isinstance(qualifiers, basestring):
+        if not isinstance(qualifiers, unicode):
+            qualifiers = qualifiers.decode('utf-8')
+        # decode string to list of tuples
+        qualifiers = qualifiers.split('&')
+        qualifiers = [kv.partition('=') for kv in qualifiers]
+        qualifiers = [(k, v) for k, _, v in qualifiers]
+    elif isinstance(qualifiers, dict):
+        qualifiers = qualifiers.items()
+    else:
+        raise ValueError(
+            'Invalid qualifier. '
+            'Must be a string or dict:{}'.format(repr(qualifiers)))
+
+    quoter = get_quoter(encode)
+    qualifiers = {k.strip().lower(): quoter(v)
+        for k, v in qualifiers if k and k.strip() and v and v.strip()}
+
+    valid_chars = string.ascii_letters + string.digits + '.-_'
+    for key in qualifiers:
+        if not key:
+            raise ValueError('A qualifier key cannot be empty')
+
+        if '%' in key:
+            raise ValueError(
+                "A qualifier key cannot be percent encoded: {}".format(repr(key)))
+
+        if ' ' in key:
+            raise ValueError(
+                "A qualifier key cannot contain spaces: {}".format(repr(key)))
 
-    if type:
-        type = type.strip().lower()  # NOQA
+        if not all(c in valid_chars for c in key):
+            raise ValueError(
+                "A qualifier key must be composed only of ASCII letters and numbers, "
+                "period, dash and underscore: {}".format(repr(key)))
 
-    if namespace:
-        namespace = namespace.strip().strip('/')
-        if type in ('bitbucket', 'github', 'pypi'):
-            namespace = namespace.lower()
-        segments = namespace.split('/')
-        segments = [seg for seg in segments if seg and seg.strip()]
-        segments = map(quoting, segments)
-        namespace = '/'.join(segments)
+        if key[0] in string.digits:
+            raise ValueError(
+                "A qualifier key cannot start with a number: {}".format(repr(key)))
 
-    if name:
-        name = name.strip().strip('/')
-        if type in ('bitbucket', 'github', 'pypi',):
-            name = name.lower()
-        if type in ('pypi',):
-            name = name.replace('_', '-')
-        name = quoting(name)
+    if encode:
+        qualifiers = sorted(qualifiers.items())
+        qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
+        qualifiers = '&'.join(qualifiers)
+        return qualifiers or None
+    else:
+        return qualifiers or {}
 
-    name = name or None
 
-    if version:
-        version = quoting(version.strip())
+def normalize_subpath(subpath, encode=True):  # NOQA
+    if not subpath:
+        return None
+    if not isinstance(subpath, unicode):
+        subpath = subpath.decode('utf-8')
 
-    qualifiers = normalize_qualifiers(qualifiers, encode)
+    quoter = get_quoter(encode)
+    segments = subpath.split('/')
+    segments = [quoter(s) for s in segments if s.strip() and s not in ('.', '..')]
+    subpath = '/'.join(segments)
+    return subpath or None
 
-    if subpath:
-        segments = subpath.split('/')
-        segments = [quoting(s) for s in segments if s and s.strip()
-                    and s not in ('.', '..')]
-        subpath = '/'.join(segments)
 
-    return (type or None, namespace or None, name or None, version or None,
-            qualifiers or None, subpath or None)
+def normalize(type, namespace, name, version, qualifiers, subpath, encode=True):  # NOQA
+    """
+    Return normalized purl components
+    """
+    type = normalize_type(type, encode)  # NOQA
+    namespace = normalize_namespace(namespace, type, encode)
+    name = normalize_name(name, type, encode)
+    version = normalize_version(version, encode)
+    qualifiers = normalize_qualifiers(qualifiers, encode)
+    subpath = normalize_subpath(subpath, encode)
+    return type, namespace, name, version, qualifiers, subpath
 
 
 _components = ['type', 'namespace', 'name', 'version', 'qualifiers', 'subpath']
@@ -191,7 +262,7 @@ def __new__(self, type=None, namespace=None, name=None,  # NOQA
             raise ValueError('Invalid purl: {} argument must be a string: {}.'
                              .format(key, repr(value)))
 
-        if qualifiers and not isinstance(qualifiers, (basestring, dict, OrderedDict,)):
+        if qualifiers and not isinstance(qualifiers, (basestring, dict,)):
             raise ValueError('Invalid purl: {} argument must be a dict or a string: {}.'
                              .format('qualifiers', repr(qualifiers)))
 
@@ -268,7 +339,7 @@ def from_string(cls, purl):
                 'purl is missing the required '
                 'type component: {}.'.format(repr(purl)))
 
-        scheme, authority, path, qualifiers, subpath = urlsplit(
+        scheme, authority, path, qualifiers, subpath = _urlsplit(
             url=remainder, scheme='', allow_fragments=True)
 
         if scheme or authority:
diff --git a/test-suite-data.json b/test-suite-data.json
@@ -262,5 +262,41 @@
     "qualifiers": null,
     "subpath": null,
     "is_invalid": false
+  },
+  {
+    "description": "valid maven purl with case sensitive namespace and name",
+    "purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3",
+    "canonical_purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3",
+    "type": "maven",
+    "namespace": "HTTPClient",
+    "name": "HTTPClient",
+    "version": "0.3-3",
+    "qualifiers": null,
+    "subpath": null,
+    "is_invalid": false
+  },
+  {
+    "description": "valid maven purl containing a space in the version and qualifier",
+    "purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value",
+    "canonical_purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value",
+    "type": "maven",
+    "namespace": "mygroup",
+    "name": "myartifact",
+    "version": "1.0.0 Final",
+    "qualifiers": {"mykey": "my value"},
+    "subpath": null,
+    "is_invalid": false
+  },
+  {
+    "description": "checks for invalid qualifier keys",
+    "purl": "pkg:npm/myartifact@1.0.0?in%20production=true",
+    "canonical_purl": null,
+    "type": "npm",
+    "namespace": null,
+    "name": "myartifact",
+    "version": "1.0.0",
+    "qualifiers": {"in production": "true"},
+    "subpath": null,
+    "is_invalid": true
   }
 ]
diff --git a/test_purl.py b/test_purl.py