Skip to content

Improve unicode handling #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

setup(
name='packageurl-python',
version='0.6.0',
version='0.7.0',
license='MIT',
description='A "purl" aka. package URL parser and builder',
long_description='Python library to parse and build "purl" aka. package URLs. '
Expand Down
235 changes: 153 additions & 82 deletions src/packageurl.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) the purl authors
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
Expand All @@ -21,31 +23,33 @@
# Visit https://github.com/package-url/packageurl-python for support and
# download.


from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

from collections import namedtuple
from collections import OrderedDict
import string

# Python 2 and 3 support
try:
# Python 2
from urlparse import urlsplit
from urllib import quote as percent_quote
from urllib import unquote as percent_unquote
from urlparse import urlsplit as _urlsplit
from urllib import quote as _percent_quote
from urllib import unquote as _percent_unquote
except ImportError:
# Python 3
from urllib.parse import urlsplit
from urllib.parse import quote as percent_quote
from urllib.parse import unquote as percent_unquote
from urllib.parse import urlsplit as _urlsplit
from urllib.parse import quote as _percent_quote
from urllib.parse import unquote as _percent_unquote

# Python 2 and 3 support
try:
# Python 2
unicode
str = unicode # NOQA
basestring = basestring # NOQA
bytes = str # NOQA
str = unicode # NOQA
except NameError:
# Python 3
unicode = str # NOQA
Expand All @@ -59,10 +63,27 @@

def quote(s):
"""
Percent-encode a string, except for colon :
Return a percent-encoded unicode string, except for colon :, given an `s`
byte or unicode string.
"""
if isinstance(s, unicode):
s = s.encode('utf-8')
quoted = _percent_quote(s)
if not isinstance(quoted, unicode):
quoted = quoted.decode('utf-8')
quoted = quoted.replace('%3A', ':')
return quoted


def unquote(s):
"""
Return a percent-decoded unicode string, given an `s` byte or unicode
string.
"""
quoted = percent_quote(s)
return quoted.replace('%3A', ':')
unquoted = _percent_unquote(s)
if not isinstance(unquoted, unicode):
unquoted = unquoted .decode('utf-8')
return unquoted


def get_quoter(encode=True):
Expand All @@ -72,96 +93,146 @@ def get_quoter(encode=True):
if encode is True:
return quote
elif encode is False:
return percent_unquote
return unquote
elif encode is None:
return lambda x: x


def normalize_qualifiers(qualifiers, encode=True):
"""
Return normalized qualifiers.
def normalize_type(type, encode=True): # NOQA
if not type:
return
if not isinstance(type, unicode):
type = type.decode('utf-8') # NOQA

If `qualifiers` is a dictionary of qualifiers and values and `encode` is true,
the dictionary is then converted to a string of qualifiers, formatted to the purl specifications.
quoter = get_quoter(encode)
type = quoter(type) # NOQA
return type.strip().lower() or None

If `qualifiers` is a string of qualifiers, formatted to the purl specifications, and `encode`
is false, the string is then converted to a dictionary of qualifiers and their values.
"""
quoting = get_quoter(encode)

if qualifiers:
if isinstance(qualifiers, basestring):
# decode string to dict
qualifiers = qualifiers.split('&')
qualifiers = [kv.partition('=') for kv in qualifiers]
if qualifiers:
qualifiers = [(k, v) for k, _, v in qualifiers]
else:
qualifiers = []
elif isinstance(qualifiers, (dict, OrderedDict,)):
qualifiers = qualifiers.items()
else:
raise ValueError(
'Invalid qualifier. '
'Must be a string or dict:{}'.format(repr(qualifiers)))

if qualifiers:
qualifiers = {
k.strip().lower(): quoting(v)
for k, v in qualifiers
if k and k.strip() and v and v.strip()
}
def normalize_namespace(namespace, ptype, encode=True): # NOQA
if not namespace:
return
if not isinstance(namespace, unicode):
namespace = namespace.decode('utf-8')

if qualifiers and encode is True:
# encode dict as a string
qualifiers = sorted(qualifiers.items())
qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
qualifiers = '&'.join(qualifiers)
namespace = namespace.strip().strip('/')
if ptype in ('bitbucket', 'github', 'pypi'):
namespace = namespace.lower()
segments = [seg for seg in namespace.split('/') if seg.strip()]
segments = map(get_quoter(encode), segments)
return '/'.join(segments) or None

return qualifiers or None

def normalize_name(name, ptype, encode=True): # NOQA
if not name:
return
if not isinstance(name, unicode):
name = name.decode('utf-8')

def normalize(type, namespace, name, version, qualifiers, subpath, encode=True): # NOQA
quoter = get_quoter(encode)
name = quoter(name)
name = name.strip().strip('/')
if ptype in ('bitbucket', 'github', 'pypi',):
name = name.lower()
if ptype in ('pypi',):
name = name.replace('_', '-')
return name or None


def normalize_version(version, encode=True):  # NOQA
    """
    Return the normalized `version` as a unicode string, or None when
    `version` is empty or normalizes to an empty string.

    A `version` that is not already unicode is decoded as UTF-8. Surrounding
    whitespace is stripped, then the callable returned by
    `get_quoter(encode)` is applied to the result.
    """
    if not version:
        return None

    text = version if isinstance(version, unicode) else version.decode('utf-8')

    transform = get_quoter(encode)
    # An empty result is reported as None rather than as an empty string.
    return transform(text.strip()) or None


def normalize_qualifiers(qualifiers, encode=True): # NOQA
"""
Return normalized purl components.
Return normalized `qualifiers` as a mapping (or as a string if `encode` is
True). The `qualifiers` arg is either a mapping or a string.
Always return a mapping (and never None) if `encode` is False or None.
Raise ValueError on errors.
"""
quoting = get_quoter(encode)
if not qualifiers:
return None if encode else {}

if isinstance(qualifiers, basestring):
if not isinstance(qualifiers, unicode):
qualifiers = qualifiers.decode('utf-8')
# decode string to list of tuples
qualifiers = qualifiers.split('&')
qualifiers = [kv.partition('=') for kv in qualifiers]
qualifiers = [(k, v) for k, _, v in qualifiers]
elif isinstance(qualifiers, dict):
qualifiers = qualifiers.items()
else:
raise ValueError(
'Invalid qualifier. '
'Must be a string or dict:{}'.format(repr(qualifiers)))

quoter = get_quoter(encode)
qualifiers = {k.strip().lower(): quoter(v)
for k, v in qualifiers if k and k.strip() and v and v.strip()}

valid_chars = string.ascii_letters + string.digits + '.-_'
for key in qualifiers:
if not key:
raise ValueError('A qualifier key cannot be empty')

if '%' in key:
raise ValueError(
"A qualifier key cannot be percent encoded: {}".format(repr(key)))

if ' ' in key:
raise ValueError(
"A qualifier key cannot contain spaces: {}".format(repr(key)))

if type:
type = type.strip().lower() # NOQA
if not all(c in valid_chars for c in key):
raise ValueError(
"A qualifier key must be composed only of ASCII letters and numbers"
"period, dash and underscore: {}".format(repr(key)))

if namespace:
namespace = namespace.strip().strip('/')
if type in ('bitbucket', 'github', 'pypi'):
namespace = namespace.lower()
segments = namespace.split('/')
segments = [seg for seg in segments if seg and seg.strip()]
segments = map(quoting, segments)
namespace = '/'.join(segments)
if key[0] in string.digits:
raise ValueError(
"A qualifier key cannot start with a number: {}".format(repr(key)))

if name:
name = name.strip().strip('/')
if type in ('bitbucket', 'github', 'pypi',):
name = name.lower()
if type in ('pypi',):
name = name.replace('_', '-')
name = quoting(name)
if encode:
qualifiers = sorted(qualifiers.items())
qualifiers = ['{}={}'.format(k, v) for k, v in qualifiers]
qualifiers = '&'.join(qualifiers)
return qualifiers or None
else:
return qualifiers or {}

name = name or None

if version:
version = quoting(version.strip())
def normalize_subpath(subpath, encode=True): # NOQA
if not subpath:
return None
if not isinstance(subpath, unicode):
subpath = subpath.decode('utf-8')

qualifiers = normalize_qualifiers(qualifiers, encode)
quoter = get_quoter(encode)
segments = subpath.split('/')
segments = [quoter(s) for s in segments if s.strip() and s not in ('.', '..')]
subpath = '/'.join(segments)
return subpath or None

if subpath:
segments = subpath.split('/')
segments = [quoting(s) for s in segments if s and s.strip()
and s not in ('.', '..')]
subpath = '/'.join(segments)

return (type or None, namespace or None, name or None, version or None,
qualifiers or None, subpath or None)
def normalize(type, namespace, name, version, qualifiers, subpath, encode=True):  # NOQA
    """
    Return a (type, namespace, name, version, qualifiers, subpath) tuple of
    normalized purl components.

    Each component is handed to its dedicated `normalize_*` helper together
    with the `encode` flag. The type is normalized first because the
    namespace and name helpers receive the normalized type as an argument.
    """
    ptype = normalize_type(type, encode)
    return (
        ptype,
        normalize_namespace(namespace, ptype, encode),
        normalize_name(name, ptype, encode),
        normalize_version(version, encode),
        normalize_qualifiers(qualifiers, encode),
        normalize_subpath(subpath, encode),
    )


_components = ['type', 'namespace', 'name', 'version', 'qualifiers', 'subpath']
Expand Down Expand Up @@ -191,7 +262,7 @@ def __new__(self, type=None, namespace=None, name=None, # NOQA
raise ValueError('Invalid purl: {} argument must be a string: {}.'
.format(key, repr(value)))

if qualifiers and not isinstance(qualifiers, (basestring, dict, OrderedDict,)):
if qualifiers and not isinstance(qualifiers, (basestring, dict,)):
raise ValueError('Invalid purl: {} argument must be a dict or a string: {}.'
.format('qualifiers', repr(qualifiers)))

Expand Down Expand Up @@ -268,7 +339,7 @@ def from_string(cls, purl):
'purl is missing the required '
'type component: {}.'.format(repr(purl)))

scheme, authority, path, qualifiers, subpath = urlsplit(
scheme, authority, path, qualifiers, subpath = _urlsplit(
url=remainder, scheme='', allow_fragments=True)

if scheme or authority:
Expand Down
36 changes: 36 additions & 0 deletions test-suite-data.json
Original file line number Diff line number Diff line change
Expand Up @@ -262,5 +262,41 @@
"qualifiers": null,
"subpath": null,
"is_invalid": false
},
{
"description": "valid maven purl with case sensitive namespace and name",
"purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3",
"canonical_purl": "pkg:maven/HTTPClient/HTTPClient@0.3-3",
"type": "maven",
"namespace": "HTTPClient",
"name": "HTTPClient",
"version": "0.3-3",
"qualifiers": null,
"subpath": null,
"is_invalid": false
},
{
"description": "valid maven purl containing a space in the version and qualifier",
"purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value",
"canonical_purl": "pkg:maven/mygroup/myartifact@1.0.0%20Final?mykey=my%20value",
"type": "maven",
"namespace": "mygroup",
"name": "myartifact",
"version": "1.0.0 Final",
"qualifiers": {"mykey": "my value"},
"subpath": null,
"is_invalid": false
},
{
"description": "checks for invalid qualifier keys",
"purl": "pkg:npm/myartifact@1.0.0?in%20production=true",
"canonical_purl": null,
"type": "npm",
"namespace": null,
"name": "myartifact",
"version": "1.0.0",
"qualifiers": {"in production": "true"},
"subpath": null,
"is_invalid": true
}
]
Loading