Skip to content

Commit 484d21c

Browse files
committed
Add a url2purl module for Package URL creation from arbitrary URLs
Includes: - Support for maven, npm, rubygems - route.py routing module - Data driven test suite Signed-off-by: Thomas Druez <tdruez@nexb.com>
1 parent 254876f commit 484d21c

File tree

8 files changed

+551
-3
lines changed

8 files changed

+551
-3
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,10 @@
55
/.eggs/
66
/.cache/
77
/src/packageurl_python.egg-info/
8+
lib
9+
bin
10+
.idea
11+
include
12+
pip-selfcheck.json
13+
pyvenv.cfg
14+
.Python

MANIFEST.in

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
graft src
2+
graft tests
23

34
include mit.LICENSE
45
include setup.py
@@ -8,7 +9,5 @@ include MANIFEST.in
89
include requirements_tests.txt
910
include CHANGELOG.rst
1011
include CONTRIBUTING.rst
11-
include test-suite-data.json
12-
include test_purl.py
1312

1413
global-exclude *.py[co] __pycache__

src/packageurl/contrib/route.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
#
2+
# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
3+
#
4+
5+
from __future__ import absolute_import
6+
from __future__ import print_function
7+
from __future__ import unicode_literals
8+
9+
from collections import OrderedDict
10+
from functools import wraps
11+
import inspect
12+
import re
13+
14+
15+
"""
16+
Given a URI regex (or some string), this module can route execution to a
17+
callable.
18+
19+
There are several routing implementations available in Rails, Django, Flask,
20+
Paste, etc. However, these all assume that the routed processing is to craft a
21+
response to an incoming external HTTP request.
22+
23+
Here we are instead doing the opposite: given a URI (and no request yet) we are
24+
routing the processing to emit a request externally (HTTP or other protocol)
25+
and handling its response.
26+
27+
Also we crawl a lot and not only HTTP: git, svn, ftp, rsync and more.
28+
This simple library supports this kind of arbitrary URI routing.
29+
30+
This is inspired by Guido's http://www.artima.com/weblogs/viewpost.jsp?thread=101605
31+
and Django, Flask, Werkzeug and other url dispatch and routing design from web
32+
frameworks.
33+
https://github.com/douban/brownant has a similar approach, using
34+
Werkzeug with the limitation that it does not route based on URI scheme and is
35+
limited to HTTP.
36+
"""
37+
38+
39+
class Rule(object):
    """
    A rule is a mapping between a pattern (typically a URI) and a callable
    (typically a function).

    The pattern is a regex string and must match a string (typically a URI)
    entirely for the rule to be considered, i.e. for the endpoint to be
    resolved and eventually invoked for that string.
    """
    def __init__(self, pattern, endpoint):
        """
        Compile `pattern` into a whole-string matcher and keep `endpoint`.

        `pattern` is a regex string; leading '^' and trailing unescaped '$'
        anchors are dropped and re-added here. `endpoint` must be callable;
        if it is a class, an instance created with no argument must be
        callable too.
        """
        self.pattern = self._strip_anchors(pattern)
        # Wrap the pattern in a non-capturing group: without it, a top-level
        # alternation such as 'a|b' would compile to '^a|b$' and the anchors
        # would each apply to only one branch, defeating the full match.
        self.pattern_match = re.compile('^(?:' + self.pattern + ')$').match

        # ensure the endpoint is callable
        assert callable(endpoint)
        # classes are not always callable, make an extra check
        if inspect.isclass(endpoint):
            obj = endpoint()
            assert callable(obj)

        self.endpoint = endpoint

    @staticmethod
    def _strip_anchors(pattern):
        """
        Return `pattern` without leading '^' and trailing '$' anchors.
        A trailing '$' preceded by an odd number of backslashes is an escaped
        literal dollar sign, not an anchor, and is kept.
        """
        body = pattern.lstrip('^')
        while body.endswith('$'):
            head = body[:-1]
            backslashes = len(head) - len(head.rstrip('\\'))
            if backslashes % 2:
                break
            body = head
        return body

    def __repr__(self):
        # Not every callable has a __name__ (e.g. callable instances): fall
        # back to the type name so that repr() never raises.
        name = getattr(self.endpoint, '__name__', None)
        if not name:
            name = type(self.endpoint).__name__
        module = getattr(self.endpoint, '__module__', None)
        return 'Rule(r"""{}""", {}.{})'.format(self.pattern, module, name)

    def match(self, string):
        """
        Match a string against the rule pattern. Return a truthy regex match
        object if the whole string matches, None otherwise.
        """
        return self.pattern_match(string)
71+
72+
73+
class RouteAlreadyDefined(TypeError):
    """
    Error raised when a route Rule is already present in the route map for
    a pattern.
    """
77+
78+
79+
class NoRouteAvailable(TypeError):
    """
    Error raised when no route is available.
    """
83+
84+
85+
class MultipleRoutesDefined(TypeError):
    """
    Error raised when more than one route is possible.
    """
89+
90+
91+
class Router(object):
    """
    A router is:
    - a container for a route map, consisting of several rules, stored in an
      ordered dictionary keyed by pattern text
    - a way to process a route, i.e. given a string (typically a URI), find the
      correct rule and invoke its callable endpoint
    - and a convenience decorator for routed callables (either a function or
      something with a __call__ method)

    Multiple routers can co-exist as needed, such as a router to collect,
    another to fetch, etc.
    """
    def __init__(self, route_map=None):
        """
        'route_map' is an ordered mapping of pattern -> Rule.
        A new empty OrderedDict is created only when no mapping is provided,
        so that a caller-supplied mapping (even an empty one) is kept as-is.
        """
        self.route_map = OrderedDict() if route_map is None else route_map
        # lazy cached pre-compiled regex match() for all route patterns
        self._is_routable = None

    def __repr__(self):
        return repr(self.route_map)

    def __iter__(self):
        """
        Iterate over the (pattern, Rule) items of the route map.
        """
        return iter(self.route_map.items())

    def keys(self):
        """
        Return the patterns of the route map.
        """
        return self.route_map.keys()

    def append(self, pattern, endpoint):
        """
        Append a new pattern and endpoint Rule at the end of the map.
        Use this as an alternative to the route decorator.
        Raise RouteAlreadyDefined if `pattern` is already in the map.
        """
        if pattern in self.route_map:
            raise RouteAlreadyDefined(pattern)
        self.route_map[pattern] = Rule(pattern, endpoint)
        # The cached alternation regex no longer covers every pattern: drop
        # it so that is_routable() rebuilds it on its next call.
        self._is_routable = None

    def route(self, *patterns):
        """
        Decorator to make a callable 'endpoint' routed to one or more patterns.

        Example:
        >>> my_router = Router()
        >>> @my_router.route('http://nexb.com', 'http://deja.com')
        ... def somefunc(uri):
        ...    pass
        """
        def decorator(endpoint):
            assert patterns
            for pat in patterns:
                self.append(pat, endpoint)

            # Calling the decorated name goes through the router: the rule
            # is resolved from the first argument on every call.
            @wraps(endpoint)
            def decorated(*args, **kwargs):
                return self.process(*args, **kwargs)
            return decorated

        return decorator

    def process(self, string, *args, **kwargs):
        """
        Given a string (typically a URI), resolve this string to an endpoint
        by searching available rules then execute the endpoint callable for
        that string passing down all arguments to the endpoint invocation.
        """
        endpoint = self.resolve(string)
        if inspect.isclass(endpoint):
            # instantiate a class, that must define a __call__ method
            # TODO: consider passing args to the constructor?
            endpoint = endpoint()
        # call the callable
        return endpoint(string, *args, **kwargs)

    def resolve(self, string):
        """
        Resolve a string: given a string (typically a URI) resolve and
        return the best endpoint function for that string.

        Raise NoRouteAvailable if no rule matches.

        Ambiguous resolution is not allowed in order to keep things in
        check when there are hundreds rules: if multiple routes are
        possible for a string (typically a URI), a MultipleRoutesDefined
        TypeError is raised.
        """
        # TODO: we could improve the performance of this by using a single
        # regex and named groups if this ever becomes a bottleneck.
        candidates = [r for r in self.route_map.values() if r.match(string)]

        if not candidates:
            raise NoRouteAvailable(string)

        if len(candidates) > 1:
            # this can happen when multiple patterns match the same string
            # we raise an exception with enough debugging information
            pats = repr([r.pattern for r in candidates])
            msg = '%r matches multiple patterns %r' % (string, pats)
            raise MultipleRoutesDefined(msg)

        return candidates[0].endpoint

    def is_routable(self, string):
        """
        Return True if `string` is routable by this router, e.g. if it
        matches any of the route patterns. Return False otherwise, including
        for an empty `string`.
        """
        if not string:
            return False

        if self._is_routable is None:
            # build an alternation regex; the group is non-capturing so it
            # does not shift group numbers used inside the patterns
            routables = '^(?:' + '|'.join(self.route_map) + ')$'
            self._is_routable = re.compile(routables, re.UNICODE).match

        return bool(self._is_routable(string))

src/packageurl/contrib/url2purl.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
#
2+
# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
3+
#
4+
5+
from __future__ import absolute_import
6+
from __future__ import print_function
7+
from __future__ import unicode_literals
8+
9+
import os
10+
11+
try:
12+
from urlparse import urlparse # Python 2
13+
from urllib import unquote_plus
14+
except ImportError:
15+
from urllib.parse import urlparse # Python 3
16+
from urllib.parse import unquote_plus
17+
18+
from packageurl import PackageURL
19+
from packageurl.contrib.route import Router
20+
from packageurl.contrib.route import NoRouteAvailable
21+
22+
23+
"""
24+
This module helps build a PackageURL from an arbitrary URL.
25+
This uses the routing mechanism available in the route.py module.
26+
27+
In order to make it easy to use, it contains all the conversion functions
28+
in this single Python script.
29+
"""
30+
31+
32+
# Module-level Router: the build_* functions of this module register their
# URL patterns on it with @purl_router.route, and get_purl() dispatches a URL
# through it.
purl_router = Router()
33+
34+
35+
def get_purl(uri):
    """
    Return the result of routing the `uri` string through `purl_router`
    (a PackageURL inferred from `uri`), or None when `uri` is empty or when
    no route is available for it.
    """
    if not uri:
        return None

    try:
        purl = purl_router.process(uri)
    except NoRouteAvailable:
        return None
    return purl
44+
45+
46+
@purl_router.route('https?://registry.npmjs.*/.*',
                   'https?://registry.yarnpkg.com/.*')
def build_npm_url(uri):
    """
    Return the result of the npm download builder when `uri` contains a
    '/-/' marker, and of the npm API builder otherwise.
    """
    # npm URLs are difficult to disambiguate with regex
    is_download = '/-/' in uri
    builder = build_npm_download_purl if is_download else build_npm_api_purl
    return builder(uri)
54+
55+
56+
def build_npm_api_purl(uri):
    """
    Return an npm PackageURL built from the two path segments of `uri`, or
    None when the unquoted path does not have exactly two non-empty segments.
    """
    url_path = unquote_plus(urlparse(uri).path)
    parts = list(filter(None, url_path.split('/')))

    if len(parts) != 2:
        return None

    first, second = parts

    # /@invisionag/eslint-config-ivx
    if first.startswith('@'):
        return PackageURL('npm', first, second)

    # /angular/1.6.6
    return PackageURL('npm', name=first, version=second)
74+
75+
76+
def build_npm_download_purl(uri):
    """
    Return an npm PackageURL built from a download (tarball) `uri`, or None
    when the unquoted path, ignoring '-' segments, does not have two or three
    non-empty segments.
    """
    path = unquote_plus(urlparse(uri).path)
    segments = [seg for seg in path.split('/') if seg and seg != '-']
    len_segments = len(segments)

    # /@invisionag/eslint-config-ivx/-/eslint-config-ivx-0.0.2.tgz
    if len_segments == 3:
        namespace, name, filename = segments

    # /automatta/-/automatta-0.0.1.tgz
    elif len_segments == 2:
        namespace = None
        name, filename = segments

    else:
        return None

    base_filename, _ext = os.path.splitext(filename)

    # The version is what follows the "<name>-" prefix. Taking only the text
    # after the last dash would truncate versions that contain a dash
    # themselves, such as "1.0.0-beta.1".
    prefix = name + '-'
    if base_filename.startswith(prefix):
        version = base_filename[len(prefix):]
    else:
        # unexpected filename: keep the text after the last dash
        version = base_filename.split('-')[-1]

    return PackageURL('npm', namespace, name, version)
97+
98+
99+
@purl_router.route('https?://repo1.maven.org/maven2/.*',
                   'https?://central.maven.org/maven2/.*',
                   'maven-index://repo1.maven.org/.*')
def build_maven_purl(uri):
    """
    Return a maven PackageURL built from the path segments of `uri`.

    The last segments are read as name, version and an optional filename;
    the preceding segments are joined with dots to form the namespace.
    Return None when there are fewer than three segments or when a filename
    is present but has no extension after the "<name>-<version>" part.
    """
    url_path = unquote_plus(urlparse(uri).path)
    parts = [part for part in url_path.split('/')
             if part and part != 'maven2']

    if len(parts) < 3:
        return None

    # The last segment is a filename when it contains the segment before it
    # (the version).
    filename = None
    if parts[-2] in parts[-1]:
        filename = parts.pop()

    name, version = parts[-2:]
    namespace = '.'.join(parts[:-2])
    qualifiers = {}

    if filename:
        # what follows "<name>-<version>" is "[-<classifier>].<extension>"
        tail = filename.rpartition('{}-{}'.format(name, version))[2]
        classifier, _, extension = tail.partition('.')
        if not extension:
            return None

        qualifiers['classifier'] = classifier.strip('-')

        valid_types = ('aar', 'ear', 'mar', 'pom', 'rar', 'rpm',
                       'sar', 'tar.gz', 'war', 'zip')
        if extension in valid_types:
            qualifiers['type'] = extension

    return PackageURL('maven', namespace, name, version, qualifiers)
136+
137+
138+
@purl_router.route('https?://rubygems.org/downloads/.*')
def build_rubygems_url(uri):
    """
    Return a rubygems PackageURL built from a "<name>-<version>.gem" download
    `uri`, or None when `uri` does not end with '.gem'.
    """
    if not uri.endswith('.gem'):
        return None

    path = unquote_plus(urlparse(uri).path)
    last_segment = path.split('/')[-1]

    # Remove the '.gem' extension as a suffix. str.rstrip('.gem') would strip
    # any trailing '.', 'g', 'e' or 'm' characters instead and damage
    # versions such as '1.0.pre'.
    extension = '.gem'
    if last_segment.endswith(extension):
        archive_basename = last_segment[:-len(extension)]
    else:
        archive_basename = last_segment

    name, _, version = archive_basename.rpartition('-')

    return PackageURL('rubygems', name=name, version=version)

0 commit comments

Comments
 (0)