Skip to content

fix(pypi): make the URL/filename extraction from requirement more robust #2871

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,7 @@ max_line_length = 100
[*.{py,bzl}]
indent_style = space
indent_size = 4

# different overrides for git commit messages
[.git/COMMIT_EDITMSG]
max_line_length = 72
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ END_UNRELEASED_TEMPLATE
multiple times.
* (tools/wheelmaker.py) Extras are now preserved in Requires-Dist metadata when using requires_file
to specify the requirements.
* (pypi) Use bazel downloader for direct URL references and correctly detect the filenames from
various URL formats - URL encoded version strings get correctly resolved, sha256 value can be
also retrieved from the URL as opposed to only the `--hash` parameter. Fixes
[#2363](https://github.com/bazel-contrib/rules_python/issues/2363).

{#v0-0-0-added}
### Added
Expand Down
4 changes: 2 additions & 2 deletions python/private/pypi/extension.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def _whl_repos(*, requirement, whl_library_args, download_only, netrc, auth_patt
# This is no-op because pip is not used to download the wheel.
args.pop("download_only", None)

args["requirement"] = requirement.srcs.requirement
args["requirement"] = requirement.line
args["urls"] = [distribution.url]
args["sha256"] = distribution.sha256
args["filename"] = distribution.filename
Expand Down Expand Up @@ -338,7 +338,7 @@ def _whl_repos(*, requirement, whl_library_args, download_only, netrc, auth_patt

# Fallback to a pip-installed wheel
args = dict(whl_library_args) # make a copy
args["requirement"] = requirement.srcs.requirement_line
args["requirement"] = requirement.line
if requirement.extra_pip_args:
args["extra_pip_args"] = requirement.extra_pip_args

Expand Down
42 changes: 40 additions & 2 deletions python/private/pypi/index_sources.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@
A file that houses private functions used in the `bzlmod` extension with the same name.
"""

# Just list them here and me super conservative
_KNOWN_EXTS = [
# Note, the following source in pip has more extensions
# https://github.com/pypa/pip/blob/3c5a189141a965f21a473e46c3107e689eb9f79f/src/pip/_vendor/distlib/locators.py#L90
#
# ".tar.bz2",
# ".tar",
# ".tgz",
# ".tbz",
#
# But we support only the following, used in 'packaging'
# https://github.com/pypa/pip/blob/3c5a189141a965f21a473e46c3107e689eb9f79f/src/pip/_vendor/packaging/utils.py#L137
".whl",
".tar.gz",
".zip",
]

def index_sources(line):
"""Get PyPI sources from a requirements.txt line.

Expand Down Expand Up @@ -58,16 +75,37 @@ def index_sources(line):
).strip()

url = ""
filename = ""
if "@" in head:
requirement = requirement_line
_, _, url_and_rest = requirement.partition("@")
maybe_requirement, _, url_and_rest = requirement.partition("@")
url = url_and_rest.strip().partition(" ")[0].strip()

url, _, sha256 = url.partition("#sha256=")
if sha256:
shas.append(sha256)
_, _, filename = url.rpartition("/")

# Replace URL encoded characters and luckily there is only one case
filename = filename.replace("%2B", "+")
is_known_ext = False
for ext in _KNOWN_EXTS:
if filename.endswith(ext):
is_known_ext = True
break

if is_known_ext:
requirement = maybe_requirement.strip()
else:
# could not detect filename from the URL
filename = ""
requirement = requirement_line

return struct(
requirement = requirement,
requirement_line = requirement_line,
version = version,
shas = sorted(shas),
marker = marker,
url = url,
filename = filename,
)
35 changes: 16 additions & 19 deletions python/private/pypi/parse_requirements.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def parse_requirements(
extra_pip_args = [],
get_index_urls = None,
evaluate_markers = None,
extract_url_srcs = True,
logger = None):
"""Get the requirements with platforms that the requirements apply to.

Expand All @@ -58,6 +59,8 @@ def parse_requirements(
the platforms stored as values in the input dict. Returns the same
dict, but with values being platforms that are compatible with the
requirements line.
extract_url_srcs: A boolean to enable extracting URLs from requirement
lines to enable using bazel downloader.
logger: repo_utils.logger or None, a simple struct to log diagnostic messages.

Returns:
Expand Down Expand Up @@ -206,7 +209,7 @@ def parse_requirements(
ret_requirements.append(
struct(
distribution = r.distribution,
srcs = r.srcs,
line = r.srcs.requirement if extract_url_srcs and (whls or sdist) else r.srcs.requirement_line,
target_platforms = sorted(target_platforms),
extra_pip_args = r.extra_pip_args,
whls = whls,
Expand Down Expand Up @@ -281,32 +284,26 @@ def _add_dists(*, requirement, index_urls, logger = None):
logger: A logger for printing diagnostic info.
"""

# Handle direct URLs in requirements
if requirement.srcs.url:
url = requirement.srcs.url
_, _, filename = url.rpartition("/")
filename, _, _ = filename.partition("#sha256=")
if "." not in filename:
# detected filename has no extension, it might be an sdist ref
# TODO @aignas 2025-04-03: should be handled if the following is fixed:
# https://github.com/bazel-contrib/rules_python/issues/2363
if not requirement.srcs.filename:
if logger:
logger.debug(lambda: "Could not detect the filename from the URL, falling back to pip: {}".format(
requirement.srcs.url,
))
return [], None

if "@" in filename:
# this is most likely foo.git@git_sha, skip special handling of these
return [], None

direct_url_dist = struct(
url = url,
filename = filename,
# Handle direct URLs in requirements
dist = struct(
url = requirement.srcs.url,
filename = requirement.srcs.filename,
sha256 = requirement.srcs.shas[0] if requirement.srcs.shas else "",
yanked = False,
)

if filename.endswith(".whl"):
return [direct_url_dist], None
if dist.filename.endswith(".whl"):
return [dist], None
else:
return [], direct_url_dist
return [], dist

if not index_urls:
return [], None
Expand Down
9 changes: 5 additions & 4 deletions python/private/pypi/pip_repository.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -89,19 +89,20 @@ def _pip_repository_impl(rctx):
python_interpreter_target = rctx.attr.python_interpreter_target,
srcs = rctx.attr._evaluate_markers_srcs,
),
extract_url_srcs = False,
)
selected_requirements = {}
options = None
repository_platform = host_platform(rctx)
for name, requirements in requirements_by_platform.items():
r = select_requirement(
requirement = select_requirement(
requirements,
platform = None if rctx.attr.download_only else repository_platform,
)
if not r:
if not requirement:
continue
options = options or r.extra_pip_args
selected_requirements[name] = r.srcs.requirement_line
options = options or requirement.extra_pip_args
selected_requirements[name] = requirement.line

bzl_packages = sorted(selected_requirements.keys())

Expand Down
12 changes: 1 addition & 11 deletions python/private/pypi/whl_library.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -258,19 +258,9 @@ def _whl_library_impl(rctx):
# Simulate the behaviour where the whl is present in the current directory.
rctx.symlink(whl_path, whl_path.basename)
whl_path = rctx.path(whl_path.basename)
elif rctx.attr.urls:
elif rctx.attr.urls and rctx.attr.filename:
filename = rctx.attr.filename
urls = rctx.attr.urls
if not filename:
_, _, filename = urls[0].rpartition("/")

if not (filename.endswith(".whl") or filename.endswith("tar.gz") or filename.endswith(".zip")):
if rctx.attr.filename:
msg = "got '{}'".format(filename)
else:
msg = "detected '{}' from url:\n{}".format(filename, urls[0])
fail("Only '.whl', '.tar.gz' or '.zip' files are supported, {}".format(msg))

result = rctx.download(
url = urls,
output = filename,
Expand Down
6 changes: 3 additions & 3 deletions tests/pypi/extension/extension_tests.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,7 @@ git_dep @ git+https://git.server/repo/project@deadbeefdeadbeef
"extra_pip_args": ["--extra-args-for-sdist-building"],
"filename": "any-name.tar.gz",
"python_interpreter_target": "unit_test_interpreter_target",
"requirement": "direct_sdist_without_sha @ some-archive/any-name.tar.gz",
"requirement": "direct_sdist_without_sha",
"sha256": "",
"urls": ["some-archive/any-name.tar.gz"],
},
Expand All @@ -824,7 +824,7 @@ git_dep @ git+https://git.server/repo/project@deadbeefdeadbeef
],
"filename": "direct_without_sha-0.0.1-py3-none-any.whl",
"python_interpreter_target": "unit_test_interpreter_target",
"requirement": "direct_without_sha==0.0.1 @ example-direct.org/direct_without_sha-0.0.1-py3-none-any.whl",
"requirement": "direct_without_sha==0.0.1",
"sha256": "",
"urls": ["example-direct.org/direct_without_sha-0.0.1-py3-none-any.whl"],
},
Expand Down Expand Up @@ -891,7 +891,7 @@ git_dep @ git+https://git.server/repo/project@deadbeefdeadbeef
],
"filename": "some_pkg-0.0.1-py3-none-any.whl",
"python_interpreter_target": "unit_test_interpreter_target",
"requirement": "some_pkg==0.0.1 @ example-direct.org/some_pkg-0.0.1-py3-none-any.whl --hash=sha256:deadbaaf",
"requirement": "some_pkg==0.0.1",
"sha256": "deadbaaf",
"urls": ["example-direct.org/some_pkg-0.0.1-py3-none-any.whl"],
},
Expand Down
39 changes: 35 additions & 4 deletions tests/pypi/index_sources/index_sources_tests.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -23,52 +23,83 @@ def _test_no_simple_api_sources(env):
inputs = {
"foo @ git+https://github.com/org/foo.git@deadbeef": struct(
requirement = "foo @ git+https://github.com/org/foo.git@deadbeef",
requirement_line = "foo @ git+https://github.com/org/foo.git@deadbeef",
marker = "",
url = "git+https://github.com/org/foo.git@deadbeef",
shas = [],
version = "",
filename = "",
),
"foo==0.0.1": struct(
requirement = "foo==0.0.1",
requirement_line = "foo==0.0.1",
marker = "",
url = "",
version = "0.0.1",
filename = "",
),
"foo==0.0.1 @ https://someurl.org": struct(
requirement = "foo==0.0.1 @ https://someurl.org",
requirement_line = "foo==0.0.1 @ https://someurl.org",
marker = "",
url = "https://someurl.org",
version = "0.0.1",
filename = "",
),
"foo==0.0.1 @ https://someurl.org/package.whl": struct(
requirement = "foo==0.0.1 @ https://someurl.org/package.whl",
requirement = "foo==0.0.1",
requirement_line = "foo==0.0.1 @ https://someurl.org/package.whl",
marker = "",
url = "https://someurl.org/package.whl",
version = "0.0.1",
filename = "package.whl",
),
"foo==0.0.1 @ https://someurl.org/package.whl --hash=sha256:deadbeef": struct(
requirement = "foo==0.0.1 @ https://someurl.org/package.whl --hash=sha256:deadbeef",
requirement = "foo==0.0.1",
requirement_line = "foo==0.0.1 @ https://someurl.org/package.whl --hash=sha256:deadbeef",
marker = "",
url = "https://someurl.org/package.whl",
shas = ["deadbeef"],
version = "0.0.1",
filename = "package.whl",
),
"foo==0.0.1 @ https://someurl.org/package.whl; python_version < \"2.7\"\\ --hash=sha256:deadbeef": struct(
requirement = "foo==0.0.1 @ https://someurl.org/package.whl --hash=sha256:deadbeef",
requirement = "foo==0.0.1",
requirement_line = "foo==0.0.1 @ https://someurl.org/package.whl --hash=sha256:deadbeef",
marker = "python_version < \"2.7\"",
url = "https://someurl.org/package.whl",
shas = ["deadbeef"],
version = "0.0.1",
filename = "package.whl",
),
"foo[extra] @ https://example.org/foo-1.0.tar.gz --hash=sha256:deadbe0f": struct(
requirement = "foo[extra]",
requirement_line = "foo[extra] @ https://example.org/foo-1.0.tar.gz --hash=sha256:deadbe0f",
marker = "",
url = "https://example.org/foo-1.0.tar.gz",
shas = ["deadbe0f"],
version = "",
filename = "foo-1.0.tar.gz",
),
"torch @ https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=deadbeef": struct(
requirement = "torch",
requirement_line = "torch @ https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl#sha256=deadbeef",
marker = "",
url = "https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl",
shas = ["deadbeef"],
version = "",
filename = "torch-2.6.0+cpu-cp311-cp311-linux_x86_64.whl",
),
}
for input, want in inputs.items():
got = index_sources(input)
env.expect.that_collection(got.shas).contains_exactly(want.shas if hasattr(want, "shas") else [])
env.expect.that_str(got.version).equals(want.version)
env.expect.that_str(got.requirement).equals(want.requirement)
env.expect.that_str(got.requirement_line).equals(got.requirement)
env.expect.that_str(got.requirement_line).equals(got.requirement_line)
env.expect.that_str(got.marker).equals(want.marker)
env.expect.that_str(got.url).equals(want.url)
env.expect.that_str(got.filename).equals(want.filename)

_tests.append(_test_no_simple_api_sources)

Expand Down
Loading