diff --git a/python/private/pypi/parse_requirements.bzl b/python/private/pypi/parse_requirements.bzl
index acc35b3208..78b6662d08 100644
--- a/python/private/pypi/parse_requirements.bzl
+++ b/python/private/pypi/parse_requirements.bzl
@@ -267,7 +267,7 @@ def _package_srcs(
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
)
req_line = r.srcs.requirement_line
else:
@@ -379,7 +379,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
url = requirement.srcs.url,
filename = requirement.srcs.filename,
sha256 = requirement.srcs.shas[0] if requirement.srcs.shas else "",
- yanked = False,
+ yanked = None,
)
return dist, False
@@ -403,12 +403,12 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
maybe_whl = index_urls.whls.get(sha256)
- if maybe_whl and not maybe_whl.yanked:
+ if maybe_whl and maybe_whl.yanked == None:
whls.append(maybe_whl)
continue
maybe_sdist = index_urls.sdists.get(sha256)
- if maybe_sdist and not maybe_sdist.yanked:
+ if maybe_sdist and maybe_sdist.yanked == None:
sdist = maybe_sdist
continue
@@ -416,7 +416,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
yanked = {}
for dist in whls + [sdist]:
- if dist and dist.yanked:
+ if dist and dist.yanked != None:
yanked.setdefault(dist.yanked, []).append(dist.filename)
if yanked:
logger.warn(lambda: "\n".join([
diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl
index 6778d3da16..563130791e 100644
--- a/python/private/pypi/parse_simpleapi_html.bzl
+++ b/python/private/pypi/parse_simpleapi_html.bzl
@@ -26,81 +26,177 @@ def parse_simpleapi_html(*, content):
Returns:
A list of structs with:
- * filename: The filename of the artifact.
- * version: The version of the artifact.
- * url: The URL to download the artifact.
- * sha256: The sha256 of the artifact.
- * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
- present, then the 'metadata_url' is also present. Defaults to "".
- * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
+ * filename: {type}`str` The filename of the artifact.
+ * version: {type}`str` The version of the artifact.
+ * url: {type}`str` The URL to download the artifact.
+ * sha256: {type}`str` The sha256 of the artifact.
+ * metadata_sha256: {type}`str` The whl METADATA sha256 if we can download it. If this is
+ present, then the 'metadata_url' is also present. Defaults to "".
+ * metadata_url: {type}`str` The URL for the METADATA if we can download it. Defaults to "".
+ * yanked: {type}`str | None` the yank reason if the package is yanked. If it is not yanked,
+ then it will be `None`. An empty string yank reason means that the package is yanked but
+ the reason is not provided.
"""
sdists = {}
whls = {}
- lines = content.split("= (2, 0):
# We don't expect to have version 2.0 here, but have this check in place just in case.
# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
fail("Unsupported API version: {}".format(api_version))
- # Each line follows the following pattern
- # filename
- sha256s_by_version = {}
- for line in lines[1:]:
- dist_url, _, tail = line.partition("#sha256=")
+ # 2. Iterate using find() to avoid huge list allocations from .split("
+ tag_end = content.find(">", start_tag)
+ end_tag = content.find("", tag_end)
+ if tag_end == -1 or end_tag == -1:
+ break
+
+ # Extract only the necessary slices
+ attr_part = content[start_tag + 3:tag_end]
+ filename = content[tag_end + 1:end_tag].strip()
+
+ # Update cursor for next iteration
+ cursor = end_tag + 4
+
+ # 3. Efficient Attribute Parsing
+ attrs = _parse_attrs(attr_part)
+ href = attrs.get("href", "")
+ if not href:
+ continue
- sha256, _, tail = tail.partition("\"")
+ dist_url, _, sha256 = href.partition("#sha256=")
- # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
- yanked = "data-yanked" in line
+ # Handle Yanked status
+ yanked = None
+ if "data-yanked" in attrs:
+ yanked = _unescape_pypi_html(attrs["data-yanked"])
- head, _, _ = tail.rpartition("")
- maybe_metadata, _, filename = head.rpartition(">")
version = version_from_filename(filename)
sha256s_by_version.setdefault(version, []).append(sha256)
+ # 4. Optimized Metadata Check (PEP 714)
metadata_sha256 = ""
metadata_url = ""
- for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
- metadata_marker = metadata_marker + "=\"sha256="
- if metadata_marker in maybe_metadata:
- # Implement https://peps.python.org/pep-0714/
- _, _, tail = maybe_metadata.partition(metadata_marker)
- metadata_sha256, _, _ = tail.partition("\"")
- metadata_url = dist_url + ".metadata"
- break
+
+ # Dist-info is more common in modern PyPI
+ m_val = attrs.get("data-dist-info-metadata") or attrs.get("data-core-metadata")
+ if m_val and m_val != "false":
+ _, _, metadata_sha256 = m_val.partition("sha256=")
+ metadata_url = dist_url + ".metadata"
+
+ # 5. Result object
+ dist = struct(
+ filename = filename,
+ version = version,
+ url = dist_url,
+ sha256 = sha256,
+ metadata_sha256 = metadata_sha256,
+ metadata_url = metadata_url,
+ yanked = yanked,
+ )
if filename.endswith(".whl"):
- whls[sha256] = struct(
- filename = filename,
- version = version,
- url = dist_url,
- sha256 = sha256,
- metadata_sha256 = metadata_sha256,
- metadata_url = metadata_url,
- yanked = yanked,
- )
+ whls[sha256] = dist
else:
- sdists[sha256] = struct(
- filename = filename,
- version = version,
- url = dist_url,
- sha256 = sha256,
- metadata_sha256 = "",
- metadata_url = "",
- yanked = yanked,
- )
+ sdists[sha256] = dist
return struct(
sdists = sdists,
whls = whls,
sha256s_by_version = sha256s_by_version,
)
+
+def _parse_attrs(attr_string):
+ """Parses attributes from a pre-sliced string."""
+ attrs = {}
+ parts = attr_string.split('"')
+
+ for i in range(0, len(parts) - 1, 2):
+ raw_key = parts[i].strip()
+ if not raw_key:
+ continue
+
+ key_parts = raw_key.split(" ")
+ current_key = key_parts[-1].rstrip("=")
+
+ # Batch handle booleans
+ for j in range(len(key_parts) - 1):
+ b = key_parts[j].strip()
+ if b:
+ attrs[b] = ""
+
+ attrs[current_key] = parts[i + 1]
+
+ # Final trailing boolean check
+ last = parts[-1].strip()
+ if last:
+ for b in last.split(" "):
+ if b:
+ attrs[b] = ""
+ return attrs
+
+def _unescape_pypi_html(text):
+ """Unescape HTML text.
+
+ Decodes standard HTML entities used in the Simple API.
+ Specifically targets characters used in URLs and attribute values.
+
+ Args:
+ text: {type}`str` The text to replace.
+
+ Returns:
+ A string with unescaped characters
+ """
+
+ # 1. Short circuit for the most common case
+ if not text or "&" not in text:
+ return text
+
+ # 2. Check for the most frequent PEP 503 entities first (version constraints).
+ # Re-ordering based on frequency reduces unnecessary checks for rare entities.
+ if ">" in text:
+ text = text.replace(">", ">")
+ if "<" in text:
+ text = text.replace("<", "<")
+
+ # 3. Grouped check for numeric entities.
+ # If '' isn't there, we skip 4 distinct string scans.
+ if "" in text:
+ if "'" in text:
+ text = text.replace("'", "'")
+ if "'" in text:
+ text = text.replace("'", "'")
+ if "
" in text:
+ text = text.replace("
", "\n")
+ if "
" in text:
+ text = text.replace("
", "\r")
+
+ if """ in text:
+ text = text.replace(""", '"')
+
+ # 4. Handle ampersands last to prevent double-decoding.
+ if "&" in text:
+ text = text.replace("&", "&")
+
+ return text
diff --git a/tests/pypi/hub_builder/hub_builder_tests.bzl b/tests/pypi/hub_builder/hub_builder_tests.bzl
index 27040d36d7..170e12c4e4 100644
--- a/tests/pypi/hub_builder/hub_builder_tests.bzl
+++ b/tests/pypi/hub_builder/hub_builder_tests.bzl
@@ -777,7 +777,7 @@ def _test_simple_get_index(env):
"plat_pkg": struct(
whls = {
"deadb44f": struct(
- yanked = False,
+ yanked = None,
filename = "plat-pkg-0.0.4-py3-none-linux_x86_64.whl",
sha256 = "deadb44f",
url = "example2.org/index/plat_pkg/",
@@ -792,7 +792,7 @@ def _test_simple_get_index(env):
"simple": struct(
whls = {
"deadb00f": struct(
- yanked = False,
+ yanked = None,
filename = "simple-0.0.1-py3-none-any.whl",
sha256 = "deadb00f",
url = "example2.org",
@@ -800,7 +800,7 @@ def _test_simple_get_index(env):
},
sdists = {
"deadbeef": struct(
- yanked = False,
+ yanked = None,
filename = "simple-0.0.1.tar.gz",
sha256 = "deadbeef",
url = "example.org",
@@ -811,7 +811,7 @@ def _test_simple_get_index(env):
"some_other_pkg": struct(
whls = {
"deadb33f": struct(
- yanked = False,
+ yanked = None,
filename = "some-other-pkg-0.0.1-py3-none-any.whl",
sha256 = "deadb33f",
url = "example2.org/index/some_other_pkg/",
diff --git a/tests/pypi/parse_requirements/parse_requirements_tests.bzl b/tests/pypi/parse_requirements/parse_requirements_tests.bzl
index 0d03e94467..bea8ac5f78 100644
--- a/tests/pypi/parse_requirements/parse_requirements_tests.bzl
+++ b/tests/pypi/parse_requirements/parse_requirements_tests.bzl
@@ -143,7 +143,7 @@ def _test_simple(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -174,7 +174,7 @@ def _test_direct_urls_integration(env):
sha256 = "",
target_platforms = ["osx_x86_64"],
url = "https://github.com/org/foo/downloads/foo-1.1.tar.gz",
- yanked = False,
+ yanked = None,
),
struct(
distribution = "foo",
@@ -184,7 +184,7 @@ def _test_direct_urls_integration(env):
sha256 = "",
target_platforms = ["linux_x86_64"],
url = "https://some-url/package.whl",
- yanked = False,
+ yanked = None,
),
],
),
@@ -216,7 +216,7 @@ def _test_direct_urls_no_extract(env):
sha256 = "",
target_platforms = ["osx_x86_64"],
url = "",
- yanked = False,
+ yanked = None,
),
struct(
distribution = "foo",
@@ -226,7 +226,7 @@ def _test_direct_urls_no_extract(env):
sha256 = "",
target_platforms = ["linux_x86_64"],
url = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -258,7 +258,7 @@ def _test_extra_pip_args(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -287,7 +287,7 @@ def _test_dupe_requirements(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -318,7 +318,7 @@ def _test_multi_os(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -336,7 +336,7 @@ def _test_multi_os(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
struct(
distribution = "foo",
@@ -346,7 +346,7 @@ def _test_multi_os(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -383,7 +383,7 @@ def _test_multi_os_legacy(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -401,7 +401,7 @@ def _test_multi_os_legacy(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
struct(
distribution = "foo",
@@ -411,7 +411,7 @@ def _test_multi_os_legacy(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -464,7 +464,7 @@ def _test_env_marker_resolution(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -482,7 +482,7 @@ def _test_env_marker_resolution(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -512,7 +512,7 @@ def _test_different_package_version(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
struct(
distribution = "foo",
@@ -522,7 +522,7 @@ def _test_different_package_version(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -552,7 +552,7 @@ def _test_different_package_extras(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
struct(
distribution = "foo",
@@ -562,7 +562,7 @@ def _test_different_package_extras(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -591,7 +591,7 @@ def _test_optional_hash(env):
url = "https://example.org/bar-0.0.4.whl",
filename = "bar-0.0.4.whl",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -609,7 +609,7 @@ def _test_optional_hash(env):
url = "https://example.org/foo-0.0.5.whl",
filename = "foo-0.0.5.whl",
sha256 = "deadbeef",
- yanked = False,
+ yanked = None,
),
],
),
@@ -638,7 +638,7 @@ def _test_git_sources(env):
url = "",
filename = "",
sha256 = "",
- yanked = False,
+ yanked = None,
),
],
),
@@ -680,7 +680,7 @@ def _test_overlapping_shas_with_index_results(env):
url = "sdist",
sha256 = "5d15t",
filename = "foo-0.0.1.tar.gz",
- yanked = False,
+ yanked = None,
),
},
whls = {
@@ -688,13 +688,13 @@ def _test_overlapping_shas_with_index_results(env):
url = "super2",
sha256 = "deadb11f",
filename = "foo-0.0.1-py3-none-macosx_14_0_x86_64.whl",
- yanked = False,
+ yanked = None,
),
"deadbaaf": struct(
url = "super2",
sha256 = "deadbaaf",
filename = "foo-0.0.1-py3-none-any.whl",
- yanked = False,
+ yanked = None,
),
},
),
@@ -716,7 +716,7 @@ def _test_overlapping_shas_with_index_results(env):
sha256 = "deadbaaf",
target_platforms = ["cp39_linux_x86_64"],
url = "super2",
- yanked = False,
+ yanked = None,
),
struct(
distribution = "foo",
@@ -726,7 +726,7 @@ def _test_overlapping_shas_with_index_results(env):
sha256 = "deadb11f",
target_platforms = ["cp39_osx_x86_64"],
url = "super2",
- yanked = False,
+ yanked = None,
),
],
),
@@ -771,13 +771,13 @@ def _test_get_index_urls_different_versions(env):
url = "super2",
sha256 = "deadb11f",
filename = "foo-0.0.2-py3-none-any.whl",
- yanked = False,
+ yanked = None,
),
"deadbaaf": struct(
url = "super2",
sha256 = "deadbaaf",
filename = "foo-0.0.1-py3-none-any.whl",
- yanked = False,
+ yanked = None,
),
},
),
@@ -810,7 +810,7 @@ def _test_get_index_urls_different_versions(env):
sha256 = "",
target_platforms = ["cp39_linux_x86_64"],
url = "",
- yanked = False,
+ yanked = None,
),
struct(
distribution = "foo",
@@ -820,7 +820,7 @@ def _test_get_index_urls_different_versions(env):
sha256 = "deadb11f",
target_platforms = ["cp310_linux_x86_64"],
url = "super2",
- yanked = False,
+ yanked = None,
),
],
),
@@ -855,7 +855,7 @@ def _test_get_index_urls_single_py_version(env):
url = "super2",
sha256 = "deadb11f",
filename = "foo-0.0.2-py3-none-any.whl",
- yanked = False,
+ yanked = None,
),
},
),
@@ -885,7 +885,7 @@ def _test_get_index_urls_single_py_version(env):
sha256 = "deadb11f",
target_platforms = ["cp310_linux_x86_64"],
url = "super2",
- yanked = False,
+ yanked = None,
),
],
),
diff --git a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
index f33ba05c91..f72d61371c 100644
--- a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
+++ b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl
@@ -57,7 +57,7 @@ def _test_sdist(env):
filename = "foo-0.0.1.tar.gz",
sha256 = "deadbeefasource",
url = "https://example.org/full-url/foo-0.0.1.tar.gz",
- yanked = False,
+ yanked = None,
version = "0.0.1",
),
),
@@ -65,7 +65,25 @@ def _test_sdist(env):
struct(
attrs = [
'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"',
- 'data-requires-python=">=3.7"',
+ 'data-requires-python=">=3.7"',
+ "data-yanked",
+ ],
+ filename = "foo-0.0.1.tar.gz",
+ ),
+ struct(
+ filename = "foo-0.0.1.tar.gz",
+ sha256 = "deadbeefasource",
+ url = "https://example.org/full-url/foo-0.0.1.tar.gz",
+ version = "0.0.1",
+ yanked = "",
+ ),
+ ),
+ (
+ struct(
+ attrs = [
+ 'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"',
+ 'data-requires-python=">=3.7"',
+ "data-yanked=\"Something
with "quotes"
over two lines\"",
],
filename = "foo-0.0.1.tar.gz",
),
@@ -74,7 +92,25 @@ def _test_sdist(env):
sha256 = "deadbeefasource",
url = "https://example.org/full-url/foo-0.0.1.tar.gz",
version = "0.0.1",
- yanked = False,
+ # NOTE @aignas 2026-03-09: we preserve the white space
+ yanked = "Something \nwith \"quotes\"\nover two lines",
+ ),
+ ),
+ (
+ struct(
+ attrs = [
+ 'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"',
+ 'data-requires-python=">=3.7"',
+ 'data-yanked=""',
+ ],
+ filename = "foo-0.0.1.tar.gz",
+ ),
+ struct(
+ filename = "foo-0.0.1.tar.gz",
+ sha256 = "deadbeefasource",
+ url = "https://example.org/full-url/foo-0.0.1.tar.gz",
+ version = "0.0.1",
+ yanked = "",
),
),
]
@@ -94,7 +130,7 @@ def _test_sdist(env):
filename = subjects.str,
sha256 = subjects.str,
url = subjects.str,
- yanked = subjects.bool,
+ yanked = subjects.str,
version = subjects.str,
),
)
@@ -126,14 +162,14 @@ def _test_whls(env):
sha256 = "deadbeef",
url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
version = "0.0.2",
- yanked = False,
+ yanked = None,
),
),
(
struct(
attrs = [
'href="https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=deadbeef"',
- 'data-requires-python=">=3.7"',
+ 'data-requires-python=">=3.7"',
'data-dist-info-metadata="sha256=deadb00f"',
'data-core-metadata="sha256=deadb00f"',
],
@@ -146,7 +182,7 @@ def _test_whls(env):
sha256 = "deadbeef",
url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
version = "0.0.2",
- yanked = False,
+ yanked = None,
),
),
(
@@ -165,7 +201,7 @@ def _test_whls(env):
sha256 = "deadbeef",
version = "0.0.2",
url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
- yanked = False,
+ yanked = None,
),
),
(
@@ -184,7 +220,7 @@ def _test_whls(env):
sha256 = "deadbeef",
version = "0.0.2",
url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
- yanked = False,
+ yanked = None,
),
),
(
@@ -202,7 +238,7 @@ def _test_whls(env):
sha256 = "deadbeef",
url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
version = "0.0.2",
- yanked = False,
+ yanked = None,
),
),
]
@@ -223,7 +259,7 @@ def _test_whls(env):
metadata_url = subjects.str,
sha256 = subjects.str,
url = subjects.str,
- yanked = subjects.bool,
+ yanked = subjects.str,
version = subjects.str,
),
)