From 1c73fed0f8f8775bd3645c8243b4d6ac5526b3d2 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:50:35 +0900 Subject: [PATCH] fix(pypi): return yank reason from SimpleAPI HTML This makes the logic in the parser a little bit more sophisticated, but we also start handling the yank reason. This fixes the issue where the `data-yank` presence but no value would be interpreted as a yanked package. With this it should start working. This implementation assumes that we have HTML escaped sequences as tag values. It also unescapes them when returning the strings. The posibilities that it gives us are: - Use the `data-requires-python` to potentially discard any Python packages that are unsupported in the `select_whl` function. Work towards #260. Work towards #2731. --- python/private/pypi/parse_requirements.bzl | 10 +- python/private/pypi/parse_simpleapi_html.bzl | 198 +++++++++++++----- tests/pypi/hub_builder/hub_builder_tests.bzl | 8 +- .../parse_requirements_tests.bzl | 66 +++--- .../parse_simpleapi_html_tests.bzl | 58 ++++- 5 files changed, 236 insertions(+), 104 deletions(-) diff --git a/python/private/pypi/parse_requirements.bzl b/python/private/pypi/parse_requirements.bzl index acc35b3208..78b6662d08 100644 --- a/python/private/pypi/parse_requirements.bzl +++ b/python/private/pypi/parse_requirements.bzl @@ -267,7 +267,7 @@ def _package_srcs( url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ) req_line = r.srcs.requirement_line else: @@ -379,7 +379,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None): url = requirement.srcs.url, filename = requirement.srcs.filename, sha256 = requirement.srcs.shas[0] if requirement.srcs.shas else "", - yanked = False, + yanked = None, ) return dist, False @@ -403,12 +403,12 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None): # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api maybe_whl = index_urls.whls.get(sha256) - if maybe_whl and not maybe_whl.yanked: + if maybe_whl and maybe_whl.yanked == None: whls.append(maybe_whl) continue maybe_sdist = index_urls.sdists.get(sha256) - if maybe_sdist and not maybe_sdist.yanked: + if maybe_sdist and maybe_sdist.yanked == None: sdist = maybe_sdist continue @@ -416,7 +416,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None): yanked = {} for dist in whls + [sdist]: - if dist and dist.yanked: + if dist and dist.yanked != None: yanked.setdefault(dist.yanked, []).append(dist.filename) if yanked: logger.warn(lambda: "\n".join([ diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl index 6778d3da16..563130791e 100644 --- a/python/private/pypi/parse_simpleapi_html.bzl +++ b/python/private/pypi/parse_simpleapi_html.bzl @@ -26,81 +26,177 @@ def parse_simpleapi_html(*, content): Returns: A list of structs with: - * filename: The filename of the artifact. - * version: The version of the artifact. - * url: The URL to download the artifact. - * sha256: The sha256 of the artifact. - * metadata_sha256: The whl METADATA sha256 if we can download it. If this is - present, then the 'metadata_url' is also present. Defaults to "". - * metadata_url: The URL for the METADATA if we can download it. Defaults to "". + * filename: {type}`str` The filename of the artifact. + * version: {type}`str` The version of the artifact. + * url: {type}`str` The URL to download the artifact. + * sha256: {type}`str` The sha256 of the artifact. + * metadata_sha256: {type}`str` The whl METADATA sha256 if we can download it. If this is + present, then the 'metadata_url' is also present. Defaults to "". + * metadata_url: {type}`str` The URL for the METADATA if we can download it. Defaults to "". + * yanked: {type}`str | None` the yank reason if the package is yanked. If it is not yanked, + then it will be `None`. An empty string yank reason means that the package is yanked but + the reason is not provided. """ sdists = {} whls = {} - lines = content.split("= (2, 0): # We don't expect to have version 2.0 here, but have this check in place just in case. # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api fail("Unsupported API version: {}".format(api_version)) - # Each line follows the following pattern - # filename
- sha256s_by_version = {} - for line in lines[1:]: - dist_url, _, tail = line.partition("#sha256=") + # 2. Iterate using find() to avoid huge list allocations from .split(" + tag_end = content.find(">", start_tag) + end_tag = content.find("", tag_end) + if tag_end == -1 or end_tag == -1: + break + + # Extract only the necessary slices + attr_part = content[start_tag + 3:tag_end] + filename = content[tag_end + 1:end_tag].strip() + + # Update cursor for next iteration + cursor = end_tag + 4 + + # 3. Efficient Attribute Parsing + attrs = _parse_attrs(attr_part) + href = attrs.get("href", "") + if not href: + continue - sha256, _, tail = tail.partition("\"") + dist_url, _, sha256 = href.partition("#sha256=") - # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api - yanked = "data-yanked" in line + # Handle Yanked status + yanked = None + if "data-yanked" in attrs: + yanked = _unescape_pypi_html(attrs["data-yanked"]) - head, _, _ = tail.rpartition("") - maybe_metadata, _, filename = head.rpartition(">") version = version_from_filename(filename) sha256s_by_version.setdefault(version, []).append(sha256) + # 4. Optimized Metadata Check (PEP 714) metadata_sha256 = "" metadata_url = "" - for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]: - metadata_marker = metadata_marker + "=\"sha256=" - if metadata_marker in maybe_metadata: - # Implement https://peps.python.org/pep-0714/ - _, _, tail = maybe_metadata.partition(metadata_marker) - metadata_sha256, _, _ = tail.partition("\"") - metadata_url = dist_url + ".metadata" - break + + # Dist-info is more common in modern PyPI + m_val = attrs.get("data-dist-info-metadata") or attrs.get("data-core-metadata") + if m_val and m_val != "false": + _, _, metadata_sha256 = m_val.partition("sha256=") + metadata_url = dist_url + ".metadata" + + # 5. Result object + dist = struct( + filename = filename, + version = version, + url = dist_url, + sha256 = sha256, + metadata_sha256 = metadata_sha256, + metadata_url = metadata_url, + yanked = yanked, + ) if filename.endswith(".whl"): - whls[sha256] = struct( - filename = filename, - version = version, - url = dist_url, - sha256 = sha256, - metadata_sha256 = metadata_sha256, - metadata_url = metadata_url, - yanked = yanked, - ) + whls[sha256] = dist else: - sdists[sha256] = struct( - filename = filename, - version = version, - url = dist_url, - sha256 = sha256, - metadata_sha256 = "", - metadata_url = "", - yanked = yanked, - ) + sdists[sha256] = dist return struct( sdists = sdists, whls = whls, sha256s_by_version = sha256s_by_version, ) + +def _parse_attrs(attr_string): + """Parses attributes from a pre-sliced string.""" + attrs = {} + parts = attr_string.split('"') + + for i in range(0, len(parts) - 1, 2): + raw_key = parts[i].strip() + if not raw_key: + continue + + key_parts = raw_key.split(" ") + current_key = key_parts[-1].rstrip("=") + + # Batch handle booleans + for j in range(len(key_parts) - 1): + b = key_parts[j].strip() + if b: + attrs[b] = "" + + attrs[current_key] = parts[i + 1] + + # Final trailing boolean check + last = parts[-1].strip() + if last: + for b in last.split(" "): + if b: + attrs[b] = "" + return attrs + +def _unescape_pypi_html(text): + """Unescape HTML text. + + Decodes standard HTML entities used in the Simple API. + Specifically targets characters used in URLs and attribute values. + + Args: + text: {type}`str` The text to replace. + + Returns: + A string with unescaped characters + """ + + # 1. Short circuit for the most common case + if not text or "&" not in text: + return text + + # 2. Check for the most frequent PEP 503 entities first (version constraints). + # Re-ordering based on frequency reduces unnecessary checks for rare entities. + if ">" in text: + text = text.replace(">", ">") + if "<" in text: + text = text.replace("<", "<") + + # 3. Grouped check for numeric entities. + # If '&#' isn't there, we skip 4 distinct string scans. + if "&#" in text: + if "'" in text: + text = text.replace("'", "'") + if "'" in text: + text = text.replace("'", "'") + if " " in text: + text = text.replace(" ", "\n") + if " " in text: + text = text.replace(" ", "\r") + + if """ in text: + text = text.replace(""", '"') + + # 4. Handle ampersands last to prevent double-decoding. + if "&" in text: + text = text.replace("&", "&") + + return text diff --git a/tests/pypi/hub_builder/hub_builder_tests.bzl b/tests/pypi/hub_builder/hub_builder_tests.bzl index 27040d36d7..170e12c4e4 100644 --- a/tests/pypi/hub_builder/hub_builder_tests.bzl +++ b/tests/pypi/hub_builder/hub_builder_tests.bzl @@ -777,7 +777,7 @@ def _test_simple_get_index(env): "plat_pkg": struct( whls = { "deadb44f": struct( - yanked = False, + yanked = None, filename = "plat-pkg-0.0.4-py3-none-linux_x86_64.whl", sha256 = "deadb44f", url = "example2.org/index/plat_pkg/", @@ -792,7 +792,7 @@ def _test_simple_get_index(env): "simple": struct( whls = { "deadb00f": struct( - yanked = False, + yanked = None, filename = "simple-0.0.1-py3-none-any.whl", sha256 = "deadb00f", url = "example2.org", @@ -800,7 +800,7 @@ def _test_simple_get_index(env): }, sdists = { "deadbeef": struct( - yanked = False, + yanked = None, filename = "simple-0.0.1.tar.gz", sha256 = "deadbeef", url = "example.org", @@ -811,7 +811,7 @@ def _test_simple_get_index(env): "some_other_pkg": struct( whls = { "deadb33f": struct( - yanked = False, + yanked = None, filename = "some-other-pkg-0.0.1-py3-none-any.whl", sha256 = "deadb33f", url = "example2.org/index/some_other_pkg/", diff --git a/tests/pypi/parse_requirements/parse_requirements_tests.bzl b/tests/pypi/parse_requirements/parse_requirements_tests.bzl index 0d03e94467..bea8ac5f78 100644 --- a/tests/pypi/parse_requirements/parse_requirements_tests.bzl +++ b/tests/pypi/parse_requirements/parse_requirements_tests.bzl @@ -143,7 +143,7 @@ def _test_simple(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -174,7 +174,7 @@ def _test_direct_urls_integration(env): sha256 = "", target_platforms = ["osx_x86_64"], url = "https://github.com/org/foo/downloads/foo-1.1.tar.gz", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -184,7 +184,7 @@ def _test_direct_urls_integration(env): sha256 = "", target_platforms = ["linux_x86_64"], url = "https://some-url/package.whl", - yanked = False, + yanked = None, ), ], ), @@ -216,7 +216,7 @@ def _test_direct_urls_no_extract(env): sha256 = "", target_platforms = ["osx_x86_64"], url = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -226,7 +226,7 @@ def _test_direct_urls_no_extract(env): sha256 = "", target_platforms = ["linux_x86_64"], url = "", - yanked = False, + yanked = None, ), ], ), @@ -258,7 +258,7 @@ def _test_extra_pip_args(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -287,7 +287,7 @@ def _test_dupe_requirements(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -318,7 +318,7 @@ def _test_multi_os(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -336,7 +336,7 @@ def _test_multi_os(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -346,7 +346,7 @@ def _test_multi_os(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -383,7 +383,7 @@ def _test_multi_os_legacy(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -401,7 +401,7 @@ def _test_multi_os_legacy(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -411,7 +411,7 @@ def _test_multi_os_legacy(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -464,7 +464,7 @@ def _test_env_marker_resolution(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -482,7 +482,7 @@ def _test_env_marker_resolution(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -512,7 +512,7 @@ def _test_different_package_version(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -522,7 +522,7 @@ def _test_different_package_version(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -552,7 +552,7 @@ def _test_different_package_extras(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -562,7 +562,7 @@ def _test_different_package_extras(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -591,7 +591,7 @@ def _test_optional_hash(env): url = "https://example.org/bar-0.0.4.whl", filename = "bar-0.0.4.whl", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -609,7 +609,7 @@ def _test_optional_hash(env): url = "https://example.org/foo-0.0.5.whl", filename = "foo-0.0.5.whl", sha256 = "deadbeef", - yanked = False, + yanked = None, ), ], ), @@ -638,7 +638,7 @@ def _test_git_sources(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -680,7 +680,7 @@ def _test_overlapping_shas_with_index_results(env): url = "sdist", sha256 = "5d15t", filename = "foo-0.0.1.tar.gz", - yanked = False, + yanked = None, ), }, whls = { @@ -688,13 +688,13 @@ def _test_overlapping_shas_with_index_results(env): url = "super2", sha256 = "deadb11f", filename = "foo-0.0.1-py3-none-macosx_14_0_x86_64.whl", - yanked = False, + yanked = None, ), "deadbaaf": struct( url = "super2", sha256 = "deadbaaf", filename = "foo-0.0.1-py3-none-any.whl", - yanked = False, + yanked = None, ), }, ), @@ -716,7 +716,7 @@ def _test_overlapping_shas_with_index_results(env): sha256 = "deadbaaf", target_platforms = ["cp39_linux_x86_64"], url = "super2", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -726,7 +726,7 @@ def _test_overlapping_shas_with_index_results(env): sha256 = "deadb11f", target_platforms = ["cp39_osx_x86_64"], url = "super2", - yanked = False, + yanked = None, ), ], ), @@ -771,13 +771,13 @@ def _test_get_index_urls_different_versions(env): url = "super2", sha256 = "deadb11f", filename = "foo-0.0.2-py3-none-any.whl", - yanked = False, + yanked = None, ), "deadbaaf": struct( url = "super2", sha256 = "deadbaaf", filename = "foo-0.0.1-py3-none-any.whl", - yanked = False, + yanked = None, ), }, ), @@ -810,7 +810,7 @@ def _test_get_index_urls_different_versions(env): sha256 = "", target_platforms = ["cp39_linux_x86_64"], url = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -820,7 +820,7 @@ def _test_get_index_urls_different_versions(env): sha256 = "deadb11f", target_platforms = ["cp310_linux_x86_64"], url = "super2", - yanked = False, + yanked = None, ), ], ), @@ -855,7 +855,7 @@ def _test_get_index_urls_single_py_version(env): url = "super2", sha256 = "deadb11f", filename = "foo-0.0.2-py3-none-any.whl", - yanked = False, + yanked = None, ), }, ), @@ -885,7 +885,7 @@ def _test_get_index_urls_single_py_version(env): sha256 = "deadb11f", target_platforms = ["cp310_linux_x86_64"], url = "super2", - yanked = False, + yanked = None, ), ], ), diff --git a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl index f33ba05c91..f72d61371c 100644 --- a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl +++ b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl @@ -57,7 +57,7 @@ def _test_sdist(env): filename = "foo-0.0.1.tar.gz", sha256 = "deadbeefasource", url = "https://example.org/full-url/foo-0.0.1.tar.gz", - yanked = False, + yanked = None, version = "0.0.1", ), ), @@ -65,7 +65,25 @@ def _test_sdist(env): struct( attrs = [ 'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"', - 'data-requires-python=">=3.7"', + 'data-requires-python=">=3.7"', + "data-yanked", + ], + filename = "foo-0.0.1.tar.gz", + ), + struct( + filename = "foo-0.0.1.tar.gz", + sha256 = "deadbeefasource", + url = "https://example.org/full-url/foo-0.0.1.tar.gz", + version = "0.0.1", + yanked = "", + ), + ), + ( + struct( + attrs = [ + 'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"', + 'data-requires-python=">=3.7"', + "data-yanked=\"Something with "quotes" over two lines\"", ], filename = "foo-0.0.1.tar.gz", ), @@ -74,7 +92,25 @@ def _test_sdist(env): sha256 = "deadbeefasource", url = "https://example.org/full-url/foo-0.0.1.tar.gz", version = "0.0.1", - yanked = False, + # NOTE @aignas 2026-03-09: we preserve the white space + yanked = "Something \nwith \"quotes\"\nover two lines", + ), + ), + ( + struct( + attrs = [ + 'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"', + 'data-requires-python=">=3.7"', + 'data-yanked=""', + ], + filename = "foo-0.0.1.tar.gz", + ), + struct( + filename = "foo-0.0.1.tar.gz", + sha256 = "deadbeefasource", + url = "https://example.org/full-url/foo-0.0.1.tar.gz", + version = "0.0.1", + yanked = "", ), ), ] @@ -94,7 +130,7 @@ def _test_sdist(env): filename = subjects.str, sha256 = subjects.str, url = subjects.str, - yanked = subjects.bool, + yanked = subjects.str, version = subjects.str, ), ) @@ -126,14 +162,14 @@ def _test_whls(env): sha256 = "deadbeef", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", version = "0.0.2", - yanked = False, + yanked = None, ), ), ( struct( attrs = [ 'href="https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=deadbeef"', - 'data-requires-python=">=3.7"', + 'data-requires-python=">=3.7"', 'data-dist-info-metadata="sha256=deadb00f"', 'data-core-metadata="sha256=deadb00f"', ], @@ -146,7 +182,7 @@ def _test_whls(env): sha256 = "deadbeef", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", version = "0.0.2", - yanked = False, + yanked = None, ), ), ( @@ -165,7 +201,7 @@ def _test_whls(env): sha256 = "deadbeef", version = "0.0.2", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - yanked = False, + yanked = None, ), ), ( @@ -184,7 +220,7 @@ def _test_whls(env): sha256 = "deadbeef", version = "0.0.2", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - yanked = False, + yanked = None, ), ), ( @@ -202,7 +238,7 @@ def _test_whls(env): sha256 = "deadbeef", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", version = "0.0.2", - yanked = False, + yanked = None, ), ), ] @@ -223,7 +259,7 @@ def _test_whls(env): metadata_url = subjects.str, sha256 = subjects.str, url = subjects.str, - yanked = subjects.bool, + yanked = subjects.str, version = subjects.str, ), )