diff --git a/python/private/pypi/parse_requirements.bzl b/python/private/pypi/parse_requirements.bzl index acc35b3208..78b6662d08 100644 --- a/python/private/pypi/parse_requirements.bzl +++ b/python/private/pypi/parse_requirements.bzl @@ -267,7 +267,7 @@ def _package_srcs( url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ) req_line = r.srcs.requirement_line else: @@ -379,7 +379,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None): url = requirement.srcs.url, filename = requirement.srcs.filename, sha256 = requirement.srcs.shas[0] if requirement.srcs.shas else "", - yanked = False, + yanked = None, ) return dist, False @@ -403,12 +403,12 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None): # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api maybe_whl = index_urls.whls.get(sha256) - if maybe_whl and not maybe_whl.yanked: + if maybe_whl and maybe_whl.yanked == None: whls.append(maybe_whl) continue maybe_sdist = index_urls.sdists.get(sha256) - if maybe_sdist and not maybe_sdist.yanked: + if maybe_sdist and maybe_sdist.yanked == None: sdist = maybe_sdist continue @@ -416,7 +416,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None): yanked = {} for dist in whls + [sdist]: - if dist and dist.yanked: + if dist and dist.yanked != None: yanked.setdefault(dist.yanked, []).append(dist.filename) if yanked: logger.warn(lambda: "\n".join([ diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl index 6778d3da16..563130791e 100644 --- a/python/private/pypi/parse_simpleapi_html.bzl +++ b/python/private/pypi/parse_simpleapi_html.bzl @@ -26,81 +26,177 @@ def parse_simpleapi_html(*, content): Returns: A list of structs with: - * filename: The filename of the artifact. - * version: The version of the artifact. - * url: The URL to download the artifact. - * sha256: The sha256 of the artifact. - * metadata_sha256: The whl METADATA sha256 if we can download it. If this is - present, then the 'metadata_url' is also present. Defaults to "". - * metadata_url: The URL for the METADATA if we can download it. Defaults to "". + * filename: {type}`str` The filename of the artifact. + * version: {type}`str` The version of the artifact. + * url: {type}`str` The URL to download the artifact. + * sha256: {type}`str` The sha256 of the artifact. + * metadata_sha256: {type}`str` The whl METADATA sha256 if we can download it. If this is + present, then the 'metadata_url' is also present. Defaults to "". + * metadata_url: {type}`str` The URL for the METADATA if we can download it. Defaults to "". + * yanked: {type}`str | None` the yank reason if the package is yanked. If it is not yanked, + then it will be `None`. An empty string yank reason means that the package is yanked but + the reason is not provided. """ sdists = {} whls = {} - lines = content.split("= (2, 0): # We don't expect to have version 2.0 here, but have this check in place just in case. # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api fail("Unsupported API version: {}".format(api_version)) - # Each line follows the following pattern - # filename
- sha256s_by_version = {} - for line in lines[1:]: - dist_url, _, tail = line.partition("#sha256=") + # 2. Iterate using find() to avoid huge list allocations from .split(" + tag_end = content.find(">", start_tag) + end_tag = content.find("", tag_end) + if tag_end == -1 or end_tag == -1: + break + + # Extract only the necessary slices + attr_part = content[start_tag + 3:tag_end] + filename = content[tag_end + 1:end_tag].strip() + + # Update cursor for next iteration + cursor = end_tag + 4 + + # 3. Efficient Attribute Parsing + attrs = _parse_attrs(attr_part) + href = attrs.get("href", "") + if not href: + continue - sha256, _, tail = tail.partition("\"") + dist_url, _, sha256 = href.partition("#sha256=") - # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api - yanked = "data-yanked" in line + # Handle Yanked status + yanked = None + if "data-yanked" in attrs: + yanked = _unescape_pypi_html(attrs["data-yanked"]) - head, _, _ = tail.rpartition("") - maybe_metadata, _, filename = head.rpartition(">") version = version_from_filename(filename) sha256s_by_version.setdefault(version, []).append(sha256) + # 4. Optimized Metadata Check (PEP 714) metadata_sha256 = "" metadata_url = "" - for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]: - metadata_marker = metadata_marker + "=\"sha256=" - if metadata_marker in maybe_metadata: - # Implement https://peps.python.org/pep-0714/ - _, _, tail = maybe_metadata.partition(metadata_marker) - metadata_sha256, _, _ = tail.partition("\"") - metadata_url = dist_url + ".metadata" - break + + # Dist-info is more common in modern PyPI + m_val = attrs.get("data-dist-info-metadata") or attrs.get("data-core-metadata") + if m_val and m_val != "false": + _, _, metadata_sha256 = m_val.partition("sha256=") + metadata_url = dist_url + ".metadata" + + # 5. Result object + dist = struct( + filename = filename, + version = version, + url = dist_url, + sha256 = sha256, + metadata_sha256 = metadata_sha256, + metadata_url = metadata_url, + yanked = yanked, + ) if filename.endswith(".whl"): - whls[sha256] = struct( - filename = filename, - version = version, - url = dist_url, - sha256 = sha256, - metadata_sha256 = metadata_sha256, - metadata_url = metadata_url, - yanked = yanked, - ) + whls[sha256] = dist else: - sdists[sha256] = struct( - filename = filename, - version = version, - url = dist_url, - sha256 = sha256, - metadata_sha256 = "", - metadata_url = "", - yanked = yanked, - ) + sdists[sha256] = dist return struct( sdists = sdists, whls = whls, sha256s_by_version = sha256s_by_version, ) + +def _parse_attrs(attr_string): + """Parses attributes from a pre-sliced string.""" + attrs = {} + parts = attr_string.split('"') + + for i in range(0, len(parts) - 1, 2): + raw_key = parts[i].strip() + if not raw_key: + continue + + key_parts = raw_key.split(" ") + current_key = key_parts[-1].rstrip("=") + + # Batch handle booleans + for j in range(len(key_parts) - 1): + b = key_parts[j].strip() + if b: + attrs[b] = "" + + attrs[current_key] = parts[i + 1] + + # Final trailing boolean check + last = parts[-1].strip() + if last: + for b in last.split(" "): + if b: + attrs[b] = "" + return attrs + +def _unescape_pypi_html(text): + """Unescape HTML text. + + Decodes standard HTML entities used in the Simple API. + Specifically targets characters used in URLs and attribute values. + + Args: + text: {type}`str` The text to replace. + + Returns: + A string with unescaped characters + """ + + # 1. Short circuit for the most common case + if not text or "&" not in text: + return text + + # 2. Check for the most frequent PEP 503 entities first (version constraints). + # Re-ordering based on frequency reduces unnecessary checks for rare entities. + if ">" in text: + text = text.replace(">", ">") + if "<" in text: + text = text.replace("<", "<") + + # 3. Grouped check for numeric entities. + # If '&#' isn't there, we skip 4 distinct string scans. + if "&#" in text: + if "'" in text: + text = text.replace("'", "'") + if "'" in text: + text = text.replace("'", "'") + if " " in text: + text = text.replace(" ", "\n") + if " " in text: + text = text.replace(" ", "\r") + + if """ in text: + text = text.replace(""", '"') + + # 4. Handle ampersands last to prevent double-decoding. + if "&" in text: + text = text.replace("&", "&") + + return text diff --git a/tests/pypi/hub_builder/hub_builder_tests.bzl b/tests/pypi/hub_builder/hub_builder_tests.bzl index 27040d36d7..170e12c4e4 100644 --- a/tests/pypi/hub_builder/hub_builder_tests.bzl +++ b/tests/pypi/hub_builder/hub_builder_tests.bzl @@ -777,7 +777,7 @@ def _test_simple_get_index(env): "plat_pkg": struct( whls = { "deadb44f": struct( - yanked = False, + yanked = None, filename = "plat-pkg-0.0.4-py3-none-linux_x86_64.whl", sha256 = "deadb44f", url = "example2.org/index/plat_pkg/", @@ -792,7 +792,7 @@ def _test_simple_get_index(env): "simple": struct( whls = { "deadb00f": struct( - yanked = False, + yanked = None, filename = "simple-0.0.1-py3-none-any.whl", sha256 = "deadb00f", url = "example2.org", @@ -800,7 +800,7 @@ def _test_simple_get_index(env): }, sdists = { "deadbeef": struct( - yanked = False, + yanked = None, filename = "simple-0.0.1.tar.gz", sha256 = "deadbeef", url = "example.org", @@ -811,7 +811,7 @@ def _test_simple_get_index(env): "some_other_pkg": struct( whls = { "deadb33f": struct( - yanked = False, + yanked = None, filename = "some-other-pkg-0.0.1-py3-none-any.whl", sha256 = "deadb33f", url = "example2.org/index/some_other_pkg/", diff --git a/tests/pypi/parse_requirements/parse_requirements_tests.bzl b/tests/pypi/parse_requirements/parse_requirements_tests.bzl index 0d03e94467..bea8ac5f78 100644 --- a/tests/pypi/parse_requirements/parse_requirements_tests.bzl +++ b/tests/pypi/parse_requirements/parse_requirements_tests.bzl @@ -143,7 +143,7 @@ def _test_simple(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -174,7 +174,7 @@ def _test_direct_urls_integration(env): sha256 = "", target_platforms = ["osx_x86_64"], url = "https://github.com/org/foo/downloads/foo-1.1.tar.gz", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -184,7 +184,7 @@ def _test_direct_urls_integration(env): sha256 = "", target_platforms = ["linux_x86_64"], url = "https://some-url/package.whl", - yanked = False, + yanked = None, ), ], ), @@ -216,7 +216,7 @@ def _test_direct_urls_no_extract(env): sha256 = "", target_platforms = ["osx_x86_64"], url = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -226,7 +226,7 @@ def _test_direct_urls_no_extract(env): sha256 = "", target_platforms = ["linux_x86_64"], url = "", - yanked = False, + yanked = None, ), ], ), @@ -258,7 +258,7 @@ def _test_extra_pip_args(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -287,7 +287,7 @@ def _test_dupe_requirements(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -318,7 +318,7 @@ def _test_multi_os(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -336,7 +336,7 @@ def _test_multi_os(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -346,7 +346,7 @@ def _test_multi_os(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -383,7 +383,7 @@ def _test_multi_os_legacy(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -401,7 +401,7 @@ def _test_multi_os_legacy(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -411,7 +411,7 @@ def _test_multi_os_legacy(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -464,7 +464,7 @@ def _test_env_marker_resolution(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -482,7 +482,7 @@ def _test_env_marker_resolution(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -512,7 +512,7 @@ def _test_different_package_version(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -522,7 +522,7 @@ def _test_different_package_version(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -552,7 +552,7 @@ def _test_different_package_extras(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -562,7 +562,7 @@ def _test_different_package_extras(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -591,7 +591,7 @@ def _test_optional_hash(env): url = "https://example.org/bar-0.0.4.whl", filename = "bar-0.0.4.whl", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -609,7 +609,7 @@ def _test_optional_hash(env): url = "https://example.org/foo-0.0.5.whl", filename = "foo-0.0.5.whl", sha256 = "deadbeef", - yanked = False, + yanked = None, ), ], ), @@ -638,7 +638,7 @@ def _test_git_sources(env): url = "", filename = "", sha256 = "", - yanked = False, + yanked = None, ), ], ), @@ -680,7 +680,7 @@ def _test_overlapping_shas_with_index_results(env): url = "sdist", sha256 = "5d15t", filename = "foo-0.0.1.tar.gz", - yanked = False, + yanked = None, ), }, whls = { @@ -688,13 +688,13 @@ def _test_overlapping_shas_with_index_results(env): url = "super2", sha256 = "deadb11f", filename = "foo-0.0.1-py3-none-macosx_14_0_x86_64.whl", - yanked = False, + yanked = None, ), "deadbaaf": struct( url = "super2", sha256 = "deadbaaf", filename = "foo-0.0.1-py3-none-any.whl", - yanked = False, + yanked = None, ), }, ), @@ -716,7 +716,7 @@ def _test_overlapping_shas_with_index_results(env): sha256 = "deadbaaf", target_platforms = ["cp39_linux_x86_64"], url = "super2", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -726,7 +726,7 @@ def _test_overlapping_shas_with_index_results(env): sha256 = "deadb11f", target_platforms = ["cp39_osx_x86_64"], url = "super2", - yanked = False, + yanked = None, ), ], ), @@ -771,13 +771,13 @@ def _test_get_index_urls_different_versions(env): url = "super2", sha256 = "deadb11f", filename = "foo-0.0.2-py3-none-any.whl", - yanked = False, + yanked = None, ), "deadbaaf": struct( url = "super2", sha256 = "deadbaaf", filename = "foo-0.0.1-py3-none-any.whl", - yanked = False, + yanked = None, ), }, ), @@ -810,7 +810,7 @@ def _test_get_index_urls_different_versions(env): sha256 = "", target_platforms = ["cp39_linux_x86_64"], url = "", - yanked = False, + yanked = None, ), struct( distribution = "foo", @@ -820,7 +820,7 @@ def _test_get_index_urls_different_versions(env): sha256 = "deadb11f", target_platforms = ["cp310_linux_x86_64"], url = "super2", - yanked = False, + yanked = None, ), ], ), @@ -855,7 +855,7 @@ def _test_get_index_urls_single_py_version(env): url = "super2", sha256 = "deadb11f", filename = "foo-0.0.2-py3-none-any.whl", - yanked = False, + yanked = None, ), }, ), @@ -885,7 +885,7 @@ def _test_get_index_urls_single_py_version(env): sha256 = "deadb11f", target_platforms = ["cp310_linux_x86_64"], url = "super2", - yanked = False, + yanked = None, ), ], ), diff --git a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl index f33ba05c91..f72d61371c 100644 --- a/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl +++ b/tests/pypi/parse_simpleapi_html/parse_simpleapi_html_tests.bzl @@ -57,7 +57,7 @@ def _test_sdist(env): filename = "foo-0.0.1.tar.gz", sha256 = "deadbeefasource", url = "https://example.org/full-url/foo-0.0.1.tar.gz", - yanked = False, + yanked = None, version = "0.0.1", ), ), @@ -65,7 +65,25 @@ def _test_sdist(env): struct( attrs = [ 'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"', - 'data-requires-python=">=3.7"', + 'data-requires-python=">=3.7"', + "data-yanked", + ], + filename = "foo-0.0.1.tar.gz", + ), + struct( + filename = "foo-0.0.1.tar.gz", + sha256 = "deadbeefasource", + url = "https://example.org/full-url/foo-0.0.1.tar.gz", + version = "0.0.1", + yanked = "", + ), + ), + ( + struct( + attrs = [ + 'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"', + 'data-requires-python=">=3.7"', + "data-yanked=\"Something with "quotes" over two lines\"", ], filename = "foo-0.0.1.tar.gz", ), @@ -74,7 +92,25 @@ def _test_sdist(env): sha256 = "deadbeefasource", url = "https://example.org/full-url/foo-0.0.1.tar.gz", version = "0.0.1", - yanked = False, + # NOTE @aignas 2026-03-09: we preserve the white space + yanked = "Something \nwith \"quotes\"\nover two lines", + ), + ), + ( + struct( + attrs = [ + 'href="https://example.org/full-url/foo-0.0.1.tar.gz#sha256=deadbeefasource"', + 'data-requires-python=">=3.7"', + 'data-yanked=""', + ], + filename = "foo-0.0.1.tar.gz", + ), + struct( + filename = "foo-0.0.1.tar.gz", + sha256 = "deadbeefasource", + url = "https://example.org/full-url/foo-0.0.1.tar.gz", + version = "0.0.1", + yanked = "", ), ), ] @@ -94,7 +130,7 @@ def _test_sdist(env): filename = subjects.str, sha256 = subjects.str, url = subjects.str, - yanked = subjects.bool, + yanked = subjects.str, version = subjects.str, ), ) @@ -126,14 +162,14 @@ def _test_whls(env): sha256 = "deadbeef", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", version = "0.0.2", - yanked = False, + yanked = None, ), ), ( struct( attrs = [ 'href="https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl#sha256=deadbeef"', - 'data-requires-python=">=3.7"', + 'data-requires-python=">=3.7"', 'data-dist-info-metadata="sha256=deadb00f"', 'data-core-metadata="sha256=deadb00f"', ], @@ -146,7 +182,7 @@ def _test_whls(env): sha256 = "deadbeef", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", version = "0.0.2", - yanked = False, + yanked = None, ), ), ( @@ -165,7 +201,7 @@ def _test_whls(env): sha256 = "deadbeef", version = "0.0.2", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - yanked = False, + yanked = None, ), ), ( @@ -184,7 +220,7 @@ def _test_whls(env): sha256 = "deadbeef", version = "0.0.2", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", - yanked = False, + yanked = None, ), ), ( @@ -202,7 +238,7 @@ def _test_whls(env): sha256 = "deadbeef", url = "https://example.org/full-url/foo-0.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", version = "0.0.2", - yanked = False, + yanked = None, ), ), ] @@ -223,7 +259,7 @@ def _test_whls(env): metadata_url = subjects.str, sha256 = subjects.str, url = subjects.str, - yanked = subjects.bool, + yanked = subjects.str, version = subjects.str, ), )