From e53093a02ff1c7411bfd199d946d537a6c09d975 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Thu, 12 Feb 2026 14:29:46 +0200 Subject: [PATCH 1/9] Add support for Reference Fix Commits improver Update the pipeline and fix the test Signed-off-by: ziad hany --- vulnerabilities/importers/__init__.py | 2 + .../v2_improvers/reference_collect_commits.py | 76 +++++++++++++ .../test_ref_collect_commits_v2.py | 105 ++++++++++++++++++ 3 files changed, 183 insertions(+) create mode 100644 vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py create mode 100644 vulnerabilities/tests/pipelines/v2_improvers/test_ref_collect_commits_v2.py diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index c0cf04ed7..739890218 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -81,6 +81,7 @@ from vulnerabilities.pipelines.v2_importers import ubuntu_osv_importer as ubuntu_osv_importer_v2 from vulnerabilities.pipelines.v2_importers import vulnrichment_importer as vulnrichment_importer_v2 from vulnerabilities.pipelines.v2_importers import xen_importer as xen_importer_v2 +from vulnerabilities.pipelines.v2_improvers import reference_collect_commits from vulnerabilities.utils import create_registry IMPORTERS_REGISTRY = create_registry( @@ -125,6 +126,7 @@ nginx_importer.NginxImporterPipeline, pysec_importer.PyPIImporterPipeline, fireeye_importer_v2.FireeyeImporterPipeline, + reference_collect_commits.CollectReferencesFixCommitsPipeline, apache_tomcat.ApacheTomcatImporter, postgresql.PostgreSQLImporter, debian.DebianImporter, diff --git a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py new file mode 100644 index 000000000..80a266b5d --- /dev/null +++ b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py @@ -0,0 +1,76 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from aboutcode.pipeline import LoopProgress +from packageurl.contrib.purl2url import purl2url +from packageurl.contrib.url2purl import url2purl + +from aboutcode.federated import get_core_purl +from vulnerabilities.models import AdvisoryV2 +from vulnerabilities.models import PackageCommitPatch +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 +from vulnerabilities.pipes.advisory import VCS_URLS_SUPPORTED_TYPES +from vulnerabilities.tests.test_export import package +from vulnerabilities.utils import is_commit + + +class CollectReferencesFixCommitsPipeline(VulnerableCodeBaseImporterPipelineV2): + """ + Improver pipeline to scout References/Patch and create PackageCommitPatch entries. + """ + + pipeline_id = "collect_ref_fix_commits_v2" + license_expression = None + + @classmethod + def steps(cls): + return (cls.collect_and_store_fix_commits,) + + def get_vcs_commit(self, url): + """Extracts and VCS URL and commit hash from URL. + >> get_vcs_commit('https://github.com/aboutcode-org/vulnerablecode/commit/98e516011d6e096e25247b82fc5f196bbeecff10') + ('https://github.com/aboutcode-org/vulnerablecode', '98e516011d6e096e25247b82fc5f196bbeecff10') + >> get_vcs_commit('https://github.com/aboutcode-org/vulnerablecode/pull/1974') + None + """ + purl = url2purl(url) + if not purl or purl.type not in VCS_URLS_SUPPORTED_TYPES: + return None + + version = getattr(purl, "version", None) + if not version or not is_commit(version): + return None + + vcs_url = purl2url(get_core_purl(purl).to_string()) + return (vcs_url, version) if vcs_url else None + + def collect_and_store_fix_commits(self): + impacted_packages_advisories = ( + AdvisoryV2.objects.filter(impacted_packages__isnull=False) + .prefetch_related("references", "patches", "impacted_packages") + .distinct() + ) + + progress = LoopProgress( + total_iterations=impacted_packages_advisories.count(), logger=self.log + ) + for adv in progress.iter(impacted_packages_advisories.paginated(per_page=500)): + urls = {r.url for r in adv.references.all()} | {p.patch_url for p in adv.patches.all()} + impacted_packages = list(adv.impacted_packages.all()) + + for url in urls: + vcs_data = self.get_vcs_commit(url) + if not vcs_data: + continue + + vcs_url, commit_hash = vcs_data + package_commit_obj, _ = PackageCommitPatch.objects.get_or_create( + vcs_url=vcs_url, commit_hash=commit_hash + ) + package_commit_obj.fixed_impacted_packages.add(*impacted_packages) diff --git a/vulnerabilities/tests/pipelines/v2_improvers/test_ref_collect_commits_v2.py b/vulnerabilities/tests/pipelines/v2_improvers/test_ref_collect_commits_v2.py new file mode 100644 index 000000000..51f3d9fe8 --- /dev/null +++ b/vulnerabilities/tests/pipelines/v2_improvers/test_ref_collect_commits_v2.py @@ -0,0 +1,105 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. + +from datetime import datetime + +import pytest + +from vulnerabilities.models import AdvisoryReference +from vulnerabilities.models import AdvisoryV2 +from vulnerabilities.models import ImpactedPackage +from vulnerabilities.models import PackageCommitPatch +from vulnerabilities.models import PackageV2 +from vulnerabilities.pipelines.v2_improvers.reference_collect_commits import ( + CollectReferencesFixCommitsPipeline, +) + + +@pytest.mark.django_db +def test_is_vcs_url_already_processed_true(): + advisory = AdvisoryV2.objects.create( + advisory_id="CVE-2025-9999", + datasource_id="test-ds", + avid="test-ds/CVE-2025-9999", + url="https://example.com/advisory/CVE-2025-9999", + unique_content_id="11111", + date_collected=datetime.now(), + ) + package = PackageV2.objects.create( + type="bar", + name="foo", + version="1.0", + ) + impact = ImpactedPackage.objects.create(advisory=advisory) + impact.affecting_packages.add(package) + package_commit_patch = PackageCommitPatch.objects.create( + vcs_url="https://github.com/user/repo/commit/6bd301819f8f69331a55ae2336c8b111fc933f3d", + commit_hash="6bd301819f8f69331a55ae2336c8b111fc933f3d", + ) + impact.fixed_by_package_commit_patches.add(package_commit_patch) + + +@pytest.mark.django_db +def test_collect_fix_commits_pipeline_creates_entry(): + advisory = AdvisoryV2.objects.create( + advisory_id="CVE-2025-1000", + datasource_id="test-ds", + avid="test-ds/CVE-2025-1000", + url="https://example.com/advisory/CVE-2025-1000", + unique_content_id="11111", + date_collected=datetime.now(), + ) + package = PackageV2.objects.create( + type="foo", + name="testpkg", + version="1.0", + ) + reference = AdvisoryReference.objects.create( + url="https://github.com/test/testpkg/commit/6bd301819f8f69331a55ae2336c8b111fc933f3d" + ) + impact = ImpactedPackage.objects.create(advisory=advisory) + impact.affecting_packages.add(package) + advisory.references.add(reference) + + pipeline = CollectReferencesFixCommitsPipeline() + pipeline.collect_and_store_fix_commits() + + package_commit_patch = PackageCommitPatch.objects.all() + + assert package_commit_patch.count() == 1 + fix = package_commit_patch.first() + assert fix.commit_hash == "6bd301819f8f69331a55ae2336c8b111fc933f3d" + assert fix.vcs_url == "https://github.com/test/testpkg" + + +@pytest.mark.django_db +def test_collect_fix_commits_pipeline_skips_non_commit_urls(): + advisory = AdvisoryV2.objects.create( + advisory_id="CVE-2025-2000", + datasource_id="test-ds", + avid="test-ds/CVE-2025-2000", + url="https://example.com/advisory/CVE-2025-2000", + unique_content_id="11111", + date_collected=datetime.now(), + ) + package = PackageV2.objects.create( + type="pypi", + name="otherpkg", + version="2.0", + ) + impact = ImpactedPackage.objects.create(advisory=advisory) + impact.affecting_packages.add(package) + + reference = AdvisoryReference.objects.create( + url="https://github.com/test/testpkg/issues/12" + ) # invalid reference 1 + advisory.references.add(reference) + + pipeline = CollectReferencesFixCommitsPipeline() + pipeline.collect_and_store_fix_commits() + assert PackageCommitPatch.objects.count() == 0 From 2c2c1584eda2aea3e1cf14f5a1e86f8321faccd2 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Fri, 20 Feb 2026 04:05:39 +0200 Subject: [PATCH 2/9] Improve the pipeline structure and reduce the number of database queries Signed-off-by: ziad hany --- .../pipelines/v2_improvers/reference_collect_commits.py | 3 +-- ...lect_commits_v2.py => test_reference_collect_commits_v2.py} | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) rename vulnerabilities/tests/pipelines/v2_improvers/{test_ref_collect_commits_v2.py => test_reference_collect_commits_v2.py} (98%) diff --git a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py index 80a266b5d..0fb9af8cd 100644 --- a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py +++ b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py @@ -16,7 +16,6 @@ from vulnerabilities.models import PackageCommitPatch from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 from vulnerabilities.pipes.advisory import VCS_URLS_SUPPORTED_TYPES -from vulnerabilities.tests.test_export import package from vulnerabilities.utils import is_commit @@ -73,4 +72,4 @@ def collect_and_store_fix_commits(self): package_commit_obj, _ = PackageCommitPatch.objects.get_or_create( vcs_url=vcs_url, commit_hash=commit_hash ) - package_commit_obj.fixed_impacted_packages.add(*impacted_packages) + package_commit_obj.fixed_in_impacts.add(*impacted_packages) diff --git a/vulnerabilities/tests/pipelines/v2_improvers/test_ref_collect_commits_v2.py b/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py similarity index 98% rename from vulnerabilities/tests/pipelines/v2_improvers/test_ref_collect_commits_v2.py rename to vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py index 51f3d9fe8..2b65c07ba 100644 --- a/vulnerabilities/tests/pipelines/v2_improvers/test_ref_collect_commits_v2.py +++ b/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py @@ -75,6 +75,7 @@ def test_collect_fix_commits_pipeline_creates_entry(): fix = package_commit_patch.first() assert fix.commit_hash == "6bd301819f8f69331a55ae2336c8b111fc933f3d" assert fix.vcs_url == "https://github.com/test/testpkg" + assert impact.fixed_by_package_commit_patches.count() == 1 @pytest.mark.django_db From eda8e8143c23b06800df698dd7a7bc260ddb5e69 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Fri, 20 Feb 2026 05:13:45 +0200 Subject: [PATCH 3/9] Delete the unnecessary license_expression var. Signed-off-by: ziad hany --- .../pipelines/v2_improvers/reference_collect_commits.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py index 0fb9af8cd..64c605cd8 100644 --- a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py +++ b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py @@ -25,7 +25,6 @@ class CollectReferencesFixCommitsPipeline(VulnerableCodeBaseImporterPipelineV2): """ pipeline_id = "collect_ref_fix_commits_v2" - license_expression = None @classmethod def steps(cls): From 9656668725a6e876efa39a387512900c077d3193 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Fri, 20 Feb 2026 14:09:50 +0200 Subject: [PATCH 4/9] Drop unused test Signed-off-by: ziad hany --- .../test_reference_collect_commits_v2.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py b/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py index 2b65c07ba..b07b610c2 100644 --- a/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py +++ b/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py @@ -20,30 +20,6 @@ ) -@pytest.mark.django_db -def test_is_vcs_url_already_processed_true(): - advisory = AdvisoryV2.objects.create( - advisory_id="CVE-2025-9999", - datasource_id="test-ds", - avid="test-ds/CVE-2025-9999", - url="https://example.com/advisory/CVE-2025-9999", - unique_content_id="11111", - date_collected=datetime.now(), - ) - package = PackageV2.objects.create( - type="bar", - name="foo", - version="1.0", - ) - impact = ImpactedPackage.objects.create(advisory=advisory) - impact.affecting_packages.add(package) - package_commit_patch = PackageCommitPatch.objects.create( - vcs_url="https://github.com/user/repo/commit/6bd301819f8f69331a55ae2336c8b111fc933f3d", - commit_hash="6bd301819f8f69331a55ae2336c8b111fc933f3d", - ) - impact.fixed_by_package_commit_patches.add(package_commit_patch) - - @pytest.mark.django_db def test_collect_fix_commits_pipeline_creates_entry(): advisory = AdvisoryV2.objects.create( From 7fe2626af9ad993430bd2720b3e81e21cb6466e6 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Fri, 27 Feb 2026 15:55:39 +0200 Subject: [PATCH 5/9] Update the pipeline to use bulk Signed-off-by: ziad hany --- .../v2_improvers/reference_collect_commits.py | 116 +++++++++++++----- .../test_reference_collect_commits_v2.py | 20 +-- 2 files changed, 91 insertions(+), 45 deletions(-) diff --git a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py index 64c605cd8..6680b27a7 100644 --- a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py +++ b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py @@ -8,18 +8,21 @@ # from aboutcode.pipeline import LoopProgress +from django.db.models import Prefetch from packageurl.contrib.purl2url import purl2url from packageurl.contrib.url2purl import url2purl from aboutcode.federated import get_core_purl +from vulnerabilities.models import AdvisoryReference from vulnerabilities.models import AdvisoryV2 +from vulnerabilities.models import ImpactedPackage from vulnerabilities.models import PackageCommitPatch -from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 -from vulnerabilities.pipes.advisory import VCS_URLS_SUPPORTED_TYPES +from vulnerabilities.models import Patch +from vulnerabilities.pipelines import VulnerableCodePipeline from vulnerabilities.utils import is_commit -class CollectReferencesFixCommitsPipeline(VulnerableCodeBaseImporterPipelineV2): +class CollectReferencesFixCommitsPipeline(VulnerableCodePipeline): """ Improver pipeline to scout References/Patch and create PackageCommitPatch entries. """ @@ -30,45 +33,98 @@ class CollectReferencesFixCommitsPipeline(VulnerableCodeBaseImporterPipelineV2): def steps(cls): return (cls.collect_and_store_fix_commits,) - def get_vcs_commit(self, url): - """Extracts and VCS URL and commit hash from URL. + def get_vcs_data(self, url): + """Extracts a VCS URL and commit hash from URL. >> get_vcs_commit('https://github.com/aboutcode-org/vulnerablecode/commit/98e516011d6e096e25247b82fc5f196bbeecff10') - ('https://github.com/aboutcode-org/vulnerablecode', '98e516011d6e096e25247b82fc5f196bbeecff10') + ("pkg:github/aboutcode-org/vulnerablecode", 'https://github.com/aboutcode-org/vulnerablecode', '98e516011d6e096e25247b82fc5f196bbeecff10') >> get_vcs_commit('https://github.com/aboutcode-org/vulnerablecode/pull/1974') None """ - purl = url2purl(url) - if not purl or purl.type not in VCS_URLS_SUPPORTED_TYPES: - return None + try: + purl = url2purl(url) + if not purl: + return - version = getattr(purl, "version", None) - if not version or not is_commit(version): - return None - - vcs_url = purl2url(get_core_purl(purl).to_string()) - return (vcs_url, version) if vcs_url else None + version = purl.version + if not version or not is_commit(version): + return + base_purl = get_core_purl(purl) + vcs_url = purl2url(base_purl.to_string()) + if base_purl and vcs_url and version: + return base_purl, vcs_url, version + except Exception as e: + self.log(f"Invalid URL: url:{url} error:{e}") def collect_and_store_fix_commits(self): - impacted_packages_advisories = ( - AdvisoryV2.objects.filter(impacted_packages__isnull=False) - .prefetch_related("references", "patches", "impacted_packages") - .distinct() + advisories = AdvisoryV2.objects.only("id").prefetch_related( + Prefetch("references", queryset=AdvisoryReference.objects.only("url")), + Prefetch("patches", queryset=Patch.objects.only("patch_url")), ) - progress = LoopProgress( - total_iterations=impacted_packages_advisories.count(), logger=self.log - ) - for adv in progress.iter(impacted_packages_advisories.paginated(per_page=500)): + progress = LoopProgress(total_iterations=advisories.count(), logger=self.log) + + commit_batch = [] + updated_pkg_patch_commit_count = 0 + batch_size = 1000 + for adv in progress.iter(advisories.paginated(per_page=batch_size)): urls = {r.url for r in adv.references.all()} | {p.patch_url for p in adv.patches.all()} - impacted_packages = list(adv.impacted_packages.all()) for url in urls: - vcs_data = self.get_vcs_commit(url) + vcs_data = self.get_vcs_data(url) if not vcs_data: continue + base_purl, vcs_url, commit_hash = vcs_data + commit_batch.append((str(base_purl), vcs_url, commit_hash, adv.id)) + + if len(commit_batch) >= batch_size: + updated_pkg_patch_commit_count += self.bulk_commit_batch_update(commit_batch) + commit_batch.clear() + + if commit_batch: + updated_pkg_patch_commit_count += self.bulk_commit_batch_update(commit_batch) + commit_batch.clear() + + self.log(f"Successfully processed pkg patch commit {updated_pkg_patch_commit_count:,d}") + + def bulk_commit_batch_update(self, vcs_data_table): + impact_data = {(row[0], row[3]) for row in vcs_data_table} # base_purl, adv_id + commit_data = {(row[1], row[2]) for row in vcs_data_table} # vcs_url, commit_hash + + adv_ids = {aid for _, aid in impact_data} + existing_impacts = ImpactedPackage.objects.filter(advisory_id__in=adv_ids) + existing_impact_pairs = {(ip.base_purl, ip.advisory_id) for ip in existing_impacts} + + new_impacts = impact_data - existing_impact_pairs + if new_impacts: + ImpactedPackage.objects.bulk_create( + [ImpactedPackage(base_purl=bp, advisory_id=aid) for bp, aid in new_impacts] + ) + + PackageCommitPatch.objects.bulk_create( + [ + PackageCommitPatch(vcs_url=vcs_url, commit_hash=commit_hash) + for vcs_url, commit_hash in commit_data + ], + ignore_conflicts=True, + ) + + adv_ids = {adv_id for _, adv_id in impact_data} + fetched_impacts = { + (impacted_pkg.base_purl, impacted_pkg.advisory_id): impacted_pkg + for impacted_pkg in ImpactedPackage.objects.filter(advisory_id__in=adv_ids) + } + + commit_hashes = {commit_hash for _, commit_hash in commit_data} + fetched_commits = { + (pkg_commit_patch.vcs_url, pkg_commit_patch.commit_hash): pkg_commit_patch + for pkg_commit_patch in PackageCommitPatch.objects.filter(commit_hash__in=commit_hashes) + } + + for base_purl, vcs_url, commit_hash, adv_id in vcs_data_table: + impacted_package = fetched_impacts.get((base_purl, adv_id)) + package_commit_obj = fetched_commits.get((vcs_url, commit_hash)) + + if impacted_package and package_commit_obj: + package_commit_obj.fixed_in_impacts.add(impacted_package) - vcs_url, commit_hash = vcs_data - package_commit_obj, _ = PackageCommitPatch.objects.get_or_create( - vcs_url=vcs_url, commit_hash=commit_hash - ) - package_commit_obj.fixed_in_impacts.add(*impacted_packages) + return len(vcs_data_table) diff --git a/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py b/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py index b07b610c2..cb26f04f2 100644 --- a/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py +++ b/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py @@ -30,28 +30,25 @@ def test_collect_fix_commits_pipeline_creates_entry(): unique_content_id="11111", date_collected=datetime.now(), ) - package = PackageV2.objects.create( - type="foo", - name="testpkg", - version="1.0", - ) + reference = AdvisoryReference.objects.create( url="https://github.com/test/testpkg/commit/6bd301819f8f69331a55ae2336c8b111fc933f3d" ) - impact = ImpactedPackage.objects.create(advisory=advisory) - impact.affecting_packages.add(package) advisory.references.add(reference) pipeline = CollectReferencesFixCommitsPipeline() pipeline.collect_and_store_fix_commits() package_commit_patch = PackageCommitPatch.objects.all() + impacted_packages = advisory.impacted_packages.all() assert package_commit_patch.count() == 1 + assert impacted_packages.count() == 1 + fix = package_commit_patch.first() assert fix.commit_hash == "6bd301819f8f69331a55ae2336c8b111fc933f3d" assert fix.vcs_url == "https://github.com/test/testpkg" - assert impact.fixed_by_package_commit_patches.count() == 1 + assert impacted_packages.first().fixed_by_package_commit_patches.count() == 1 @pytest.mark.django_db @@ -64,13 +61,6 @@ def test_collect_fix_commits_pipeline_skips_non_commit_urls(): unique_content_id="11111", date_collected=datetime.now(), ) - package = PackageV2.objects.create( - type="pypi", - name="otherpkg", - version="2.0", - ) - impact = ImpactedPackage.objects.create(advisory=advisory) - impact.affecting_packages.add(package) reference = AdvisoryReference.objects.create( url="https://github.com/test/testpkg/issues/12" From acfb3e5346593294eec7c1273db32ff9d66c2773 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Fri, 27 Feb 2026 16:12:17 +0200 Subject: [PATCH 6/9] this pipeline should be registered as improver Signed-off-by: ziad hany --- vulnerabilities/improvers/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py index c55c14c8a..923a01a3d 100644 --- a/vulnerabilities/improvers/__init__.py +++ b/vulnerabilities/improvers/__init__.py @@ -31,6 +31,7 @@ ) from vulnerabilities.pipelines.v2_improvers import flag_ghost_packages as flag_ghost_packages_v2 from vulnerabilities.pipelines.v2_improvers import group_advisories_for_packages +from vulnerabilities.pipelines.v2_improvers import reference_collect_commits from vulnerabilities.pipelines.v2_improvers import relate_severities from vulnerabilities.pipelines.v2_improvers import unfurl_version_range as unfurl_version_range_v2 from vulnerabilities.utils import create_registry @@ -74,5 +75,6 @@ relate_severities.RelateSeveritiesPipeline, group_advisories_for_packages.GroupAdvisoriesForPackages, compute_advisory_todo_v2.ComputeToDo, + reference_collect_commits.CollectReferencesFixCommitsPipeline, ] ) From 04236f2b5886bfcf29739ff8416da82eb9b01faa Mon Sep 17 00:00:00 2001 From: ziad hany Date: Sat, 28 Feb 2026 02:27:57 +0200 Subject: [PATCH 7/9] Try to optimize the CollectReferencesFixCommitsPipeline pipeline Signed-off-by: ziad hany --- .../v2_improvers/reference_collect_commits.py | 47 ++++++++++++------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py index 6680b27a7..275025797 100644 --- a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py +++ b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py @@ -7,6 +7,8 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +from collections import defaultdict + from aboutcode.pipeline import LoopProgress from django.db.models import Prefetch from packageurl.contrib.purl2url import purl2url @@ -65,7 +67,7 @@ def collect_and_store_fix_commits(self): commit_batch = [] updated_pkg_patch_commit_count = 0 - batch_size = 1000 + batch_size = 10000 for adv in progress.iter(advisories.paginated(per_page=batch_size)): urls = {r.url for r in adv.references.all()} | {p.patch_url for p in adv.patches.all()} @@ -90,14 +92,22 @@ def bulk_commit_batch_update(self, vcs_data_table): impact_data = {(row[0], row[3]) for row in vcs_data_table} # base_purl, adv_id commit_data = {(row[1], row[2]) for row in vcs_data_table} # vcs_url, commit_hash - adv_ids = {aid for _, aid in impact_data} - existing_impacts = ImpactedPackage.objects.filter(advisory_id__in=adv_ids) - existing_impact_pairs = {(ip.base_purl, ip.advisory_id) for ip in existing_impacts} + adv_ids = {adv_id for _, adv_id in impact_data} + commit_hashes = {commit_hash for _, commit_hash in commit_data} - new_impacts = impact_data - existing_impact_pairs - if new_impacts: + existing_impacts = ImpactedPackage.objects.filter(advisory_id__in=adv_ids).only( + "base_purl", "advisory_id" + ) + existing_impact_pairs = { + (impact_pkg.base_purl, impact_pkg.advisory_id) for impact_pkg in existing_impacts + } + + if new_impacts := impact_data - existing_impact_pairs: ImpactedPackage.objects.bulk_create( - [ImpactedPackage(base_purl=bp, advisory_id=aid) for bp, aid in new_impacts] + [ + ImpactedPackage(base_purl=base_purl, advisory_id=adv_id) + for base_purl, adv_id in new_impacts + ] ) PackageCommitPatch.objects.bulk_create( @@ -108,23 +118,28 @@ def bulk_commit_batch_update(self, vcs_data_table): ignore_conflicts=True, ) - adv_ids = {adv_id for _, adv_id in impact_data} fetched_impacts = { (impacted_pkg.base_purl, impacted_pkg.advisory_id): impacted_pkg - for impacted_pkg in ImpactedPackage.objects.filter(advisory_id__in=adv_ids) + for impacted_pkg in ImpactedPackage.objects.filter(advisory_id__in=adv_ids).only( + "base_purl", "advisory_id" + ) } - commit_hashes = {commit_hash for _, commit_hash in commit_data} - fetched_commits = { + fetched_pkg_commits = { (pkg_commit_patch.vcs_url, pkg_commit_patch.commit_hash): pkg_commit_patch - for pkg_commit_patch in PackageCommitPatch.objects.filter(commit_hash__in=commit_hashes) + for pkg_commit_patch in PackageCommitPatch.objects.filter( + commit_hash__in=commit_hashes + ).only("vcs_url", "commit_hash") } + pkg_commit_add_impact_pkg = defaultdict(list) for base_purl, vcs_url, commit_hash, adv_id in vcs_data_table: - impacted_package = fetched_impacts.get((base_purl, adv_id)) - package_commit_obj = fetched_commits.get((vcs_url, commit_hash)) + impacted_pkg_obj = fetched_impacts.get((base_purl, adv_id)) + pkg_commit_obj = fetched_pkg_commits.get((vcs_url, commit_hash)) + if impacted_pkg_obj and pkg_commit_obj: + pkg_commit_add_impact_pkg[pkg_commit_obj].append(impacted_pkg_obj) - if impacted_package and package_commit_obj: - package_commit_obj.fixed_in_impacts.add(impacted_package) + for pkg_commit_obj, impact_pkgs in pkg_commit_add_impact_pkg.items(): + pkg_commit_obj.fixed_in_impacts.add(*impact_pkgs) return len(vcs_data_table) From fa72a1b43dddbd7343dcf775cd6c4a28d0be42d5 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Tue, 3 Mar 2026 03:30:15 +0200 Subject: [PATCH 8/9] Remove the invalid registration as an importer Signed-off-by: ziad hany --- vulnerabilities/importers/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 739890218..c0cf04ed7 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -81,7 +81,6 @@ from vulnerabilities.pipelines.v2_importers import ubuntu_osv_importer as ubuntu_osv_importer_v2 from vulnerabilities.pipelines.v2_importers import vulnrichment_importer as vulnrichment_importer_v2 from vulnerabilities.pipelines.v2_importers import xen_importer as xen_importer_v2 -from vulnerabilities.pipelines.v2_improvers import reference_collect_commits from vulnerabilities.utils import create_registry IMPORTERS_REGISTRY = create_registry( @@ -126,7 +125,6 @@ nginx_importer.NginxImporterPipeline, pysec_importer.PyPIImporterPipeline, fireeye_importer_v2.FireeyeImporterPipeline, - reference_collect_commits.CollectReferencesFixCommitsPipeline, apache_tomcat.ApacheTomcatImporter, postgresql.PostgreSQLImporter, debian.DebianImporter, From ddda5cf74340f5aead88970d5b834d9056d98b72 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Thu, 14 May 2026 17:37:21 +0300 Subject: [PATCH 9/9] Update bulk_commit_batch_update to avoid through_model to bulk create PackageCommitPatch.fixed_in_impacts Set ignore_conflicts to True, add id when doing .only(), use iterator Signed-off-by: ziad hany --- .../v2_improvers/reference_collect_commits.py | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py index 275025797..e694b9a14 100644 --- a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py +++ b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py @@ -6,7 +6,6 @@ # See https://github.com/aboutcode-org/vulnerablecode for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # - from collections import defaultdict from aboutcode.pipeline import LoopProgress @@ -59,8 +58,8 @@ def get_vcs_data(self, url): def collect_and_store_fix_commits(self): advisories = AdvisoryV2.objects.only("id").prefetch_related( - Prefetch("references", queryset=AdvisoryReference.objects.only("url")), - Prefetch("patches", queryset=Patch.objects.only("patch_url")), + Prefetch("references", queryset=AdvisoryReference.objects.only("id", "url")), + Prefetch("patches", queryset=Patch.objects.only("id", "patch_url")), ) progress = LoopProgress(total_iterations=advisories.count(), logger=self.log) @@ -68,7 +67,7 @@ def collect_and_store_fix_commits(self): commit_batch = [] updated_pkg_patch_commit_count = 0 batch_size = 10000 - for adv in progress.iter(advisories.paginated(per_page=batch_size)): + for adv in progress.iter(advisories.iterator(chunk_size=batch_size)): urls = {r.url for r in adv.references.all()} | {p.patch_url for p in adv.patches.all()} for url in urls: @@ -96,7 +95,7 @@ def bulk_commit_batch_update(self, vcs_data_table): commit_hashes = {commit_hash for _, commit_hash in commit_data} existing_impacts = ImpactedPackage.objects.filter(advisory_id__in=adv_ids).only( - "base_purl", "advisory_id" + "id", "base_purl", "advisory_id" ) existing_impact_pairs = { (impact_pkg.base_purl, impact_pkg.advisory_id) for impact_pkg in existing_impacts @@ -107,7 +106,8 @@ def bulk_commit_batch_update(self, vcs_data_table): [ ImpactedPackage(base_purl=base_purl, advisory_id=adv_id) for base_purl, adv_id in new_impacts - ] + ], + ignore_conflicts=True, ) PackageCommitPatch.objects.bulk_create( @@ -121,7 +121,7 @@ def bulk_commit_batch_update(self, vcs_data_table): fetched_impacts = { (impacted_pkg.base_purl, impacted_pkg.advisory_id): impacted_pkg for impacted_pkg in ImpactedPackage.objects.filter(advisory_id__in=adv_ids).only( - "base_purl", "advisory_id" + "id", "base_purl", "advisory_id" ) } @@ -129,17 +129,27 @@ def bulk_commit_batch_update(self, vcs_data_table): (pkg_commit_patch.vcs_url, pkg_commit_patch.commit_hash): pkg_commit_patch for pkg_commit_patch in PackageCommitPatch.objects.filter( commit_hash__in=commit_hashes - ).only("vcs_url", "commit_hash") + ).only("id", "vcs_url", "commit_hash") } - pkg_commit_add_impact_pkg = defaultdict(list) + through_model = PackageCommitPatch.fixed_in_impacts.through + + relations = [] for base_purl, vcs_url, commit_hash, adv_id in vcs_data_table: impacted_pkg_obj = fetched_impacts.get((base_purl, adv_id)) pkg_commit_obj = fetched_pkg_commits.get((vcs_url, commit_hash)) - if impacted_pkg_obj and pkg_commit_obj: - pkg_commit_add_impact_pkg[pkg_commit_obj].append(impacted_pkg_obj) - - for pkg_commit_obj, impact_pkgs in pkg_commit_add_impact_pkg.items(): - pkg_commit_obj.fixed_in_impacts.add(*impact_pkgs) + if impacted_pkg_obj and pkg_commit_obj: + relations.append( + through_model( + packagecommitpatch_id=pkg_commit_obj.id, + impactedpackage_id=impacted_pkg_obj.id, + ) + ) + + through_model.objects.bulk_create( + relations, + ignore_conflicts=True, + batch_size=10000, + ) return len(vcs_data_table)