From 93d6704689cc7a09968afce099574820e72624d8 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Tue, 10 Mar 2026 17:11:04 +1100 Subject: [PATCH 01/15] feat: move Vanir signature generation to a cron job --- Makefile | 3 + deployment/build-and-stage.yaml | 12 + .../gke-workers/base/kustomization.yaml | 1 + .../gke-workers/base/vanir-signatures.yaml | 25 +++ .../oss-vdb-test/kustomization.yaml | 1 + .../oss-vdb-test/vanir-signatures.yaml | 16 ++ .../environments/oss-vdb/kustomization.yaml | 1 + .../oss-vdb/vanir-signatures.yaml | 16 ++ gcp/workers/cloudbuild.yaml | 8 + gcp/workers/oss_fuzz_worker/worker.py | 41 ---- gcp/workers/vanir_signatures/Dockerfile | 19 ++ gcp/workers/vanir_signatures/run_tests.sh | 22 ++ .../vanir_signatures/vanir_signatures.py | 208 ++++++++++++++++++ .../vanir_signatures/vanir_signatures_test.py | 121 ++++++++++ gcp/workers/worker/worker.py | 59 +---- osv/models.py | 10 +- 16 files changed, 468 insertions(+), 95 deletions(-) create mode 100644 deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml create mode 100644 deployment/clouddeploy/gke-workers/environments/oss-vdb-test/vanir-signatures.yaml create mode 100644 deployment/clouddeploy/gke-workers/environments/oss-vdb/vanir-signatures.yaml create mode 100644 gcp/workers/vanir_signatures/Dockerfile create mode 100755 gcp/workers/vanir_signatures/run_tests.sh create mode 100644 gcp/workers/vanir_signatures/vanir_signatures.py create mode 100644 gcp/workers/vanir_signatures/vanir_signatures_test.py diff --git a/Makefile b/Makefile index 098ac5cb511..071c70a2514 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,9 @@ importer-tests: recoverer-tests: cd gcp/workers/recoverer && ./run_tests.sh +vanir-signatures-tests: + cd gcp/workers/vanir_signatures && ./run_tests.sh + website-tests: cd gcp/website && ./run_tests.sh diff --git a/deployment/build-and-stage.yaml b/deployment/build-and-stage.yaml index e0c8792efdf..532ad6bf320 100644 --- a/deployment/build-and-stage.yaml +++ b/deployment/build-and-stage.yaml @@ -216,6 +216,16 @@ steps: args: ['push', '--all-tags', 'gcr.io/oss-vdb-test/osv-linter'] waitFor: ['build-osv-linter', 'cloud-build-queue'] +# Build/push vanir-signatures images to gcr.io/oss-vdb. +- name: gcr.io/cloud-builders/docker + args: ['build', '-t', 'gcr.io/oss-vdb/vanir-signatures:latest', '-t', 'gcr.io/oss-vdb/vanir-signatures:$COMMIT_SHA', '.'] + dir: 'gcp/workers/vanir_signatures' + id: 'build-vanir-signatures' + waitFor: ['build-worker'] +- name: gcr.io/cloud-builders/docker + args: ['push', '--all-tags', 'gcr.io/oss-vdb/vanir-signatures'] + waitFor: ['build-vanir-signatures', 'cloud-build-queue'] + # Build/push cron job images. - name: gcr.io/cloud-builders/docker args: ['build', '-t', 'gcr.io/oss-vdb/cron:latest', '-t', 'gcr.io/oss-vdb/cron:$COMMIT_SHA', '.'] @@ -432,6 +442,7 @@ steps: relations=gcr.io/oss-vdb/relations:$COMMIT_SHA,\ generatesitemap=gcr.io/oss-vdb/generatesitemap:$COMMIT_SHA,\ gitter=gcr.io/oss-vdb/gitter:$COMMIT_SHA,\ + vanir-signatures=gcr.io/oss-vdb/vanir-signatures:$COMMIT_SHA,\ cron=gcr.io/oss-vdb/cron:$COMMIT_SHA" ] dir: deployment/clouddeploy/gke-workers @@ -496,3 +507,4 @@ images: - 'gcr.io/oss-vdb/oss-fuzz-importer:$COMMIT_SHA' - 'gcr.io/oss-vdb/generatesitemap:$COMMIT_SHA' - 'gcr.io/oss-vdb/gitter:$COMMIT_SHA' +- 'gcr.io/oss-vdb/vanir-signatures:$COMMIT_SHA' diff --git a/deployment/clouddeploy/gke-workers/base/kustomization.yaml b/deployment/clouddeploy/gke-workers/base/kustomization.yaml index 4125bd5a47c..4d21838b682 100644 --- a/deployment/clouddeploy/gke-workers/base/kustomization.yaml +++ b/deployment/clouddeploy/gke-workers/base/kustomization.yaml @@ -30,4 +30,5 @@ resources: - record-checker.yaml - cve5-to-osv.yaml - custommetrics.yaml +- vanir-signatures.yaml diff --git a/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml b/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml new file mode 100644 index 00000000000..4cdcd2ebea0 --- /dev/null +++ b/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml @@ -0,0 +1,25 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: vanir-signatures + labels: + cronLastSuccessfulTimeMins: "60" +spec: + schedule: "0 * * * *" + concurrencyPolicy: Forbid + jobTemplate: + spec: + template: + spec: + containers: + - name: vanir-signatures + image: vanir-signatures + imagePullPolicy: Always + resources: + requests: + cpu: "1" + memory: "10G" + limits: + cpu: "1" + memory: "13G" + restartPolicy: Never diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml index 601085fd537..91165997cf7 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/kustomization.yaml @@ -26,3 +26,4 @@ patches: - path: record-checker.yaml - path: custommetrics.yaml - path: gitter.yaml +- path: vanir-signatures.yaml diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/vanir-signatures.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/vanir-signatures.yaml new file mode 100644 index 00000000000..cae25cded4a --- /dev/null +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb-test/vanir-signatures.yaml @@ -0,0 +1,16 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: vanir-signatures +spec: + jobTemplate: + spec: + template: + spec: + containers: + - name: vanir-signatures + env: + - name: GOOGLE_CLOUD_PROJECT + value: oss-vdb-test + - name: OSV_VULNERABILITIES_BUCKET + value: osv-test-vulnerabilities diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/kustomization.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/kustomization.yaml index f04532deffc..c95f9ffc101 100644 --- a/deployment/clouddeploy/gke-workers/environments/oss-vdb/kustomization.yaml +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/kustomization.yaml @@ -23,4 +23,5 @@ patches: - path: cve5-to-osv.yaml - path: custommetrics.yaml - path: gitter.yaml +- path: vanir-signatures.yaml diff --git a/deployment/clouddeploy/gke-workers/environments/oss-vdb/vanir-signatures.yaml b/deployment/clouddeploy/gke-workers/environments/oss-vdb/vanir-signatures.yaml new file mode 100644 index 00000000000..3bbd48cc966 --- /dev/null +++ b/deployment/clouddeploy/gke-workers/environments/oss-vdb/vanir-signatures.yaml @@ -0,0 +1,16 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: vanir-signatures +spec: + jobTemplate: + spec: + template: + spec: + containers: + - name: vanir-signatures + env: + - name: GOOGLE_CLOUD_PROJECT + value: oss-vdb + - name: OSV_VULNERABILITIES_BUCKET + value: osv-vulnerabilities diff --git a/gcp/workers/cloudbuild.yaml b/gcp/workers/cloudbuild.yaml index 51390bb3038..242fd8e79d2 100644 --- a/gcp/workers/cloudbuild.yaml +++ b/gcp/workers/cloudbuild.yaml @@ -62,6 +62,14 @@ steps: - GITTER_PORT=8891 waitFor: ['init', 'sync'] +- name: 'gcr.io/oss-vdb/ci' + id: 'vanir-signatures-tests' + dir: gcp/workers/vanir_signatures + args: ['bash', '-ex', 'run_tests.sh'] + env: + - DATASTORE_EMULATOR_PORT=8006 + waitFor: ['init', 'sync'] + timeout: 7200s options: machineType: E2_HIGHCPU_8 diff --git a/gcp/workers/oss_fuzz_worker/worker.py b/gcp/workers/oss_fuzz_worker/worker.py index 9187b6b6c51..259c9cee052 100644 --- a/gcp/workers/oss_fuzz_worker/worker.py +++ b/gcp/workers/oss_fuzz_worker/worker.py @@ -42,8 +42,6 @@ from osv import vulnerability_pb2 import oss_fuzz -from vanir import vulnerability_manager - DEFAULT_WORK_DIR = '/work' OSS_FUZZ_GIT_URL = 'https://github.com/google/oss-fuzz.git' TASK_SUBSCRIPTION = 'oss-fuzz-tasks' @@ -499,43 +497,6 @@ def _analyze_vulnerability(self, source_repo, repo, vulnerability, path, vulnerability.id) raise UpdateConflictError - def _generate_vanir_signatures(self, vulnerability): - """Generates Vanir signatures for a vulnerability.""" - if not any(r.type == vulnerability_pb2.Range.GIT - for affected in vulnerability.affected - for r in affected.ranges): - logging.info( - 'Skipping Vanir signature generation for %s as it has no ' - 'GIT affected ranges.', vulnerability.id) - return vulnerability - if any(affected.package.name == "Kernel" and - affected.package.ecosystem == "Linux" - for affected in vulnerability.affected): - logging.info( - 'Skipping Vanir signature generation for %s as it is a ' - 'Kernel vulnerability.', vulnerability.id) - return vulnerability - - logging.info('Generating Vanir signatures for %s', vulnerability.id) - try: - vuln_manager = vulnerability_manager.generate_from_json_string( - content=json.dumps([ - json_format.MessageToDict( - vulnerability, preserving_proto_field_name=True) - ]),) - vuln_manager.generate_signatures() - - if not vuln_manager.vulnerabilities: - logging.warning('Vanir signature generation resulted in no ' - 'vulnerabilities.') - return vulnerability - - return vuln_manager.vulnerabilities[0].to_proto() - except Exception: - logging.exception('Failed to generate Vanir signatures for %s', - vulnerability.id) - return vulnerability - def _do_update(self, source_repo, repo, vulnerability, relative_path, original_sha256): """Process updates on a vulnerability.""" @@ -551,8 +512,6 @@ def _do_update(self, source_repo, repo, vulnerability, relative_path, # Keep a copy of the original modified date from the source file. orig_modified_date = vulnerability.modified.ToDatetime(datetime.UTC) - # Fully enrich the vulnerability object in memory. - vulnerability = self._generate_vanir_signatures(vulnerability) try: result = self._analyze_vulnerability(source_repo, repo, vulnerability, relative_path, original_sha256) diff --git a/gcp/workers/vanir_signatures/Dockerfile b/gcp/workers/vanir_signatures/Dockerfile new file mode 100644 index 00000000000..ea1a363acd9 --- /dev/null +++ b/gcp/workers/vanir_signatures/Dockerfile @@ -0,0 +1,19 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM gcr.io/oss-vdb/worker + +COPY vanir_signatures.py /usr/local/bin/vanir_signatures.py +RUN chmod 755 /usr/local/bin/vanir_signatures.py +ENTRYPOINT ["vanir_signatures.py"] diff --git a/gcp/workers/vanir_signatures/run_tests.sh b/gcp/workers/vanir_signatures/run_tests.sh new file mode 100755 index 00000000000..8031da4464a --- /dev/null +++ b/gcp/workers/vanir_signatures/run_tests.sh @@ -0,0 +1,22 @@ +#!/bin/bash -ex +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cd ../worker + +# Install dependencies only if not running in Cloud Build +if [ -z "$CLOUDBUILD" ]; then + poetry sync +fi +poetry run python ../vanir_signatures/vanir_signatures_test.py diff --git a/gcp/workers/vanir_signatures/vanir_signatures.py b/gcp/workers/vanir_signatures/vanir_signatures.py new file mode 100644 index 00000000000..83a6504ec9f --- /dev/null +++ b/gcp/workers/vanir_signatures/vanir_signatures.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Cron job to generate Vanir signatures for modified vulnerabilities.""" + +import argparse +import datetime +import json +import logging +import os + +from google.cloud import ndb +from google.protobuf import json_format + +import osv +import osv.logs +import osv.models +import osv.gcs +from osv import vulnerability_pb2 + +from vanir import vulnerability_manager + +JOB_NAME = 'vanir_signatures' +JOB_DATA_LAST_RUN = 'vanir_signatures_last_run' + + +def _generate_vanir_signatures( + vulnerability: vulnerability_pb2.Vulnerability +) -> vulnerability_pb2.Vulnerability: + """Generates Vanir signatures for a vulnerability.""" + logging.info('Generating Vanir signatures for %s', vulnerability.id) + try: + vuln_manager = vulnerability_manager.generate_from_json_string( + content=json.dumps([ + json_format.MessageToDict( + vulnerability, preserving_proto_field_name=True) + ]),) + vuln_manager.generate_signatures() + + if not vuln_manager.vulnerabilities: + logging.warning('Vanir signature generation resulted in no ' + 'vulnerabilities.') + return vulnerability + + return vuln_manager.vulnerabilities[0].to_proto() + except Exception: + logging.exception('Failed to generate Vanir signatures for %s', + vulnerability.id) + return vulnerability + +def affected_is_kernel(affected: vulnerability_pb2.Affected) -> bool: + """Returns True if the affected package is a Linux kernel.""" + if affected.package.name == 'Kernel' and \ + affected.package.ecosystem == 'Linux': + return True + + if any('git.kernel.org/pub/scm/linux/kernel/git' in ar.repo + for ar in affected.ranges): + return True + + return False + +def process_vulnerability(vuln_id, dry_run=False, output_dir=None): + """Process a single vulnerability to generate Vanir signatures.""" + logging.debug('Processing %s', vuln_id) + + vuln_and_gen = osv.gcs.get_by_id_with_generation(vuln_id) + if not vuln_and_gen: + logging.warning('Vulnerability %s not found in GCS', vuln_id) + return False + + vulnerability, gcs_gen = vuln_and_gen + original_vulnerability = vulnerability_pb2.Vulnerability() + original_vulnerability.CopyFrom(vulnerability) + + if not any(r.type == vulnerability_pb2.Range.GIT + for affected in vulnerability.affected + for r in affected.ranges): + logging.debug( + 'Skipping Vanir signature generation for %s as it has no ' + 'GIT affected ranges.', vuln_id) + return False + + if any('vanir_signatures' in affected.database_specific.fields + for affected in vulnerability.affected): + logging.debug( + 'Skipping Vanir signature generation for %s as it already has ' + 'Vanir signatures.', vuln_id) + return False + + if any(affected_is_kernel(affected) for affected in vulnerability.affected): + logging.debug('Skipping %s as it is a Kernel vulnerability', vuln_id) + return False + + enriched_vulnerability = _generate_vanir_signatures(vulnerability) + + if original_vulnerability == enriched_vulnerability: + logging.debug('No changes in Vanir signatures for %s', vuln_id) + return False + + if dry_run: + logging.info('Dry run: would have updated %s', vuln_id) + if output_dir: + if not os.path.exists(output_dir): + os.makedirs(output_dir) + output_path = os.path.join(output_dir, f'{vuln_id}.json') + with open(output_path, 'w') as f: + f.write( + json_format.MessageToJson( + enriched_vulnerability, preserving_proto_field_name=True)) + logging.info('Saved enriched vulnerability to %s', output_path) + return True + + bug = osv.Bug.get_by_id(vuln_id) + if not bug: + logging.error('Bug %s not found in Datastore', vuln_id) + return False + + bug.update_from_vulnerability(enriched_vulnerability) + bug.last_modified = osv.utcnow() + bug.put() + + logging.info('Updated Datastore for %s, now uploading to GCS', vuln_id) + try: + osv.gcs.upload_vulnerability(enriched_vulnerability, gcs_gen) + except Exception: + logging.error('Failed to upload %s to GCS', vuln_id) + # Even if GCS upload fails, we return True as the Datastore is updated. + # Bug._post_put_hook will also attempt to upload the vulnerability. + + return True + + +def main(): + """Main entry point for the cron job.""" + parser = argparse.ArgumentParser(description='Vanir signatures cron job.') + parser.add_argument( + '--dry-run', action='store_true', help='Perform a dry run.') + parser.add_argument( + '--output-dir', + help='Directory to save enriched vulnerabilities during dry run.') + args = parser.parse_args() + + if args.dry_run: + logging.getLogger().setLevel(logging.DEBUG) + + last_run_key = ndb.Key(osv.models.JobData, JOB_DATA_LAST_RUN) + last_run_data = last_run_key.get() + + # Capture current time to use as last_run for the next time. + current_run = osv.utcnow() + + if last_run_data: + last_run = last_run_data.value + logging.info('Running Vanir signature generation since %s', last_run) + else: + # If there is no record of the last run, query vulnerabilities modified + # since one day ago to avoid processing all vulnerabilities. + last_run = current_run - datetime.timedelta(days=1) + logging.info('No last run found, querying vulnerabilities since %s.', + last_run) + + query = osv.models.Vulnerability.query( + osv.models.Vulnerability.modified > last_run) + + vuln_ids = [key.id() for key in query.fetch(keys_only=True)] + + logging.info('Found %d vulnerabilities to process', len(vuln_ids)) + + generated_count = 0 + for vuln_id in vuln_ids: + try: + if process_vulnerability(vuln_id, args.dry_run, args.output_dir): + generated_count += 1 + except Exception: + logging.exception('Error processing vulnerability %s', vuln_id) + + logging.info('Processed %d vulnerabilities, generated %d new signatures.', + len(vuln_ids), generated_count) + + if args.dry_run: + logging.info('Dry run: would have updated last_run to %s', current_run) + return + + # Update last_run timestamp + if not last_run_data: + last_run_data = osv.models.JobData(id=JOB_DATA_LAST_RUN) + + last_run_data.value = current_run + last_run_data.put() + + +if __name__ == '__main__': + _ndb_client = ndb.Client() + osv.logs.setup_gcp_logging(JOB_NAME) + with _ndb_client.context(): + main() diff --git a/gcp/workers/vanir_signatures/vanir_signatures_test.py b/gcp/workers/vanir_signatures/vanir_signatures_test.py new file mode 100644 index 00000000000..9a0ed716d89 --- /dev/null +++ b/gcp/workers/vanir_signatures/vanir_signatures_test.py @@ -0,0 +1,121 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for vanir_signatures.""" + +import unittest +from unittest import mock + +from google.cloud import ndb + +import osv +import osv.tests +import vanir_signatures +from osv import vulnerability_pb2 + + +class VanirSignaturesTest(unittest.TestCase): + """Tests for vanir_signatures.""" + + @classmethod + def setUpClass(cls): + cls.emulator = cls.enterClassContext(osv.tests.datastore_emulator()) + cls.enterClassContext(ndb.Client().context(cache_policy=False)) + + def setUp(self): + self.emulator.reset() + + @mock.patch('osv.gcs.get_by_id_with_generation') + def test_process_vulnerability_skip_existing_signatures(self, mock_get_gcs): + """Test skipping when signatures already exist.""" + vuln_id = 'OSV-2026-1' + vuln = vulnerability_pb2.Vulnerability(id=vuln_id) + affected = vuln.affected.add() + affected.database_specific['vanir_signatures'] = [] + affected.ranges.add(type=vulnerability_pb2.Range.GIT, repo='https://example.com/repo') + + mock_get_gcs.return_value = (vuln, '123') + + with self.assertLogs(level='DEBUG') as cm: + result = vanir_signatures.process_vulnerability(vuln_id) + self.assertFalse(result) + self.assertTrue(any('already has Vanir signatures' in log for log in cm.output)) + + @mock.patch('osv.gcs.get_by_id_with_generation') + def test_process_vulnerability_skip_no_git_ranges(self, mock_get_gcs): + """Test skipping when no GIT ranges are present.""" + vuln_id = 'OSV-2026-1' + vuln = vulnerability_pb2.Vulnerability(id=vuln_id) + vuln.affected.add() + + mock_get_gcs.return_value = (vuln, '123') + + with self.assertLogs(level='DEBUG') as cm: + result = vanir_signatures.process_vulnerability(vuln_id) + self.assertFalse(result) + self.assertTrue(any('has no GIT affected ranges' in log for log in cm.output)) + + @mock.patch('osv.gcs.get_by_id_with_generation') + def test_process_vulnerability_skip_kernel(self, mock_get_gcs): + """Test skipping kernel vulnerabilities.""" + vuln_id = 'CVE-2023-1234' + vuln = vulnerability_pb2.Vulnerability(id=vuln_id) + affected = vuln.affected.add() + affected.package.name = 'Kernel' + affected.package.ecosystem = 'Linux' + affected.ranges.add(type=vulnerability_pb2.Range.GIT, repo='https://example.com/kernel-repo') + + mock_get_gcs.return_value = (vuln, '123') + + with self.assertLogs(level='DEBUG') as cm: + result = vanir_signatures.process_vulnerability(vuln_id) + self.assertFalse(result) + self.assertTrue(any('is a Kernel vulnerability' in log for log in cm.output)) + + @mock.patch('osv.gcs.get_by_id_with_generation') + @mock.patch('osv.gcs.upload_vulnerability') + @mock.patch('vanir_signatures._generate_vanir_signatures') + def test_process_vulnerability_success(self, mock_gen_signatures, + mock_upload, mock_get_gcs): + """Test successful signature generation.""" + vuln_id = 'OSV-2026-1' + + # Input vulnerability + vuln = vulnerability_pb2.Vulnerability(id=vuln_id) + affected = vuln.affected.add() + affected.ranges.add(type=vulnerability_pb2.Range.GIT, repo='https://example.com/repo') + + mock_get_gcs.return_value = (vuln, '123') + + # Mock generation result + enriched_vuln = vulnerability_pb2.Vulnerability() + enriched_vuln.CopyFrom(vuln) + enriched_vuln.affected[0].database_specific['vanir_signatures'] = [{'id': 'sig1'}] + mock_gen_signatures.return_value = enriched_vuln + + # Setup Datastore Bug + bug = osv.Bug(id=vuln_id, db_id=vuln_id, source='test') + bug.put() + + result = vanir_signatures.process_vulnerability(vuln_id) + + self.assertTrue(result) + mock_upload.assert_called_once() + + # Verify Datastore update + updated_bug = osv.Bug.get_by_id(vuln_id) + self.assertIn('vanir_signatures', updated_bug.affected_packages[0].database_specific) + + +if __name__ == '__main__': + unittest.main() diff --git a/gcp/workers/worker/worker.py b/gcp/workers/worker/worker.py index 1f6d2d6a115..68f974c87d4 100644 --- a/gcp/workers/worker/worker.py +++ b/gcp/workers/worker/worker.py @@ -44,8 +44,6 @@ from osv import vulnerability_pb2, purl_helpers import oss_fuzz -from vanir import vulnerability_manager - DEFAULT_WORK_DIR = '/work' OSS_FUZZ_GIT_URL = 'https://github.com/google/oss-fuzz.git' TASK_SUBSCRIPTION = 'tasks' @@ -551,43 +549,6 @@ def _analyze_vulnerability(self, source_repo: osv.SourceRepository, vulnerability.id) raise UpdateConflictError - def _generate_vanir_signatures( - self, vulnerability: vulnerability_pb2.Vulnerability - ) -> vulnerability_pb2.Vulnerability: - """Generates Vanir signatures for a vulnerability.""" - if not any(r.type == vulnerability_pb2.Range.GIT - for affected in vulnerability.affected - for r in affected.ranges): - logging.info( - 'Skipping Vanir signature generation for %s as it has no ' - 'GIT affected ranges.', vulnerability.id) - return vulnerability - if any(affected_is_kernel(affected) for affected in vulnerability.affected): - logging.info( - 'Skipping Vanir signature generation for %s as it is a ' - 'Kernel vulnerability.', vulnerability.id) - return vulnerability - - logging.info('Generating Vanir signatures for %s', vulnerability.id) - try: - vuln_manager = vulnerability_manager.generate_from_json_string( - content=json.dumps([ - json_format.MessageToDict( - vulnerability, preserving_proto_field_name=True) - ]),) - vuln_manager.generate_signatures() - - if not vuln_manager.vulnerabilities: - logging.warning('Vanir signature generation resulted in no ' - 'vulnerabilities.') - return vulnerability - - return vuln_manager.vulnerabilities[0].to_proto() - except Exception: - logging.exception('Failed to generate Vanir signatures for %s', - vulnerability.id) - return vulnerability - def _do_update(self, source_repo: osv.SourceRepository, repo: pygit2.Repository | None, vulnerability: vulnerability_pb2.Vulnerability, @@ -605,20 +566,12 @@ def _do_update(self, source_repo: osv.SourceRepository, # Keep a copy of the original modified date from the source file. orig_modified_date = vulnerability.modified.ToDatetime(datetime.UTC) - # Fully enrich the vulnerability object in memory. - vulnerability = self._generate_vanir_signatures(vulnerability) - if any(affected_is_kernel(affected) for affected in vulnerability.affected): - result = None - logging.info( - 'Skipping Vuln Analysis for %s as it is a ' - 'Kernel vulnerability.', vulnerability.id) - else: - try: - result = self._analyze_vulnerability(source_repo, repo, vulnerability, - relative_path, original_sha256) - except UpdateConflictError: - # Discard changes due to conflict. - return + try: + result = self._analyze_vulnerability(source_repo, repo, vulnerability, + relative_path, original_sha256) + except UpdateConflictError: + # Discard changes due to conflict. + return vuln_and_gen = osv.gcs.get_by_id_with_generation(vulnerability.id) gcs_gen = None diff --git a/osv/models.py b/osv/models.py index 263c0697598..067c1e40d80 100644 --- a/osv/models.py +++ b/osv/models.py @@ -947,7 +947,7 @@ class Vulnerability(ndb.Model): """A Vulnerability entry. Contains a minimal amount of information of an OSV record, including the - overall modified date, an some raw fields that are overwritten by our + overall modified date, an some raw fields that are overwritten by our enrichment. The entity's key/id is ID in OSV. @@ -973,6 +973,14 @@ class Vulnerability(ndb.Model): upstream_raw: list[str] = ndb.StringProperty(repeated=True) +# --- JobData --- + + +class JobData(ndb.Model): + """Job data.""" + value = ndb.GenericProperty() + + # --- Affected versions for matching --- From d6f7ff0911d14e1bf73419eea50432e697a73726 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Wed, 25 Mar 2026 16:54:00 +1100 Subject: [PATCH 02/15] lint --- .../gke-workers/base/vanir-signatures.yaml | 3 ++- .../vanir_signatures/vanir_signatures.py | 24 ++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml b/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml index 4cdcd2ebea0..80bfe7144db 100644 --- a/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml +++ b/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml @@ -5,7 +5,8 @@ metadata: labels: cronLastSuccessfulTimeMins: "60" spec: - schedule: "0 * * * *" + schedule: "0 9 * * *" + timeZone: "Australia/Sydney" concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/gcp/workers/vanir_signatures/vanir_signatures.py b/gcp/workers/vanir_signatures/vanir_signatures.py index 83a6504ec9f..7c98141025c 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures.py +++ b/gcp/workers/vanir_signatures/vanir_signatures.py @@ -19,6 +19,7 @@ import json import logging import os +import time from google.cloud import ndb from google.protobuf import json_format @@ -59,6 +60,7 @@ def _generate_vanir_signatures( vulnerability.id) return vulnerability + def affected_is_kernel(affected: vulnerability_pb2.Affected) -> bool: """Returns True if the affected package is a Linux kernel.""" if affected.package.name == 'Kernel' and \ @@ -71,6 +73,7 @@ def affected_is_kernel(affected: vulnerability_pb2.Affected) -> bool: return False + def process_vulnerability(vuln_id, dry_run=False, output_dir=None): """Process a single vulnerability to generate Vanir signatures.""" logging.debug('Processing %s', vuln_id) @@ -103,7 +106,8 @@ def process_vulnerability(vuln_id, dry_run=False, output_dir=None): logging.debug('Skipping %s as it is a Kernel vulnerability', vuln_id) return False - enriched_vulnerability = _generate_vanir_signatures(vulnerability) + # enriched_vulnerability = _generate_vanir_signatures(vulnerability) + enriched_vulnerability = vulnerability if original_vulnerability == enriched_vulnerability: logging.debug('No changes in Vanir signatures for %s', vuln_id) @@ -164,30 +168,32 @@ def main(): if last_run_data: last_run = last_run_data.value logging.info('Running Vanir signature generation since %s', last_run) + query = osv.models.Vulnerability.query( + osv.models.Vulnerability.modified > last_run) else: - # If there is no record of the last run, query vulnerabilities modified - # since one day ago to avoid processing all vulnerabilities. - last_run = current_run - datetime.timedelta(days=1) - logging.info('No last run found, querying vulnerabilities since %s.', - last_run) - - query = osv.models.Vulnerability.query( - osv.models.Vulnerability.modified > last_run) + logging.info('No last run found, querying all vulnerabilities.') + query = osv.models.Vulnerability.query() vuln_ids = [key.id() for key in query.fetch(keys_only=True)] logging.info('Found %d vulnerabilities to process', len(vuln_ids)) generated_count = 0 + start_time = time.time() for vuln_id in vuln_ids: try: if process_vulnerability(vuln_id, args.dry_run, args.output_dir): generated_count += 1 except Exception: logging.exception('Error processing vulnerability %s', vuln_id) + end_time = time.time() + total_time = end_time - start_time + avg_time = total_time / len(vuln_ids) if vuln_ids else 0 logging.info('Processed %d vulnerabilities, generated %d new signatures.', len(vuln_ids), generated_count) + logging.info('Total processing time: %.2f seconds (Avg %.4f seconds/vuln)', + total_time, avg_time) if args.dry_run: logging.info('Dry run: would have updated last_run to %s', current_run) From ef3574d3d41f97f69dd8123f53e488bed181c3b5 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Wed, 25 Mar 2026 17:25:14 +1100 Subject: [PATCH 03/15] worker test and format --- .../vanir_signatures/vanir_signatures_test.py | 56 ++++++++------- .../RESTUpdateTest_update_no_introduced.txt | 22 +----- .../testdata/UpdateTest_last_affected_git.txt | 10 +-- ...pdateTest_normalized_pypi_pubsub_calls.txt | 2 +- .../testdata/UpdateTest_pypi_pubsub_calls.txt | 2 +- .../testdata/UpdateTest_update_bucket_cve.txt | 69 +------------------ 6 files changed, 42 insertions(+), 119 deletions(-) diff --git a/gcp/workers/vanir_signatures/vanir_signatures_test.py b/gcp/workers/vanir_signatures/vanir_signatures_test.py index 9a0ed716d89..3b874b4c5b1 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures_test.py +++ b/gcp/workers/vanir_signatures/vanir_signatures_test.py @@ -42,14 +42,16 @@ def test_process_vulnerability_skip_existing_signatures(self, mock_get_gcs): vuln = vulnerability_pb2.Vulnerability(id=vuln_id) affected = vuln.affected.add() affected.database_specific['vanir_signatures'] = [] - affected.ranges.add(type=vulnerability_pb2.Range.GIT, repo='https://example.com/repo') - + affected.ranges.add( + type=vulnerability_pb2.Range.GIT, repo='https://example.com/repo') + mock_get_gcs.return_value = (vuln, '123') - + with self.assertLogs(level='DEBUG') as cm: result = vanir_signatures.process_vulnerability(vuln_id) self.assertFalse(result) - self.assertTrue(any('already has Vanir signatures' in log for log in cm.output)) + self.assertTrue( + any('already has Vanir signatures' in log for log in cm.output)) @mock.patch('osv.gcs.get_by_id_with_generation') def test_process_vulnerability_skip_no_git_ranges(self, mock_get_gcs): @@ -57,13 +59,14 @@ def test_process_vulnerability_skip_no_git_ranges(self, mock_get_gcs): vuln_id = 'OSV-2026-1' vuln = vulnerability_pb2.Vulnerability(id=vuln_id) vuln.affected.add() - + mock_get_gcs.return_value = (vuln, '123') - + with self.assertLogs(level='DEBUG') as cm: result = vanir_signatures.process_vulnerability(vuln_id) self.assertFalse(result) - self.assertTrue(any('has no GIT affected ranges' in log for log in cm.output)) + self.assertTrue( + any('has no GIT affected ranges' in log for log in cm.output)) @mock.patch('osv.gcs.get_by_id_with_generation') def test_process_vulnerability_skip_kernel(self, mock_get_gcs): @@ -73,48 +76,55 @@ def test_process_vulnerability_skip_kernel(self, mock_get_gcs): affected = vuln.affected.add() affected.package.name = 'Kernel' affected.package.ecosystem = 'Linux' - affected.ranges.add(type=vulnerability_pb2.Range.GIT, repo='https://example.com/kernel-repo') - + affected.ranges.add( + type=vulnerability_pb2.Range.GIT, + repo='https://example.com/kernel-repo') + mock_get_gcs.return_value = (vuln, '123') - + with self.assertLogs(level='DEBUG') as cm: result = vanir_signatures.process_vulnerability(vuln_id) self.assertFalse(result) - self.assertTrue(any('is a Kernel vulnerability' in log for log in cm.output)) + self.assertTrue( + any('is a Kernel vulnerability' in log for log in cm.output)) @mock.patch('osv.gcs.get_by_id_with_generation') @mock.patch('osv.gcs.upload_vulnerability') @mock.patch('vanir_signatures._generate_vanir_signatures') - def test_process_vulnerability_success(self, mock_gen_signatures, - mock_upload, mock_get_gcs): + def test_process_vulnerability_success(self, mock_gen_signatures, mock_upload, + mock_get_gcs): """Test successful signature generation.""" vuln_id = 'OSV-2026-1' - + # Input vulnerability vuln = vulnerability_pb2.Vulnerability(id=vuln_id) affected = vuln.affected.add() - affected.ranges.add(type=vulnerability_pb2.Range.GIT, repo='https://example.com/repo') - + affected.ranges.add( + type=vulnerability_pb2.Range.GIT, repo='https://example.com/repo') + mock_get_gcs.return_value = (vuln, '123') - + # Mock generation result enriched_vuln = vulnerability_pb2.Vulnerability() enriched_vuln.CopyFrom(vuln) - enriched_vuln.affected[0].database_specific['vanir_signatures'] = [{'id': 'sig1'}] + enriched_vuln.affected[0].database_specific['vanir_signatures'] = [{ + 'id': 'sig1' + }] mock_gen_signatures.return_value = enriched_vuln - + # Setup Datastore Bug bug = osv.Bug(id=vuln_id, db_id=vuln_id, source='test') bug.put() - + result = vanir_signatures.process_vulnerability(vuln_id) - + self.assertTrue(result) mock_upload.assert_called_once() - + # Verify Datastore update updated_bug = osv.Bug.get_by_id(vuln_id) - self.assertIn('vanir_signatures', updated_bug.affected_packages[0].database_specific) + self.assertIn('vanir_signatures', + updated_bug.affected_packages[0].database_specific) if __name__ == '__main__': diff --git a/gcp/workers/worker/testdata/RESTUpdateTest_update_no_introduced.txt b/gcp/workers/worker/testdata/RESTUpdateTest_update_no_introduced.txt index 8acf0e2cf80..fbce3c1ce52 100644 --- a/gcp/workers/worker/testdata/RESTUpdateTest_update_no_introduced.txt +++ b/gcp/workers/worker/testdata/RESTUpdateTest_update_no_introduced.txt @@ -1,24 +1,4 @@ -{ 'affected': [ { 'database_specific': { 'source': 'http://localhost:8000/CURL-CVE-2022-32221.json', - 'vanir_signatures': [ { 'deprecated': False, - 'digest': { 'function_hash': '22968065415160735040135778472335782425', - 'length': 58084.0}, - 'id': 'CURL-CVE-2022-32221-9751f04c', - 'signature_type': 'Function', - 'signature_version': 'v1', - 'source': 'https://github.com/curl/curl.git/commit/a64e3e59938abd7d667e4470a18072a24d7e9de9', - 'target': { 'file': 'lib/setopt.c', - 'function': 'Curl_vsetopt'}}, - { 'deprecated': False, - 'digest': { 'line_hashes': [ '73596727404438881622769716353410783065', - '150108665408450698810391826671290668314', - '264542534956227828232279400943172691231', - '248438938282829223471764231064667949049'], - 'threshold': 0.9}, - 'id': 'CURL-CVE-2022-32221-b7951194', - 'signature_type': 'Line', - 'signature_version': 'v1', - 'source': 'https://github.com/curl/curl.git/commit/a64e3e59938abd7d667e4470a18072a24d7e9de9', - 'target': { 'file': 'lib/setopt.c'}}]}, +{ 'affected': [ { 'database_specific': { 'source': 'http://localhost:8000/CURL-CVE-2022-32221.json'}, 'ranges': [ { 'events': [ {'introduced': '7.7'}, {'fixed': '7.86.0'}], 'type': 'SEMVER'}, diff --git a/gcp/workers/worker/testdata/UpdateTest_last_affected_git.txt b/gcp/workers/worker/testdata/UpdateTest_last_affected_git.txt index 15c1c4cc099..b466919a17a 100644 --- a/gcp/workers/worker/testdata/UpdateTest_last_affected_git.txt +++ b/gcp/workers/worker/testdata/UpdateTest_last_affected_git.txt @@ -3,11 +3,11 @@ { 'introduced': 'febfac1940086bc1f6d3dc33fda0a1d1ba336209'}], 'repo': 'https://osv-test/repo/url', 'type': 'GIT'}], - 'versions': ['branch-v0.1.1', - 'branch-v0.1.1-with-fix', - 'branch_1_cherrypick_regress', - 'v0.1.1', - 'v0.2']}], + 'versions': [ 'branch-v0.1.1', + 'branch-v0.1.1-with-fix', + 'branch_1_cherrypick_regress', + 'v0.1.1', + 'v0.2']}], 'details': 'Blah blah blah\nBlah\n', 'id': 'OSV-TEST-last-affected-01', 'modified': '3000-01-01T00:00:00Z', diff --git a/gcp/workers/worker/testdata/UpdateTest_normalized_pypi_pubsub_calls.txt b/gcp/workers/worker/testdata/UpdateTest_normalized_pypi_pubsub_calls.txt index 109622dc0d6..b4c1a4acf69 100644 --- a/gcp/workers/worker/testdata/UpdateTest_normalized_pypi_pubsub_calls.txt +++ b/gcp/workers/worker/testdata/UpdateTest_normalized_pypi_pubsub_calls.txt @@ -1 +1 @@ -[call('projects/test-osv/topics/pypi-bridge', data=b'{"id": "PYSEC-456", "summary": "A vulnerability in an unnormalized package", "details": "Blah blah blah\\nBlah\\n", "modified": "3000-01-01T00:00:00Z", "published": "3000-01-01T00:00:00Z", "references": [{"type": "WEB", "url": "https://ref.com/ref"}], "affected": [{"package": {"name": "scrapy", "ecosystem": "PyPI", "purl": "pkg:pypi/scrapy"}, "ranges": [{"type": "ECOSYSTEM", "events": [{"introduced": "1.14.2"}, {"fixed": "1.31.0"}]}, {"type": "GIT", "repo": "https://osv-test/repo/url", "events": [{"introduced": "eefe8ec3f1f90d0e684890e810f3f21e8500a4cd"}, {"fixed": "8d8242f545e9cec3e6d0d2e3f5bde8be1c659735"}]}], "versions": []}], "schema_version": "SCHEMA_VERSION"}')] \ No newline at end of file +[ call('projects/test-osv/topics/pypi-bridge', data=b'{"id": "PYSEC-456", "summary": "A vulnerability in an unnormalized package", "details": "Blah blah blah\\nBlah\\n", "modified": "3000-01-01T00:00:00Z", "published": "3000-01-01T00:00:00Z", "references": [{"type": "WEB", "url": "https://ref.com/ref"}], "affected": [{"package": {"name": "scrapy", "ecosystem": "PyPI", "purl": "pkg:pypi/scrapy"}, "ranges": [{"type": "ECOSYSTEM", "events": [{"introduced": "1.14.2"}, {"fixed": "1.31.0"}]}, {"type": "GIT", "repo": "https://osv-test/repo/url", "events": [{"introduced": "eefe8ec3f1f90d0e684890e810f3f21e8500a4cd"}, {"fixed": "8d8242f545e9cec3e6d0d2e3f5bde8be1c659735"}]}], "versions": []}], "schema_version": "SCHEMA_VERSION"}')] \ No newline at end of file diff --git a/gcp/workers/worker/testdata/UpdateTest_pypi_pubsub_calls.txt b/gcp/workers/worker/testdata/UpdateTest_pypi_pubsub_calls.txt index 51f4eaac423..4737cbad84b 100644 --- a/gcp/workers/worker/testdata/UpdateTest_pypi_pubsub_calls.txt +++ b/gcp/workers/worker/testdata/UpdateTest_pypi_pubsub_calls.txt @@ -1 +1 @@ -[call('projects/test-osv/topics/pypi-bridge', data=b'{"id": "PYSEC-123", "summary": "A vulnerability", "details": "Blah blah blah\\nBlah\\n", "modified": "3000-01-01T00:00:00Z", "published": "3000-01-01T00:00:00Z", "references": [{"type": "WEB", "url": "https://ref.com/ref"}], "affected": [{"package": {"name": "grpcio", "ecosystem": "PyPI", "purl": "pkg:pypi/grpcio"}, "ranges": [{"type": "ECOSYSTEM", "events": [{"introduced": "1.14.2"}, {"fixed": "1.31.0"}]}, {"type": "GIT", "repo": "https://osv-test/repo/url", "events": [{"introduced": "eefe8ec3f1f90d0e684890e810f3f21e8500a4cd"}, {"fixed": "8d8242f545e9cec3e6d0d2e3f5bde8be1c659735"}]}], "versions": ["1.14.2", "1.15.0", "1.15.0rc1", "1.16.0", "1.16.0rc1", "1.16.1", "1.17.0", "1.17.1", "1.18.0", "1.19.0", "1.20.0", "1.20.0rc1", "1.20.0rc2", "1.20.0rc3", "1.20.1", "1.21.0rc1", "1.21.1", "1.21.1rc1", "1.22.0", "1.22.0rc1", "1.22.1", "1.23.0", "1.23.0rc1", "1.23.1", "1.24.0", "1.24.0rc1", "1.24.1", "1.24.3", "1.25.0", "1.25.0rc1", "1.26.0", "1.26.0rc1", "1.27.0rc1", "1.27.0rc2", "1.27.1", "1.27.2", "1.28.0rc1", "1.28.0rc2", "1.28.1", "1.29.0", "1.30.0"]}], "schema_version": "SCHEMA_VERSION"}')] \ No newline at end of file +[ call('projects/test-osv/topics/pypi-bridge', data=b'{"id": "PYSEC-123", "summary": "A vulnerability", "details": "Blah blah blah\\nBlah\\n", "modified": "3000-01-01T00:00:00Z", "published": "3000-01-01T00:00:00Z", "references": [{"type": "WEB", "url": "https://ref.com/ref"}], "affected": [{"package": {"name": "grpcio", "ecosystem": "PyPI", "purl": "pkg:pypi/grpcio"}, "ranges": [{"type": "ECOSYSTEM", "events": [{"introduced": "1.14.2"}, {"fixed": "1.31.0"}]}, {"type": "GIT", "repo": "https://osv-test/repo/url", "events": [{"introduced": "eefe8ec3f1f90d0e684890e810f3f21e8500a4cd"}, {"fixed": "8d8242f545e9cec3e6d0d2e3f5bde8be1c659735"}]}], "versions": ["1.14.2", "1.15.0", "1.15.0rc1", "1.16.0", "1.16.0rc1", "1.16.1", "1.17.0", "1.17.1", "1.18.0", "1.19.0", "1.20.0", "1.20.0rc1", "1.20.0rc2", "1.20.0rc3", "1.20.1", "1.21.0rc1", "1.21.1", "1.21.1rc1", "1.22.0", "1.22.0rc1", "1.22.1", "1.23.0", "1.23.0rc1", "1.23.1", "1.24.0", "1.24.0rc1", "1.24.1", "1.24.3", "1.25.0", "1.25.0rc1", "1.26.0", "1.26.0rc1", "1.27.0rc1", "1.27.0rc2", "1.27.1", "1.27.2", "1.28.0rc1", "1.28.0rc2", "1.28.1", "1.29.0", "1.30.0"]}], "schema_version": "SCHEMA_VERSION"}')] \ No newline at end of file diff --git a/gcp/workers/worker/testdata/UpdateTest_update_bucket_cve.txt b/gcp/workers/worker/testdata/UpdateTest_update_bucket_cve.txt index cdded136948..20f0c000b48 100644 --- a/gcp/workers/worker/testdata/UpdateTest_update_bucket_cve.txt +++ b/gcp/workers/worker/testdata/UpdateTest_update_bucket_cve.txt @@ -1,71 +1,4 @@ -{ 'affected': [ { 'database_specific': { 'vanir_signatures': [ { 'deprecated': False, - 'digest': { 'line_hashes': [ '18066036635502801806677364178756254862', - '88369412895184753394283011451803187548', - '50848458948504730426650075084385046530', - '91284993680127737564993618090545145416', - '30779278950355321333621475605602830830', - '122421578121241373365155348152646941523', - '267652210589392654099845994262755826062', - '334808111126213430220547654602188383660', - '234389204524678077984531197469034242690', - '152880517379272209571165325006789878786', - '299871312446227378724863519270618301341', - '157634544376100154879962283397081738110', - '103663099829328578689797223848801574827', - '158563421165358858389893196995983570762', - '315965584007238676040631750953088200664'], - 'threshold': 0.9}, - 'id': 'CVE-2016-15011-929806e0', - 'signature_type': 'Line', - 'signature_version': 'v1', - 'source': 'https://github.com/e-contract/dssp/commit/ec4238349691ec66dd30b416ec6eaab02d722302', - 'target': { 'file': 'dssp-client/src/main/java/be/e_contract/dssp/client/metadata/DigitalSignatureServiceMetadata.java'}}, - { 'deprecated': False, - 'digest': { 'line_hashes': [ '6674387965125354881111149989428882853', - '100741820504985357262218153349452233434', - '253366101641995550384755812786879052342', - '245037096886845520996519599411616661529', - '158852189579109359359946013476030639584', - '298614597347537877121532413760030558894', - '180181956682520524395173299138562004562', - '146502839243717526526406585366671557144', - '244996413515733361838850122849344525825', - '166554563875570093109470347687697544350', - '9427977046515615106319032886256396870', - '279044285883194738631442483325879094037', - '295198785562376785392477306514392217432', - '44714085523243422643465698936438435501', - '267608316591780380179772018605253867646', - '182074437577114148436758739432546664545', - '87621961972550109442760282702331746920', - '64897152403082006856773989396486955494', - '184138636401118235309885205539354874180', - '62633257528035095954429323509732904426'], - 'threshold': 0.9}, - 'id': 'CVE-2016-15011-bd561b7b', - 'signature_type': 'Line', - 'signature_version': 'v1', - 'source': 'https://github.com/e-contract/dssp/commit/ec4238349691ec66dd30b416ec6eaab02d722302', - 'target': { 'file': 'dssp-client/src/main/java/be/e_contract/dssp/client/SignResponseVerifier.java'}}, - { 'deprecated': False, - 'digest': { 'function_hash': '259495117689681377355427521574538727644', - 'length': 1591.0}, - 'id': 'CVE-2016-15011-d557e328', - 'signature_type': 'Function', - 'signature_version': 'v1', - 'source': 'https://github.com/e-contract/dssp/commit/ec4238349691ec66dd30b416ec6eaab02d722302', - 'target': { 'file': 'dssp-client/src/main/java/be/e_contract/dssp/client/metadata/DigitalSignatureServiceMetadata.java', - 'function': 'DigitalSignatureServiceMetadata'}}, - { 'deprecated': False, - 'digest': { 'function_hash': '249451297539985081987952306682300702892', - 'length': 4302.0}, - 'id': 'CVE-2016-15011-fcf07dd1', - 'signature_type': 'Function', - 'signature_version': 'v1', - 'source': 'https://github.com/e-contract/dssp/commit/ec4238349691ec66dd30b416ec6eaab02d722302', - 'target': { 'file': 'dssp-client/src/main/java/be/e_contract/dssp/client/SignResponseVerifier.java', - 'function': 'checkSignResponse'}}]}, - 'ranges': [ { 'events': [ {'introduced': '0'}, +{ 'affected': [ { 'ranges': [ { 'events': [ {'introduced': '0'}, { 'fixed': '001ef99b0c8194468de960d007e2d82dcebc3bca'}, { 'fixed': 'ec4238349691ec66dd30b416ec6eaab02d722302'}], 'repo': 'https://github.com/e-contract/dssp', From 41cbff1ab9b3d647f35da8244ae323d45f64f14c Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Wed, 25 Mar 2026 17:45:11 +1100 Subject: [PATCH 04/15] uncomment --- gcp/workers/oss_fuzz_worker/worker.py | 1 - gcp/workers/vanir_signatures/vanir_signatures.py | 4 +--- gcp/workers/worker/worker.py | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/gcp/workers/oss_fuzz_worker/worker.py b/gcp/workers/oss_fuzz_worker/worker.py index 259c9cee052..eb11edf5c2a 100644 --- a/gcp/workers/oss_fuzz_worker/worker.py +++ b/gcp/workers/oss_fuzz_worker/worker.py @@ -32,7 +32,6 @@ from google.cloud import pubsub_v1 from google.cloud import storage from google.cloud.storage import retry -from google.protobuf import json_format sys.path.append(os.path.dirname(os.path.realpath(__file__))) import osv diff --git a/gcp/workers/vanir_signatures/vanir_signatures.py b/gcp/workers/vanir_signatures/vanir_signatures.py index 7c98141025c..75ea76eaa0d 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures.py +++ b/gcp/workers/vanir_signatures/vanir_signatures.py @@ -15,7 +15,6 @@ """Cron job to generate Vanir signatures for modified vulnerabilities.""" import argparse -import datetime import json import logging import os @@ -106,8 +105,7 @@ def process_vulnerability(vuln_id, dry_run=False, output_dir=None): logging.debug('Skipping %s as it is a Kernel vulnerability', vuln_id) return False - # enriched_vulnerability = _generate_vanir_signatures(vulnerability) - enriched_vulnerability = vulnerability + enriched_vulnerability = _generate_vanir_signatures(vulnerability) if original_vulnerability == enriched_vulnerability: logging.debug('No changes in Vanir signatures for %s', vuln_id) diff --git a/gcp/workers/worker/worker.py b/gcp/workers/worker/worker.py index 68f974c87d4..aed9e4f534f 100644 --- a/gcp/workers/worker/worker.py +++ b/gcp/workers/worker/worker.py @@ -33,7 +33,7 @@ from google.cloud import pubsub_v1 from google.cloud import storage from google.cloud.storage import retry -from google.protobuf import json_format, timestamp_pb2 +from google.protobuf import timestamp_pb2 sys.path.append(os.path.dirname(os.path.realpath(__file__))) import osv From 62b8b0092c918ab95470bfc56088bc111d0340ec Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Wed, 25 Mar 2026 18:09:54 +1100 Subject: [PATCH 05/15] worker test --- gcp/workers/oss_fuzz_worker/worker.py | 1 + .../vanir_signatures/vanir_signatures_test.py | 6 +++--- gcp/workers/worker/worker.py | 19 +++++++++++++------ 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/gcp/workers/oss_fuzz_worker/worker.py b/gcp/workers/oss_fuzz_worker/worker.py index eb11edf5c2a..433b01dd8bd 100644 --- a/gcp/workers/oss_fuzz_worker/worker.py +++ b/gcp/workers/oss_fuzz_worker/worker.py @@ -511,6 +511,7 @@ def _do_update(self, source_repo, repo, vulnerability, relative_path, # Keep a copy of the original modified date from the source file. orig_modified_date = vulnerability.modified.ToDatetime(datetime.UTC) + # Fully enrich the vulnerability object in memory. try: result = self._analyze_vulnerability(source_repo, repo, vulnerability, relative_path, original_sha256) diff --git a/gcp/workers/vanir_signatures/vanir_signatures_test.py b/gcp/workers/vanir_signatures/vanir_signatures_test.py index 3b874b4c5b1..43132953469 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures_test.py +++ b/gcp/workers/vanir_signatures/vanir_signatures_test.py @@ -38,7 +38,7 @@ def setUp(self): @mock.patch('osv.gcs.get_by_id_with_generation') def test_process_vulnerability_skip_existing_signatures(self, mock_get_gcs): """Test skipping when signatures already exist.""" - vuln_id = 'OSV-2026-1' + vuln_id = 'OSV-2026-123' vuln = vulnerability_pb2.Vulnerability(id=vuln_id) affected = vuln.affected.add() affected.database_specific['vanir_signatures'] = [] @@ -56,7 +56,7 @@ def test_process_vulnerability_skip_existing_signatures(self, mock_get_gcs): @mock.patch('osv.gcs.get_by_id_with_generation') def test_process_vulnerability_skip_no_git_ranges(self, mock_get_gcs): """Test skipping when no GIT ranges are present.""" - vuln_id = 'OSV-2026-1' + vuln_id = 'OSV-2026-123' vuln = vulnerability_pb2.Vulnerability(id=vuln_id) vuln.affected.add() @@ -94,7 +94,7 @@ def test_process_vulnerability_skip_kernel(self, mock_get_gcs): def test_process_vulnerability_success(self, mock_gen_signatures, mock_upload, mock_get_gcs): """Test successful signature generation.""" - vuln_id = 'OSV-2026-1' + vuln_id = 'OSV-2026-123' # Input vulnerability vuln = vulnerability_pb2.Vulnerability(id=vuln_id) diff --git a/gcp/workers/worker/worker.py b/gcp/workers/worker/worker.py index aed9e4f534f..346b4531f75 100644 --- a/gcp/workers/worker/worker.py +++ b/gcp/workers/worker/worker.py @@ -566,12 +566,19 @@ def _do_update(self, source_repo: osv.SourceRepository, # Keep a copy of the original modified date from the source file. orig_modified_date = vulnerability.modified.ToDatetime(datetime.UTC) - try: - result = self._analyze_vulnerability(source_repo, repo, vulnerability, - relative_path, original_sha256) - except UpdateConflictError: - # Discard changes due to conflict. - return + # Fully enrich the vulnerability object in memory. + if any(affected_is_kernel(affected) for affected in vulnerability.affected): + result = None + logging.info( + 'Skipping Vuln Analysis for %s as it is a ' + 'Kernel vulnerability.', vulnerability.id) + else: + try: + result = self._analyze_vulnerability(source_repo, repo, vulnerability, + relative_path, original_sha256) + except UpdateConflictError: + # Discard changes due to conflict. + return vuln_and_gen = osv.gcs.get_by_id_with_generation(vulnerability.id) gcs_gen = None From 6ae2f17a68a3159e404f3d130220a2b3891d8a8b Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Thu, 26 Mar 2026 10:47:26 +1100 Subject: [PATCH 06/15] gitter port for vanir test --- gcp/workers/cloudbuild.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/gcp/workers/cloudbuild.yaml b/gcp/workers/cloudbuild.yaml index 242fd8e79d2..a21c1a936c3 100644 --- a/gcp/workers/cloudbuild.yaml +++ b/gcp/workers/cloudbuild.yaml @@ -68,6 +68,7 @@ steps: args: ['bash', '-ex', 'run_tests.sh'] env: - DATASTORE_EMULATOR_PORT=8006 + - GITTER_PORT=8892 waitFor: ['init', 'sync'] timeout: 7200s From ac72b2f449cfe260f23adddf72d7aacc1439be10 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Thu, 26 Mar 2026 11:00:10 +1100 Subject: [PATCH 07/15] remove vanir test --- gcp/workers/cloudbuild.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/gcp/workers/cloudbuild.yaml b/gcp/workers/cloudbuild.yaml index a21c1a936c3..51390bb3038 100644 --- a/gcp/workers/cloudbuild.yaml +++ b/gcp/workers/cloudbuild.yaml @@ -62,15 +62,6 @@ steps: - GITTER_PORT=8891 waitFor: ['init', 'sync'] -- name: 'gcr.io/oss-vdb/ci' - id: 'vanir-signatures-tests' - dir: gcp/workers/vanir_signatures - args: ['bash', '-ex', 'run_tests.sh'] - env: - - DATASTORE_EMULATOR_PORT=8006 - - GITTER_PORT=8892 - waitFor: ['init', 'sync'] - timeout: 7200s options: machineType: E2_HIGHCPU_8 From 7679d1ceb17341020494ff2d9ddb8f2e72adcc04 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Thu, 26 Mar 2026 11:08:15 +1100 Subject: [PATCH 08/15] add vanir test back and increase timeout --- gcp/workers/cloudbuild.yaml | 8 ++++++++ osv/tests.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/gcp/workers/cloudbuild.yaml b/gcp/workers/cloudbuild.yaml index 51390bb3038..242fd8e79d2 100644 --- a/gcp/workers/cloudbuild.yaml +++ b/gcp/workers/cloudbuild.yaml @@ -62,6 +62,14 @@ steps: - GITTER_PORT=8891 waitFor: ['init', 'sync'] +- name: 'gcr.io/oss-vdb/ci' + id: 'vanir-signatures-tests' + dir: gcp/workers/vanir_signatures + args: ['bash', '-ex', 'run_tests.sh'] + env: + - DATASTORE_EMULATOR_PORT=8006 + waitFor: ['init', 'sync'] + timeout: 7200s options: machineType: E2_HIGHCPU_8 diff --git a/osv/tests.py b/osv/tests.py index 90162f14705..17bceeeeb76 100644 --- a/osv/tests.py +++ b/osv/tests.py @@ -298,7 +298,7 @@ def setup_gitter(): # Wait a bit for it to start (optional, but good for stability) # Basic check to see if it crashed immediately try: - proc.wait(timeout=1.0) + proc.wait(timeout=10.0) # If it returns, it exited raise RuntimeError( f'Gitter exited early:\n{proc.stdout.read().decode()}\n\n' From 707f3adae9191e3d486f672ac8064a463d18ab78 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Thu, 26 Mar 2026 11:26:00 +1100 Subject: [PATCH 09/15] increase timeout further --- osv/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/osv/tests.py b/osv/tests.py index 17bceeeeb76..07568210c10 100644 --- a/osv/tests.py +++ b/osv/tests.py @@ -298,7 +298,7 @@ def setup_gitter(): # Wait a bit for it to start (optional, but good for stability) # Basic check to see if it crashed immediately try: - proc.wait(timeout=10.0) + proc.wait(timeout=15.0) # If it returns, it exited raise RuntimeError( f'Gitter exited early:\n{proc.stdout.read().decode()}\n\n' From f255fe553526b7a8bda38c5aca771ef64985a450 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Thu, 26 Mar 2026 14:08:27 +1100 Subject: [PATCH 10/15] debug setup gitter --- osv/tests.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/osv/tests.py b/osv/tests.py index 07568210c10..95ad4a3395d 100644 --- a/osv/tests.py +++ b/osv/tests.py @@ -19,6 +19,7 @@ import shutil import pprint import signal +import socket import time from proto import datetime_helpers @@ -295,17 +296,23 @@ def setup_gitter(): ) try: - # Wait a bit for it to start (optional, but good for stability) - # Basic check to see if it crashed immediately - try: - proc.wait(timeout=15.0) - # If it returns, it exited - raise RuntimeError( - f'Gitter exited early:\n{proc.stdout.read().decode()}\n\n' - f'{proc.stderr.read().decode()}') - except subprocess.TimeoutExpired: - # Process is still running - pass + # Wait for it to start + start = time.time() + while time.time() - start < 60: + if proc.poll() is not None: + # If it returns, it exited + raise RuntimeError( + f'Gitter exited early:\n{proc.stdout.read().decode()}\n\n' + f'{proc.stderr.read().decode()}') + + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(('localhost', int(gitter_port))) == 0: + print(f'Gitter ready in {time.time() - start:.2f}s.') + break + time.sleep(0.5) + else: + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) + raise RuntimeError('Gitter did not get ready in time.') yield From e6f7f9ffd1dbc234bb4a1ce7db53ba6c33636476 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Fri, 27 Mar 2026 15:09:48 +1100 Subject: [PATCH 11/15] batch --- .../vanir_signatures/vanir_signatures.py | 287 ++++++++++++------ .../vanir_signatures/vanir_signatures_test.py | 88 +++--- osv/gcs.py | 37 ++- 3 files changed, 275 insertions(+), 137 deletions(-) diff --git a/gcp/workers/vanir_signatures/vanir_signatures.py b/gcp/workers/vanir_signatures/vanir_signatures.py index 75ea76eaa0d..6fff836b904 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures.py +++ b/gcp/workers/vanir_signatures/vanir_signatures.py @@ -15,10 +15,11 @@ """Cron job to generate Vanir signatures for modified vulnerabilities.""" import argparse +import datetime import json import logging import os -import time +from concurrent import futures from google.cloud import ndb from google.protobuf import json_format @@ -35,35 +36,54 @@ JOB_DATA_LAST_RUN = 'vanir_signatures_last_run' -def _generate_vanir_signatures( - vulnerability: vulnerability_pb2.Vulnerability -) -> vulnerability_pb2.Vulnerability: - """Generates Vanir signatures for a vulnerability.""" - logging.info('Generating Vanir signatures for %s', vulnerability.id) +def _generate_vanir_signatures_batch( + vulnerabilities: list[vulnerability_pb2.Vulnerability] +) -> dict[str, list[vulnerability_pb2.Vulnerability]]: + """Generates Vanir signatures for a batch of vulnerabilities.""" + if not vulnerabilities: + return {} + + logging.info('Generating Vanir signatures for batch of %d', + len(vulnerabilities)) + try: + vuln_dicts = [ + json_format.MessageToDict(v, preserving_proto_field_name=True) + for v in vulnerabilities + ] vuln_manager = vulnerability_manager.generate_from_json_string( - content=json.dumps([ - json_format.MessageToDict( - vulnerability, preserving_proto_field_name=True) - ]),) + content=json.dumps(vuln_dicts)) vuln_manager.generate_signatures() if not vuln_manager.vulnerabilities: logging.warning('Vanir signature generation resulted in no ' 'vulnerabilities.') - return vulnerability + return {v.id: [v] for v in vulnerabilities} + + results = {} + for v in vuln_manager.vulnerabilities: + proto = v.to_proto() + if proto.id not in results: + results[proto.id] = [] + results[proto.id].append(proto) + + # Ensure all input IDs are in results, even if they weren't enriched + for v in vulnerabilities: + if v.id not in results: + results[v.id] = [v] + + return results - return vuln_manager.vulnerabilities[0].to_proto() except Exception: - logging.exception('Failed to generate Vanir signatures for %s', - vulnerability.id) - return vulnerability + logging.exception('Failed to generate Vanir signatures for batch of %d', + len(vulnerabilities)) + return {v.id: [v] for v in vulnerabilities} def affected_is_kernel(affected: vulnerability_pb2.Affected) -> bool: """Returns True if the affected package is a Linux kernel.""" - if affected.package.name == 'Kernel' and \ - affected.package.ecosystem == 'Linux': + if (affected.package.name == 'Kernel' and + affected.package.ecosystem == 'Linux'): return True if any('git.kernel.org/pub/scm/linux/kernel/git' in ar.repo @@ -73,85 +93,122 @@ def affected_is_kernel(affected: vulnerability_pb2.Affected) -> bool: return False -def process_vulnerability(vuln_id, dry_run=False, output_dir=None): - """Process a single vulnerability to generate Vanir signatures.""" - logging.debug('Processing %s', vuln_id) +def process_batch(vuln_ids: list[str], + dry_run: bool = False, + max_workers: int = 20) -> int: + """Process a batch of vulnerabilities.""" + if not vuln_ids: + return 0 + + logging.info('Processing batch of %d vulnerabilities', len(vuln_ids)) + + # Parallel fetch OSV records from GCS + vulnerabilities_to_process = [] + gcs_generations = {} - vuln_and_gen = osv.gcs.get_by_id_with_generation(vuln_id) - if not vuln_and_gen: - logging.warning('Vulnerability %s not found in GCS', vuln_id) - return False + with futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - vulnerability, gcs_gen = vuln_and_gen - original_vulnerability = vulnerability_pb2.Vulnerability() - original_vulnerability.CopyFrom(vulnerability) + results = list(executor.map(osv.gcs.get_by_id_with_generation, vuln_ids)) - if not any(r.type == vulnerability_pb2.Range.GIT - for affected in vulnerability.affected - for r in affected.ranges): - logging.debug( - 'Skipping Vanir signature generation for %s as it has no ' - 'GIT affected ranges.', vuln_id) - return False + for i, res in enumerate(results): + vuln_id = vuln_ids[i] + if not res: + logging.warning('Vulnerability %s not found in GCS', vuln_id) + continue - if any('vanir_signatures' in affected.database_specific.fields - for affected in vulnerability.affected): - logging.debug( - 'Skipping Vanir signature generation for %s as it already has ' - 'Vanir signatures.', vuln_id) - return False + vulnerability, gcs_gen = res - if any(affected_is_kernel(affected) for affected in vulnerability.affected): - logging.debug('Skipping %s as it is a Kernel vulnerability', vuln_id) - return False + # Filter + if not any(r.type == vulnerability_pb2.Range.GIT + for affected in vulnerability.affected + for r in affected.ranges): + logging.debug('Skipping %s: no GIT affected ranges.', vuln_id) + continue - enriched_vulnerability = _generate_vanir_signatures(vulnerability) + if any( + affected_is_kernel(affected) for affected in vulnerability.affected): + logging.debug('Skipping %s: it is a Kernel vulnerability', vuln_id) + continue - if original_vulnerability == enriched_vulnerability: - logging.debug('No changes in Vanir signatures for %s', vuln_id) - return False + vulnerabilities_to_process.append(vulnerability) + gcs_generations[vulnerability.id] = gcs_gen + + if not vulnerabilities_to_process: + return 0 + + # Batch signature generation + batch_results = _generate_vanir_signatures_batch(vulnerabilities_to_process) + + # Collect all enriched vulnerabilities + all_enriched = [] + for original_vuln in vulnerabilities_to_process: + enriched = batch_results.get(original_vuln.id, [original_vuln]) + for v in enriched: + if v != original_vuln: + all_enriched.append(v) + + if not all_enriched: + return 0 if dry_run: - logging.info('Dry run: would have updated %s', vuln_id) - if output_dir: - if not os.path.exists(output_dir): - os.makedirs(output_dir) - output_path = os.path.join(output_dir, f'{vuln_id}.json') - with open(output_path, 'w') as f: - f.write( - json_format.MessageToJson( - enriched_vulnerability, preserving_proto_field_name=True)) - logging.info('Saved enriched vulnerability to %s', output_path) - return True + for enriched_vuln in all_enriched: + logging.info('Dry run: would have updated %s', enriched_vuln.id) + return len(all_enriched) - bug = osv.Bug.get_by_id(vuln_id) - if not bug: - logging.error('Bug %s not found in Datastore', vuln_id) - return False + # Production: update Datastore and GCS + bugs_to_put = [] + uploads_to_perform = [] - bug.update_from_vulnerability(enriched_vulnerability) - bug.last_modified = osv.utcnow() - bug.put() + # Batch fetch Bugs from Datastore + bug_keys = [ndb.Key(osv.models.Bug, v.id) for v in all_enriched] + bugs = ndb.get_multi(bug_keys) - logging.info('Updated Datastore for %s, now uploading to GCS', vuln_id) - try: - osv.gcs.upload_vulnerability(enriched_vulnerability, gcs_gen) - except Exception: - logging.error('Failed to upload %s to GCS', vuln_id) - # Even if GCS upload fails, we return True as the Datastore is updated. - # Bug._post_put_hook will also attempt to upload the vulnerability. + for v, bug in zip(all_enriched, bugs): + if not bug: + logging.error('Bug %s not found in Datastore', v.id) + continue - return True + bug.update_from_vulnerability(v) + bug.last_modified = osv.utcnow() + bugs_to_put.append(bug) + + # Use gcs_generations[original_id] ONLY if it matches enriched ID. + gen = gcs_generations.get(v.id) + uploads_to_perform.append((v, gen)) + + if bugs_to_put: + ndb.put_multi(bugs_to_put) + + updated_count = 0 + for v, gen in uploads_to_perform: + try: + osv.gcs.upload_vulnerability(v, gen) + updated_count += 1 + except Exception: + logging.exception('Failed to upload vulnerability %s to GCS', v.id) + + return updated_count def main(): """Main entry point for the cron job.""" parser = argparse.ArgumentParser(description='Vanir signatures cron job.') + parser.add_argument( + '--batch-size', + type=int, + default=500, + help='Number of vulnerabilities to process in each batch.') + parser.add_argument( + '--max-workers', + type=int, + default=20, + help='Maximum number of parallel workers.') parser.add_argument( '--dry-run', action='store_true', help='Perform a dry run.') parser.add_argument( - '--output-dir', - help='Directory to save enriched vulnerabilities during dry run.') + '--hours', + type=int, + help='Number of hours back to process modified records.') args = parser.parse_args() if args.dry_run: @@ -163,35 +220,63 @@ def main(): # Capture current time to use as last_run for the next time. current_run = osv.utcnow() - if last_run_data: - last_run = last_run_data.value - logging.info('Running Vanir signature generation since %s', last_run) - query = osv.models.Vulnerability.query( - osv.models.Vulnerability.modified > last_run) + if args.hours: + last_run = current_run - datetime.timedelta(hours=args.hours) + logging.info( + 'Running Vanir signature generation for the last %d hours (since %s)', + args.hours, last_run) else: - logging.info('No last run found, querying all vulnerabilities.') - query = osv.models.Vulnerability.query() - - vuln_ids = [key.id() for key in query.fetch(keys_only=True)] - - logging.info('Found %d vulnerabilities to process', len(vuln_ids)) - - generated_count = 0 - start_time = time.time() - for vuln_id in vuln_ids: - try: - if process_vulnerability(vuln_id, args.dry_run, args.output_dir): - generated_count += 1 - except Exception: - logging.exception('Error processing vulnerability %s', vuln_id) - end_time = time.time() + last_run = last_run_data.value if last_run_data else None + if last_run: + logging.info('Running Vanir signature generation since last run: %s', + last_run) + else: + logging.info('No last run found, querying all vulnerabilities.') + + # Single global query for modified vulnerabilities + query = osv.models.Vulnerability.query() + if last_run: + query = query.filter(osv.models.Vulnerability.modified > last_run) + + total_generated_count = 0 + total_processed_count = 0 + + with futures.ThreadPoolExecutor(max_workers=args.max_workers) as executor: + + def process_with_context(batch): + with ndb.Client().context(): + return process_batch(batch, args.dry_run, args.max_workers) + + future_to_batch = {} + current_batch = [] + + logging.info('Streaming vulnerabilities for processing.') + for key in query.iter(keys_only=True): + current_batch.append(key.id()) + if len(current_batch) >= args.batch_size: + f = executor.submit(process_with_context, current_batch) + future_to_batch[f] = current_batch + current_batch = [] + + if current_batch: + f = executor.submit(process_with_context, current_batch) + future_to_batch[f] = current_batch + + if not future_to_batch: + logging.info('No modified vulnerabilities found.') + else: + logging.info('Processing %d batches of vulnerabilities.', + len(future_to_batch)) + for future in futures.as_completed(future_to_batch): + try: + generated = future.result() + total_generated_count += generated + total_processed_count += len(future_to_batch[future]) + except Exception: + logging.exception('Failed to process a batch of vulnerabilities') - total_time = end_time - start_time - avg_time = total_time / len(vuln_ids) if vuln_ids else 0 logging.info('Processed %d vulnerabilities, generated %d new signatures.', - len(vuln_ids), generated_count) - logging.info('Total processing time: %.2f seconds (Avg %.4f seconds/vuln)', - total_time, avg_time) + total_processed_count, total_generated_count) if args.dry_run: logging.info('Dry run: would have updated last_run to %s', current_run) diff --git a/gcp/workers/vanir_signatures/vanir_signatures_test.py b/gcp/workers/vanir_signatures/vanir_signatures_test.py index 43132953469..a4eefa03a61 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures_test.py +++ b/gcp/workers/vanir_signatures/vanir_signatures_test.py @@ -14,6 +14,7 @@ """Tests for vanir_signatures.""" import unittest +import datetime from unittest import mock from google.cloud import ndb @@ -36,42 +37,23 @@ def setUp(self): self.emulator.reset() @mock.patch('osv.gcs.get_by_id_with_generation') - def test_process_vulnerability_skip_existing_signatures(self, mock_get_gcs): - """Test skipping when signatures already exist.""" - vuln_id = 'OSV-2026-123' - vuln = vulnerability_pb2.Vulnerability(id=vuln_id) - affected = vuln.affected.add() - affected.database_specific['vanir_signatures'] = [] - affected.ranges.add( - type=vulnerability_pb2.Range.GIT, repo='https://example.com/repo') - - mock_get_gcs.return_value = (vuln, '123') - - with self.assertLogs(level='DEBUG') as cm: - result = vanir_signatures.process_vulnerability(vuln_id) - self.assertFalse(result) - self.assertTrue( - any('already has Vanir signatures' in log for log in cm.output)) - - @mock.patch('osv.gcs.get_by_id_with_generation') - def test_process_vulnerability_skip_no_git_ranges(self, mock_get_gcs): + def test_process_batch_skip_no_git_ranges(self, mock_get_gcs): """Test skipping when no GIT ranges are present.""" - vuln_id = 'OSV-2026-123' + vuln_id = 'VULN-1' vuln = vulnerability_pb2.Vulnerability(id=vuln_id) vuln.affected.add() mock_get_gcs.return_value = (vuln, '123') with self.assertLogs(level='DEBUG') as cm: - result = vanir_signatures.process_vulnerability(vuln_id) - self.assertFalse(result) - self.assertTrue( - any('has no GIT affected ranges' in log for log in cm.output)) + result = vanir_signatures.process_batch([vuln_id]) + self.assertEqual(result, 0) + self.assertTrue(any('no GIT affected ranges' in log for log in cm.output)) @mock.patch('osv.gcs.get_by_id_with_generation') - def test_process_vulnerability_skip_kernel(self, mock_get_gcs): + def test_process_batch_skip_kernel(self, mock_get_gcs): """Test skipping kernel vulnerabilities.""" - vuln_id = 'CVE-2023-1234' + vuln_id = 'VULN-1' vuln = vulnerability_pb2.Vulnerability(id=vuln_id) affected = vuln.affected.add() affected.package.name = 'Kernel' @@ -83,18 +65,18 @@ def test_process_vulnerability_skip_kernel(self, mock_get_gcs): mock_get_gcs.return_value = (vuln, '123') with self.assertLogs(level='DEBUG') as cm: - result = vanir_signatures.process_vulnerability(vuln_id) - self.assertFalse(result) + result = vanir_signatures.process_batch([vuln_id]) + self.assertEqual(result, 0) self.assertTrue( any('is a Kernel vulnerability' in log for log in cm.output)) @mock.patch('osv.gcs.get_by_id_with_generation') @mock.patch('osv.gcs.upload_vulnerability') - @mock.patch('vanir_signatures._generate_vanir_signatures') - def test_process_vulnerability_success(self, mock_gen_signatures, mock_upload, - mock_get_gcs): + @mock.patch('vanir_signatures._generate_vanir_signatures_batch') + def test_process_batch_success(self, mock_gen_signatures, mock_upload, + mock_get_gcs): """Test successful signature generation.""" - vuln_id = 'OSV-2026-123' + vuln_id = 'VULN-1' # Input vulnerability vuln = vulnerability_pb2.Vulnerability(id=vuln_id) @@ -110,22 +92,58 @@ def test_process_vulnerability_success(self, mock_gen_signatures, mock_upload, enriched_vuln.affected[0].database_specific['vanir_signatures'] = [{ 'id': 'sig1' }] - mock_gen_signatures.return_value = enriched_vuln + mock_gen_signatures.return_value = {vuln_id: [enriched_vuln]} # Setup Datastore Bug bug = osv.Bug(id=vuln_id, db_id=vuln_id, source='test') bug.put() - result = vanir_signatures.process_vulnerability(vuln_id) + result = vanir_signatures.process_batch([vuln_id]) - self.assertTrue(result) + self.assertEqual(result, 1) mock_upload.assert_called_once() + mock_gen_signatures.assert_called_once_with([vuln]) # Verify Datastore update updated_bug = osv.Bug.get_by_id(vuln_id) self.assertIn('vanir_signatures', updated_bug.affected_packages[0].database_specific) + @mock.patch('osv.models.Vulnerability.query') + @mock.patch('vanir_signatures.process_batch') + def test_global_batching(self, mock_process_batch, mock_vuln_query): + """Test performing global batching of all found vulnerabilities.""" + # Mock Vulnerability query with 150 items + vuln_keys = [mock.Mock() for _ in range(150)] + for i, k in enumerate(vuln_keys): + k.id.return_value = f'VULN-{i}' + + mock_query = mock.Mock() + mock_vuln_query.return_value = mock_query + mock_query.filter.return_value = mock_query + mock_query.iter.return_value = vuln_keys + + mock_process_batch.return_value = 10 + + # Run main with dry-run and batch_size=100 + with mock.patch( + 'argparse.ArgumentParser.parse_args', + return_value=mock.Mock( + dry_run=True, batch_size=100, max_workers=20, hours=None)): + vanir_signatures.main() + + # Verify process_batch was called for each chunk (BATCH_SIZE=100) + # 150 items -> 2 batches + self.assertEqual(mock_process_batch.call_count, 2) + + # First batch of 100 + expected_batch1 = [f'VULN-{i}' for i in range(100)] + mock_process_batch.assert_any_call(expected_batch1, True, 20) + + # Second batch of 50 + expected_batch2 = [f'VULN-{i}' for i in range(100, 150)] + mock_process_batch.assert_any_call(expected_batch2, True, 20) + if __name__ == '__main__': unittest.main() diff --git a/osv/gcs.py b/osv/gcs.py index 302d4e3272e..86d286c62f5 100644 --- a/osv/gcs.py +++ b/osv/gcs.py @@ -18,6 +18,7 @@ from google.cloud import exceptions from google.cloud import storage +from requests.adapters import HTTPAdapter from .vulnerability_pb2 import Vulnerability @@ -26,10 +27,44 @@ _storage_client = None +def modify_storage_client_adapters(storage_client: storage.Client, + pool_connections: int = 128, + max_retries: int = 3, + pool_block: bool = True) -> storage.Client: + """Returns a modified google.cloud.storage.Client object. + + Due to many concurrent GCS connections, the default connection pool can become + overwhelmed, introducing delays. + + Solution described in https://github.com/googleapis/python-storage/issues/253 + + These affect the urllib3.HTTPConnectionPool underpinning the storage.Client's + HTTP requests. + + Args: + storage_client: an existing google.cloud.storage.Client object + pool_connections: number of pool_connections desired + max_retries: maximum retries + pool_block: blocking behaviour when pool is exhausted + + Returns: + the google.cloud.storage.Client appropriately modified. + + """ + adapter = HTTPAdapter( + pool_connections=pool_connections, + max_retries=max_retries, + pool_block=pool_block) + # pylint: disable=protected-access + storage_client._http.mount('https://', adapter) + storage_client._http._auth_request.session.mount('https://', adapter) + return storage_client + + def _get_storage_client() -> storage.Client: global _storage_client if _storage_client is None: - _storage_client = storage.Client() + _storage_client = modify_storage_client_adapters(storage.Client()) return _storage_client From 5b186e8d3b4f14458854b069cd3180df103e298a Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Mon, 30 Mar 2026 11:48:47 +1100 Subject: [PATCH 12/15] unused import --- gcp/workers/vanir_signatures/vanir_signatures.py | 1 - gcp/workers/vanir_signatures/vanir_signatures_test.py | 1 - osv/tests.py | 1 - 3 files changed, 3 deletions(-) diff --git a/gcp/workers/vanir_signatures/vanir_signatures.py b/gcp/workers/vanir_signatures/vanir_signatures.py index 6fff836b904..7fcc2717a39 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures.py +++ b/gcp/workers/vanir_signatures/vanir_signatures.py @@ -18,7 +18,6 @@ import datetime import json import logging -import os from concurrent import futures from google.cloud import ndb diff --git a/gcp/workers/vanir_signatures/vanir_signatures_test.py b/gcp/workers/vanir_signatures/vanir_signatures_test.py index a4eefa03a61..3b58e2b1adf 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures_test.py +++ b/gcp/workers/vanir_signatures/vanir_signatures_test.py @@ -14,7 +14,6 @@ """Tests for vanir_signatures.""" import unittest -import datetime from unittest import mock from google.cloud import ndb diff --git a/osv/tests.py b/osv/tests.py index 4a0087edd1a..752b4271e20 100644 --- a/osv/tests.py +++ b/osv/tests.py @@ -20,7 +20,6 @@ import socket import pprint import signal -import socket import time from proto import datetime_helpers From 9b830a9806e35d6982f3047ccf5331d78f1454d4 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Mon, 30 Mar 2026 14:25:54 +1100 Subject: [PATCH 13/15] cron last successful time --- deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml b/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml index 80bfe7144db..a030d4a94e7 100644 --- a/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml +++ b/deployment/clouddeploy/gke-workers/base/vanir-signatures.yaml @@ -3,7 +3,7 @@ kind: CronJob metadata: name: vanir-signatures labels: - cronLastSuccessfulTimeMins: "60" + cronLastSuccessfulTimeMins: "1500" spec: schedule: "0 9 * * *" timeZone: "Australia/Sydney" From b09405223097c5747ac6d39c96fba1c7dd5d1550 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Mon, 30 Mar 2026 15:17:13 +1100 Subject: [PATCH 14/15] no bug but vulnerability --- .../vanir_signatures/vanir_signatures.py | 37 +++++++++---------- .../vanir_signatures/vanir_signatures_test.py | 28 +++++++++++--- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/gcp/workers/vanir_signatures/vanir_signatures.py b/gcp/workers/vanir_signatures/vanir_signatures.py index 7fcc2717a39..8cbdb1820d4 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures.py +++ b/gcp/workers/vanir_signatures/vanir_signatures.py @@ -129,6 +129,10 @@ def process_batch(vuln_ids: list[str], logging.debug('Skipping %s: it is a Kernel vulnerability', vuln_id) continue + if vulnerability.HasField('withdrawn'): + logging.debug('Skipping %s: it is withdrawn', vuln_id) + continue + vulnerabilities_to_process.append(vulnerability) gcs_generations[vulnerability.id] = gcs_gen @@ -154,32 +158,27 @@ def process_batch(vuln_ids: list[str], logging.info('Dry run: would have updated %s', enriched_vuln.id) return len(all_enriched) - # Production: update Datastore and GCS - bugs_to_put = [] - uploads_to_perform = [] + # Update Datastore and GCS + updated_count = 0 - # Batch fetch Bugs from Datastore - bug_keys = [ndb.Key(osv.models.Bug, v.id) for v in all_enriched] - bugs = ndb.get_multi(bug_keys) + # Batch fetch Vulnerabilities from Datastore + vuln_keys = [ndb.Key(osv.models.Vulnerability, v.id) for v in all_enriched] + vulns = ndb.get_multi(vuln_keys) - for v, bug in zip(all_enriched, bugs): - if not bug: - logging.error('Bug %s not found in Datastore', v.id) + for v, vuln in zip(all_enriched, vulns): + if not vuln: + logging.error('Vulnerability %s not found in Datastore', v.id) continue - bug.update_from_vulnerability(v) - bug.last_modified = osv.utcnow() - bugs_to_put.append(bug) + now = osv.utcnow() + v.modified.FromDatetime(now) + vuln.modified = now + + # Update Vulnerability entity in Datastore + vuln.put() # Use gcs_generations[original_id] ONLY if it matches enriched ID. gen = gcs_generations.get(v.id) - uploads_to_perform.append((v, gen)) - - if bugs_to_put: - ndb.put_multi(bugs_to_put) - - updated_count = 0 - for v, gen in uploads_to_perform: try: osv.gcs.upload_vulnerability(v, gen) updated_count += 1 diff --git a/gcp/workers/vanir_signatures/vanir_signatures_test.py b/gcp/workers/vanir_signatures/vanir_signatures_test.py index 3b58e2b1adf..b95dd69ba4e 100644 --- a/gcp/workers/vanir_signatures/vanir_signatures_test.py +++ b/gcp/workers/vanir_signatures/vanir_signatures_test.py @@ -69,6 +69,23 @@ def test_process_batch_skip_kernel(self, mock_get_gcs): self.assertTrue( any('is a Kernel vulnerability' in log for log in cm.output)) + @mock.patch('osv.gcs.get_by_id_with_generation') + def test_process_batch_skip_withdrawn(self, mock_get_gcs): + """Test skipping withdrawn vulnerabilities.""" + vuln_id = 'VULN-1' + vuln = vulnerability_pb2.Vulnerability(id=vuln_id) + vuln.withdrawn.FromSeconds(1234567890) + affected = vuln.affected.add() + affected.ranges.add( + type=vulnerability_pb2.Range.GIT, repo='https://example.com/repo') + + mock_get_gcs.return_value = (vuln, '123') + + with self.assertLogs(level='DEBUG') as cm: + result = vanir_signatures.process_batch([vuln_id]) + self.assertEqual(result, 0) + self.assertTrue(any('it is withdrawn' in log for log in cm.output)) + @mock.patch('osv.gcs.get_by_id_with_generation') @mock.patch('osv.gcs.upload_vulnerability') @mock.patch('vanir_signatures._generate_vanir_signatures_batch') @@ -93,9 +110,9 @@ def test_process_batch_success(self, mock_gen_signatures, mock_upload, }] mock_gen_signatures.return_value = {vuln_id: [enriched_vuln]} - # Setup Datastore Bug - bug = osv.Bug(id=vuln_id, db_id=vuln_id, source='test') - bug.put() + # Setup Datastore Vulnerability + vuln_entity = osv.Vulnerability(id=vuln_id) + vuln_entity.put() result = vanir_signatures.process_batch([vuln_id]) @@ -104,9 +121,8 @@ def test_process_batch_success(self, mock_gen_signatures, mock_upload, mock_gen_signatures.assert_called_once_with([vuln]) # Verify Datastore update - updated_bug = osv.Bug.get_by_id(vuln_id) - self.assertIn('vanir_signatures', - updated_bug.affected_packages[0].database_specific) + updated_vuln = osv.Vulnerability.get_by_id(vuln_id) + self.assertIsNotNone(updated_vuln.modified) @mock.patch('osv.models.Vulnerability.query') @mock.patch('vanir_signatures.process_batch') From fd7ac8a53a17041daf498f7ce0caec9010349e17 Mon Sep 17 00:00:00 2001 From: Xueqin Cui Date: Mon, 30 Mar 2026 16:20:59 +1100 Subject: [PATCH 15/15] gcs --- osv/gcs.py | 36 +----------------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/osv/gcs.py b/osv/gcs.py index 86d286c62f5..f8dfa98e228 100644 --- a/osv/gcs.py +++ b/osv/gcs.py @@ -27,44 +27,10 @@ _storage_client = None -def modify_storage_client_adapters(storage_client: storage.Client, - pool_connections: int = 128, - max_retries: int = 3, - pool_block: bool = True) -> storage.Client: - """Returns a modified google.cloud.storage.Client object. - - Due to many concurrent GCS connections, the default connection pool can become - overwhelmed, introducing delays. - - Solution described in https://github.com/googleapis/python-storage/issues/253 - - These affect the urllib3.HTTPConnectionPool underpinning the storage.Client's - HTTP requests. - - Args: - storage_client: an existing google.cloud.storage.Client object - pool_connections: number of pool_connections desired - max_retries: maximum retries - pool_block: blocking behaviour when pool is exhausted - - Returns: - the google.cloud.storage.Client appropriately modified. - - """ - adapter = HTTPAdapter( - pool_connections=pool_connections, - max_retries=max_retries, - pool_block=pool_block) - # pylint: disable=protected-access - storage_client._http.mount('https://', adapter) - storage_client._http._auth_request.session.mount('https://', adapter) - return storage_client - - def _get_storage_client() -> storage.Client: global _storage_client if _storage_client is None: - _storage_client = modify_storage_client_adapters(storage.Client()) + _storage_client = storage.Client() return _storage_client