# ---------------------------------------------------------------------------
# File: osf/management/commands/copy_collection_submission_metadata_to_cedar.py
# ---------------------------------------------------------------------------
import logging

from django.core.management.base import BaseCommand

from osf.models import CollectionSubmission

logger = logging.getLogger(__name__)


def copy_collection_submission_metadata_to_cedar(dry_run=False, batch_size=100, provider_id=None):
    """Sync CEDAR metadata records for collection submissions.

    Selects every ``CollectionSubmission`` whose collection's provider has a
    ``required_metadata_template`` set and calls ``sync_cedar_metadata()`` on
    each of them. A failure on one submission is logged (with traceback) and
    counted; the loop then continues with the next submission.

    Args:
        dry_run: When true, only log which submissions would be synced;
            ``sync_cedar_metadata()`` is never called and the final tally
            therefore reports 0 processed.
        batch_size: Passed as ``chunk_size`` to ``QuerySet.iterator()``.
        provider_id: Optional provider ``_id``. When given, only submissions
            whose collection belongs to that provider are considered.
    """
    log_prefix = '[DRY RUN] ' if dry_run else ''

    qs = CollectionSubmission.objects.filter(
        collection__provider__required_metadata_template__isnull=False,
    ).select_related(
        'guid',
        'collection__provider__required_metadata_template',
    )
    if provider_id:
        qs = qs.filter(collection__provider___id=provider_id)

    total = qs.count()
    logger.info(f'{log_prefix}Found {total} collection submissions to process')

    processed = errors = 0
    for submission in qs.iterator(chunk_size=batch_size):
        if dry_run:
            logger.info(f'[DRY RUN] Would sync cedar metadata for submission {submission._id}')
            continue
        try:
            submission.sync_cedar_metadata()
        except Exception as e:
            # Deliberately broad: one bad submission must not abort the whole
            # batch. logger.exception keeps the traceback, which a plain
            # logger.error would drop.
            logger.exception(f'Failed to sync cedar metadata for submission {submission._id}: {e}')
            errors += 1
        else:
            processed += 1

    error_note = f', {errors} error(s)' if errors else ''
    logger.info(f'{log_prefix}Done. Processed {processed}/{total} submissions{error_note}')


class Command(BaseCommand):
    """Management-command wrapper around ``copy_collection_submission_metadata_to_cedar``."""

    help = 'Copy CollectionSubmission custom metadata fields to CedarMetadataRecord for providers with a required cedar template.'

    def add_arguments(self, parser):
        """Register the ``--dry-run``, ``--batch-size`` and ``--provider`` options."""
        super().add_arguments(parser)
        parser.add_argument(
            '--dry-run',
            action='store_true',
            dest='dry_run',
            help='Preview what would be synced without making any changes',
        )
        parser.add_argument(
            '--batch-size',
            type=int,
            default=100,
            dest='batch_size',
            help='Number of submissions to process per iteration (default: 100)',
        )
        parser.add_argument(
            '--provider',
            type=str,
            dest='provider_id',
            help='Optional collection provider _id to limit processing to a single provider',
        )

    def handle(self, *args, **options):
        """Forward the parsed options to the module-level sync function."""
        copy_collection_submission_metadata_to_cedar(
            dry_run=options['dry_run'],
            batch_size=options['batch_size'],
            provider_id=options.get('provider_id'),
        )


# ---------------------------------------------------------------------------
# File: osf/models/collection_submission.py (additions from the patch only;
# the rest of the module and of the class is unchanged and not shown here)
# ---------------------------------------------------------------------------

# Submission attributes that sync_cedar_metadata() copies into the record's
# metadata dict (empty values are skipped).
CEDAR_METADATA_FIELDS = [
    'collected_type', 'status', 'volume', 'issue',
    'program_area', 'school_type', 'study_design',
    'data_type', 'disease', 'grade_levels',
]


class CollectionSubmission(TaxonomizableMixin, BaseModel):
    primary_identifier_name = 'guid___id'

    # ... existing members unchanged (elided by the patch) ...

    def sync_cedar_metadata(self):
        """Create or update a CedarMetadataRecord from this submission's custom metadata fields.

        Does nothing when the collection has no provider or the provider has
        no ``required_metadata_template``. Otherwise the record identified by
        (``self.guid``, template) has its ``metadata`` replaced by the
        non-empty ``CEDAR_METADATA_FIELDS`` values of this submission and is
        marked as published.
        """
        # Function-scope import kept from the original patch
        # (presumably avoids a circular import — confirm).
        from osf.models import CedarMetadataRecord

        collection = self.collection
        if not collection.provider_id:
            return
        template = collection.provider.required_metadata_template
        if not template:
            return

        # Read each attribute once and keep only truthy values.
        metadata = {}
        for field_name in CEDAR_METADATA_FIELDS:
            value = getattr(self, field_name, '')
            if value:
                metadata[field_name] = value

        # Single call instead of get_or_create + save(): a newly created row
        # is written once with its final values, so a failure cannot leave an
        # empty, unpublished record behind.
        # NOTE(review): metadata is stored as a flat {field: value} dict —
        # confirm this is the shape the required template expects.
        CedarMetadataRecord.objects.update_or_create(
            guid=self.guid,
            template=template,
            defaults={'metadata': metadata, 'is_published': True},
        )
# ---------------------------------------------------------------------------
# File: osf_tests/management_commands/test_copy_collection_submission_metadata_to_cedar.py
# ---------------------------------------------------------------------------
import pytest
from faker import Faker
from unittest import mock

from django.core.management import call_command

from osf.models import CollectionSubmission, CedarMetadataRecord, CedarMetadataTemplate
from osf.management.commands.copy_collection_submission_metadata_to_cedar import (
    copy_collection_submission_metadata_to_cedar,
)
from osf_tests.factories import (
    CollectionFactory,
    CollectionProviderFactory,
    NodeFactory,
)
from tests.utils import capture_notifications

fake = Faker()


def make_cedar_template():
    """Create and return an active CedarMetadataTemplate with fake identifiers."""
    template_kwargs = {
        'schema_name': fake.bs(),
        'cedar_id': fake.md5(),
        'template_version': 1,
        'template': {},
        'active': True,
    }
    return CedarMetadataTemplate.objects.create(**template_kwargs)


def make_collection(provider):
    """Create a collection and attach it to ``provider``."""
    new_collection = CollectionFactory()
    new_collection.provider = provider
    new_collection.save()
    return new_collection


def make_submission(collection, **fields):
    """Save a CollectionSubmission for a fresh public node in ``collection``."""
    node = NodeFactory(is_public=True)
    new_submission = CollectionSubmission(
        guid=node.guids.first(),
        collection=collection,
        creator=node.creator,
        **fields,
    )
    # Saving is done inside capture_notifications(), as in every test here.
    with capture_notifications():
        new_submission.save()
    return new_submission


def _records(**lookup):
    """Return the CedarMetadataRecord queryset filtered by ``lookup``."""
    return CedarMetadataRecord.objects.filter(**lookup)


@pytest.fixture()
def cedar_template():
    return make_cedar_template()


@pytest.fixture()
def provider_with_template(cedar_template):
    template_provider = CollectionProviderFactory()
    template_provider.required_metadata_template = cedar_template
    template_provider.save()
    return template_provider


@pytest.fixture()
def provider_without_template():
    return CollectionProviderFactory()


@pytest.mark.django_db
class TestCopyCollectionSubmissionMetadataToCedar:
    """Tests for the copy_collection_submission_metadata_to_cedar command."""

    def test_creates_record_for_submission_with_template(self, provider_with_template, cedar_template):
        """A record is created for a submission whose provider has a template."""
        target = make_submission(
            make_collection(provider_with_template),
            collected_type='software',
            status='active',
        )

        copy_collection_submission_metadata_to_cedar()

        assert _records(guid=target.guid, template=cedar_template).exists()

    def test_record_contains_non_empty_fields_only(self, provider_with_template, cedar_template):
        """Empty-string fields are left out of the stored metadata."""
        target = make_submission(
            make_collection(provider_with_template),
            collected_type='dataset',
            status='',
            volume='',
        )

        copy_collection_submission_metadata_to_cedar()

        stored = CedarMetadataRecord.objects.get(guid=target.guid, template=cedar_template)
        assert stored.metadata == {'collected_type': 'dataset'}

    def test_record_is_published(self, provider_with_template, cedar_template):
        """Synced records are flagged as published."""
        target = make_submission(make_collection(provider_with_template), status='active')

        copy_collection_submission_metadata_to_cedar()

        stored = CedarMetadataRecord.objects.get(guid=target.guid, template=cedar_template)
        assert stored.is_published is True

    def test_updates_existing_record(self, provider_with_template, cedar_template):
        """A pre-existing record is overwritten and published."""
        target = make_submission(make_collection(provider_with_template), status='new')
        CedarMetadataRecord.objects.create(
            guid=target.guid,
            template=cedar_template,
            metadata={'status': 'old'},
            is_published=False,
        )

        copy_collection_submission_metadata_to_cedar()

        stored = CedarMetadataRecord.objects.get(guid=target.guid, template=cedar_template)
        assert stored.metadata == {'status': 'new'}
        assert stored.is_published is True

    def test_skips_submissions_without_required_template(self, provider_without_template):
        """No record is written when the provider has no required template."""
        target = make_submission(
            make_collection(provider_without_template),
            collected_type='software',
        )

        copy_collection_submission_metadata_to_cedar()

        assert not _records(guid=target.guid).exists()

    def test_dry_run_makes_no_changes(self, provider_with_template, cedar_template):
        """dry_run=True leaves the database untouched."""
        target = make_submission(
            make_collection(provider_with_template),
            collected_type='software',
        )

        copy_collection_submission_metadata_to_cedar(dry_run=True)

        assert not _records(guid=target.guid).exists()

    def test_provider_filter_processes_only_matching_provider(self, cedar_template):
        """provider_id restricts the sync to that provider's submissions."""
        provider_a = CollectionProviderFactory()
        provider_a.required_metadata_template = cedar_template
        provider_a.save()

        provider_b = CollectionProviderFactory()
        provider_b.required_metadata_template = make_cedar_template()
        provider_b.save()

        sub_a = make_submission(make_collection(provider_a), collected_type='software')
        sub_b = make_submission(make_collection(provider_b), collected_type='dataset')

        copy_collection_submission_metadata_to_cedar(provider_id=provider_a._id)

        assert _records(guid=sub_a.guid, template=cedar_template).exists()
        assert not _records(guid=sub_b.guid).exists()

    def test_error_on_one_does_not_stop_others(self, provider_with_template, cedar_template):
        """A failure on the first submission does not prevent the second sync."""
        shared_collection = make_collection(provider_with_template)
        make_submission(shared_collection, collected_type='software')
        make_submission(shared_collection, collected_type='dataset')

        attempts = []
        real_sync = CollectionSubmission.sync_cedar_metadata

        def flaky_sync(self):
            # Fail only on the very first invocation, then defer to the real method.
            attempts.append(None)
            if len(attempts) == 1:
                raise Exception('simulated error')
            real_sync(self)

        with mock.patch.object(CollectionSubmission, 'sync_cedar_metadata', flaky_sync):
            copy_collection_submission_metadata_to_cedar()

        assert len(attempts) == 2
        assert _records(template=cedar_template).count() == 1

    def test_call_command_interface(self, provider_with_template, cedar_template):
        """The sync also works when invoked through call_command."""
        target = make_submission(
            make_collection(provider_with_template),
            collected_type='software',
        )

        call_command('copy_collection_submission_metadata_to_cedar')

        assert _records(guid=target.guid, template=cedar_template).exists()

    def test_call_command_dry_run(self, provider_with_template, cedar_template):
        """--dry-run on the command line writes nothing."""
        target = make_submission(
            make_collection(provider_with_template),
            collected_type='software',
        )

        call_command('copy_collection_submission_metadata_to_cedar', '--dry-run')

        assert not _records(guid=target.guid).exists()

    def test_all_cedar_fields_copied(self, provider_with_template, cedar_template):
        """Every populated cedar field ends up in the record's metadata."""
        field_values = {
            'collected_type': 'software',
            'status': 'active',
            'volume': '1',
            'issue': '2',
            'program_area': 'health',
            'school_type': 'university',
            'study_design': 'rct',
            'data_type': 'quantitative',
            'disease': 'cancer',
            'grade_levels': 'K-12',
        }
        target = make_submission(make_collection(provider_with_template), **field_values)

        copy_collection_submission_metadata_to_cedar()

        stored = CedarMetadataRecord.objects.get(guid=target.guid, template=cedar_template)
        assert stored.metadata == field_values