Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import logging

from django.core.management.base import BaseCommand

from osf.models import CollectionSubmission

# Module-level logger, named after this module's import path; used for the
# progress and error messages emitted by the sync function below.
logger = logging.getLogger(__name__)


def copy_collection_submission_metadata_to_cedar(dry_run=False, batch_size=100, provider_id=None):
    """Sync collection submissions' custom metadata into CEDAR metadata records.

    Selects every ``CollectionSubmission`` whose collection's provider has a
    ``required_metadata_template`` set and calls ``sync_cedar_metadata()`` on
    each of them. A failure on one submission is logged (with traceback) and
    counted; the remaining submissions are still processed.

    Args:
        dry_run: When true, only log which submissions would be synced and
            make no call to ``sync_cedar_metadata()``.
        batch_size: Passed as ``chunk_size`` to ``QuerySet.iterator()``.
        provider_id: Optional collection provider ``_id``. When given, only
            submissions belonging to that provider's collections are handled.

    Returns:
        None. Progress and the final summary are reported through ``logger``.
    """
    # Computed once; the same prefix is used by the first and last log lines.
    prefix = '[DRY RUN] ' if dry_run else ''

    qs = CollectionSubmission.objects.filter(
        collection__provider__required_metadata_template__isnull=False,
    ).select_related(
        'guid',
        'collection__provider__required_metadata_template',
    )

    if provider_id:
        qs = qs.filter(collection__provider___id=provider_id)

    total = qs.count()
    logger.info(f'{prefix}Found {total} collection submissions to process')

    processed = errors = 0
    for submission in qs.iterator(chunk_size=batch_size):
        if dry_run:
            logger.info(f'[DRY RUN] Would sync cedar metadata for submission {submission._id}')
            continue
        try:
            submission.sync_cedar_metadata()
        except Exception as e:
            # Deliberately broad: one bad submission must not abort the whole
            # run. logger.exception keeps the traceback so the failure can
            # actually be diagnosed afterwards.
            logger.exception(f'Failed to sync cedar metadata for submission {submission._id}: {e}')
            errors += 1
        else:
            processed += 1

    error_suffix = f', {errors} error(s)' if errors else ''
    logger.info(f'{prefix}Done. Processed {processed}/{total} submissions{error_suffix}')


class Command(BaseCommand):
    """Management-command entry point for the CEDAR metadata sync.

    Parses ``--dry-run``, ``--batch-size`` and ``--provider`` and forwards
    them to ``copy_collection_submission_metadata_to_cedar``.
    """

    help = 'Copy CollectionSubmission custom metadata fields to CedarMetadataRecord for providers with a required cedar template.'

    def add_arguments(self, parser):
        """Register the command-line options on ``parser``."""
        super().add_arguments(parser)

        # Flag: report what would happen without syncing anything.
        parser.add_argument(
            '--dry-run',
            dest='dry_run',
            action='store_true',
            help='Preview what would be synced without making any changes',
        )

        # Integer chunk size used while iterating over the submissions.
        parser.add_argument(
            '--batch-size',
            dest='batch_size',
            type=int,
            default=100,
            help='Number of submissions to process per iteration (default: 100)',
        )

        # Optional restriction to one collection provider.
        parser.add_argument(
            '--provider',
            dest='provider_id',
            type=str,
            help='Optional collection provider _id to limit processing to a single provider',
        )

    def handle(self, *args, **options):
        """Run the sync using the parsed command-line options."""
        sync_kwargs = {
            'dry_run': options['dry_run'],
            'batch_size': options['batch_size'],
            # .get(): the option may be absent when no provider is supplied.
            'provider_id': options.get('provider_id'),
        }
        copy_collection_submission_metadata_to_cedar(**sync_kwargs)
18 changes: 18 additions & 0 deletions osf/models/collection_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Names of the CollectionSubmission attributes that sync_cedar_metadata()
# reads with getattr() and copies into a CedarMetadataRecord's ``metadata``
# dict. Attributes whose value is falsy (e.g. an empty string) are left out.
CEDAR_METADATA_FIELDS = [
    'collected_type', 'status', 'volume', 'issue',
    'program_area', 'school_type', 'study_design',
    'data_type', 'disease', 'grade_levels',
]

class CollectionSubmission(TaxonomizableMixin, BaseModel):
primary_identifier_name = 'guid___id'
Expand Down Expand Up @@ -475,6 +480,19 @@ def remove_from_index(self):
logger.exception(e)
sentry.log_exception(e)

def sync_cedar_metadata(self):
    """Mirror this submission's custom metadata into a CedarMetadataRecord.

    Does nothing when the collection has no provider or the provider has no
    ``required_metadata_template``. Otherwise the record identified by
    (this submission's guid, the provider's template) is fetched or created,
    its ``metadata`` is replaced by the non-empty ``CEDAR_METADATA_FIELDS``
    values of this submission, and it is saved as published.
    """
    # Imported inside the method rather than at module level
    # (presumably to avoid a circular import -- confirm).
    from osf.models import CedarMetadataRecord

    collection = self.collection
    if not collection.provider_id:
        return
    required_template = collection.provider.required_metadata_template
    if not required_template:
        return

    # Keep only the fields that carry a truthy value.
    collected = {}
    for field_name in CEDAR_METADATA_FIELDS:
        value = getattr(self, field_name, '')
        if value:
            collected[field_name] = value

    cedar_record = CedarMetadataRecord.objects.get_or_create(
        guid=self.guid,
        template=required_template,
    )[0]
    cedar_record.metadata = collected
    cedar_record.is_published = True
    cedar_record.save()

def save(self, *args, **kwargs):
ret = super().save(*args, **kwargs)
self.update_search()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
import pytest
from faker import Faker
from unittest import mock

from django.core.management import call_command

from osf.models import CollectionSubmission, CedarMetadataRecord, CedarMetadataTemplate
from osf.management.commands.copy_collection_submission_metadata_to_cedar import (
copy_collection_submission_metadata_to_cedar,
)
from osf_tests.factories import (
CollectionFactory,
CollectionProviderFactory,
NodeFactory,
)
from tests.utils import capture_notifications

# Shared Faker instance; make_cedar_template() uses it to generate the
# template's schema name and cedar id.
fake = Faker()


def make_cedar_template():
    """Create and return an active CedarMetadataTemplate with fake identifiers."""
    # Generated in this order so the Faker call sequence is unchanged.
    generated_name = fake.bs()
    generated_cedar_id = fake.md5()
    template_kwargs = {
        'schema_name': generated_name,
        'cedar_id': generated_cedar_id,
        'template_version': 1,
        'template': {},
        'active': True,
    }
    return CedarMetadataTemplate.objects.create(**template_kwargs)


def make_collection(provider):
    """Build a collection with the factory, attach ``provider`` to it and save it."""
    new_collection = CollectionFactory()
    # The provider is assigned after the factory call, then persisted.
    new_collection.provider = provider
    new_collection.save()
    return new_collection


def make_submission(collection, **fields):
    """Create and save a CollectionSubmission for a fresh public node.

    ``fields`` are forwarded as extra keyword arguments to the
    ``CollectionSubmission`` constructor (e.g. ``status='active'``).
    """
    project = NodeFactory(is_public=True)
    project_guid = project.guids.first()
    new_submission = CollectionSubmission(
        guid=project_guid,
        collection=collection,
        creator=project.creator,
        **fields,
    )
    # Saved inside capture_notifications(); presumably this intercepts any
    # notifications triggered by the save -- confirm against tests.utils.
    with capture_notifications():
        new_submission.save()
    return new_submission


@pytest.fixture()
def cedar_template():
    """Provide a freshly created CEDAR metadata template."""
    template = make_cedar_template()
    return template


@pytest.fixture()
def provider_with_template(cedar_template):
    """Provide a collection provider whose required template is ``cedar_template``."""
    templated_provider = CollectionProviderFactory()
    # Attach the template after creation and persist the change.
    templated_provider.required_metadata_template = cedar_template
    templated_provider.save()
    return templated_provider


@pytest.fixture()
def provider_without_template():
    """Provide a collection provider straight from the factory, with no template assigned here."""
    plain_provider = CollectionProviderFactory()
    return plain_provider


@pytest.mark.django_db
class TestCopyCollectionSubmissionMetadataToCedar:
    """Tests for the sync function and its management-command interface."""

    def test_creates_record_for_submission_with_template(self, provider_with_template, cedar_template):
        """A submission under a templated provider gets a record for that template."""
        target_collection = make_collection(provider_with_template)
        submission = make_submission(target_collection, collected_type='software', status='active')

        copy_collection_submission_metadata_to_cedar()

        matching_records = CedarMetadataRecord.objects.filter(
            guid=submission.guid,
            template=cedar_template,
        )
        assert matching_records.exists()

    def test_record_contains_non_empty_fields_only(self, provider_with_template, cedar_template):
        """Empty-string fields are left out of the record's metadata."""
        target_collection = make_collection(provider_with_template)
        submission = make_submission(target_collection, collected_type='dataset', status='', volume='')

        copy_collection_submission_metadata_to_cedar()

        synced = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template)
        assert synced.metadata == {'collected_type': 'dataset'}

    def test_record_is_published(self, provider_with_template, cedar_template):
        """The synced record is marked as published."""
        target_collection = make_collection(provider_with_template)
        submission = make_submission(target_collection, status='active')

        copy_collection_submission_metadata_to_cedar()

        synced = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template)
        assert synced.is_published is True

    def test_updates_existing_record(self, provider_with_template, cedar_template):
        """A pre-existing record is overwritten and published rather than duplicated."""
        target_collection = make_collection(provider_with_template)
        submission = make_submission(target_collection, status='new')
        # Seed a stale, unpublished record for the same guid/template pair.
        CedarMetadataRecord.objects.create(
            guid=submission.guid,
            template=cedar_template,
            metadata={'status': 'old'},
            is_published=False,
        )

        copy_collection_submission_metadata_to_cedar()

        synced = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template)
        assert synced.metadata == {'status': 'new'}
        assert synced.is_published is True

    def test_skips_submissions_without_required_template(self, provider_without_template):
        """No record is created when the provider has no required template."""
        target_collection = make_collection(provider_without_template)
        submission = make_submission(target_collection, collected_type='software')

        copy_collection_submission_metadata_to_cedar()

        records_for_guid = CedarMetadataRecord.objects.filter(guid=submission.guid)
        assert not records_for_guid.exists()

    def test_dry_run_makes_no_changes(self, provider_with_template, cedar_template):
        """With dry_run=True nothing is written."""
        target_collection = make_collection(provider_with_template)
        submission = make_submission(target_collection, collected_type='software')

        copy_collection_submission_metadata_to_cedar(dry_run=True)

        records_for_guid = CedarMetadataRecord.objects.filter(guid=submission.guid)
        assert not records_for_guid.exists()

    def test_provider_filter_processes_only_matching_provider(self, cedar_template):
        """Passing provider_id limits the sync to that provider's submissions."""
        provider_a = CollectionProviderFactory()
        provider_a.required_metadata_template = cedar_template
        provider_a.save()

        provider_b = CollectionProviderFactory()
        provider_b.required_metadata_template = make_cedar_template()
        provider_b.save()

        collection_a = make_collection(provider_a)
        sub_a = make_submission(collection_a, collected_type='software')
        collection_b = make_collection(provider_b)
        sub_b = make_submission(collection_b, collected_type='dataset')

        copy_collection_submission_metadata_to_cedar(provider_id=provider_a._id)

        records_a = CedarMetadataRecord.objects.filter(guid=sub_a.guid, template=cedar_template)
        records_b = CedarMetadataRecord.objects.filter(guid=sub_b.guid)
        assert records_a.exists()
        assert not records_b.exists()

    def test_error_on_one_does_not_stop_others(self, provider_with_template, cedar_template):
        """An exception from one submission does not prevent syncing the next."""
        target_collection = make_collection(provider_with_template)
        make_submission(target_collection, collected_type='software')
        make_submission(target_collection, collected_type='dataset')

        seen_submissions = []
        real_sync = CollectionSubmission.sync_cedar_metadata

        def flaky_sync(self):
            # Fail on the first call only; delegate to the real method afterwards.
            seen_submissions.append(self)
            if len(seen_submissions) == 1:
                raise Exception('simulated error')
            real_sync(self)

        with mock.patch.object(CollectionSubmission, 'sync_cedar_metadata', flaky_sync):
            copy_collection_submission_metadata_to_cedar()

        assert len(seen_submissions) == 2
        assert CedarMetadataRecord.objects.filter(template=cedar_template).count() == 1

    def test_call_command_interface(self, provider_with_template, cedar_template):
        """The management command performs the sync when invoked by name."""
        target_collection = make_collection(provider_with_template)
        submission = make_submission(target_collection, collected_type='software')

        call_command('copy_collection_submission_metadata_to_cedar')

        matching_records = CedarMetadataRecord.objects.filter(
            guid=submission.guid,
            template=cedar_template,
        )
        assert matching_records.exists()

    def test_call_command_dry_run(self, provider_with_template, cedar_template):
        """The command's --dry-run flag writes nothing."""
        target_collection = make_collection(provider_with_template)
        submission = make_submission(target_collection, collected_type='software')

        call_command('copy_collection_submission_metadata_to_cedar', '--dry-run')

        records_for_guid = CedarMetadataRecord.objects.filter(guid=submission.guid)
        assert not records_for_guid.exists()

    def test_all_cedar_fields_copied(self, provider_with_template, cedar_template):
        """Every populated custom field ends up in the record's metadata."""
        # The same mapping is used to build the submission and as the
        # expected metadata afterwards.
        field_values = {
            'collected_type': 'software',
            'status': 'active',
            'volume': '1',
            'issue': '2',
            'program_area': 'health',
            'school_type': 'university',
            'study_design': 'rct',
            'data_type': 'quantitative',
            'disease': 'cancer',
            'grade_levels': 'K-12',
        }
        target_collection = make_collection(provider_with_template)
        submission = make_submission(target_collection, **field_values)

        copy_collection_submission_metadata_to_cedar()

        synced = CedarMetadataRecord.objects.get(guid=submission.guid, template=cedar_template)
        assert synced.metadata == field_values