From 3a6c26a89576b1886e4fa33f1ccc7c1199119db6 Mon Sep 17 00:00:00 2001 From: Karolina Przerwa Date: Tue, 9 Dec 2025 16:46:00 +0100 Subject: [PATCH 1/2] add(identifiers): add HAL identifiers * fix(identifiers): set relation type to variant form --- .../rdm/records/transform/config.py | 4 +- .../transform/xml_processing/rules/base.py | 39 +++++++++++-------- .../xml_processing/rules/publications.py | 4 +- tests/cds-rdm/conftest.py | 10 +++++ tests/cds-rdm/test_thesis_migration.py | 10 ++--- 5 files changed, 42 insertions(+), 25 deletions(-) diff --git a/cds_migrator_kit/rdm/records/transform/config.py b/cds_migrator_kit/rdm/records/transform/config.py index a056eb81..8597228a 100644 --- a/cds_migrator_kit/rdm/records/transform/config.py +++ b/cds_migrator_kit/rdm/records/transform/config.py @@ -8,7 +8,7 @@ """CDS-RDM transform config module.""" # filters out PIDs which we don't migrate -PIDS_SCHEMES_TO_DROP = ["HAL"] +PIDS_SCHEMES_TO_DROP = [] # validates allowed schemes PIDS_SCHEMES_ALLOWED = ["DOI"] @@ -16,6 +16,7 @@ PID_SCHEMES_TO_STORE_IN_IDENTIFIERS = [ "ARXIV", "HDL", + "HAL", "HANDLE", "URN", "INIS", @@ -25,7 +26,6 @@ IDENTIFIERS_SCHEMES_TO_DROP = [ "SPIRES", - "HAL", "OSTI", "SLAC", "PROQUEST", diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py index 281339c2..7f67fffe 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py @@ -350,19 +350,20 @@ def identifiers(self, key, value): https://github.com/CERNDocumentServer/cds-migrator-kit/issues/21 """ id_value = StringValue(value.get("a", "")).parse() - scheme = StringValue(value.get("9", "")).parse() + original_scheme = StringValue(value.get("9", "")).parse() + scheme = original_scheme.lower() related_works = self.get("related_identifiers", []) - if scheme.upper() in IDENTIFIERS_SCHEMES_TO_DROP: + if 
original_scheme.upper() in IDENTIFIERS_SCHEMES_TO_DROP: raise IgnoreKey("identifiers") # drop oai harvest info if id_value.startswith("oai:inspirehep.net"): raise IgnoreKey("identifiers") - if scheme.lower() == "arxiv": + if scheme == "arxiv": id_value = id_value.replace("oai:arXiv.org:", "arXiv:") - if scheme.lower() == "cern annual report": + if scheme == "cern annual report": additional_descriptions = self.get("additional_descriptions", []) new_desc = { - "description": f"{scheme} {id_value}", + "description": f"{original_scheme} {id_value}", "type": {"id": "series-information"}, } additional_descriptions.append(new_desc) @@ -377,34 +378,36 @@ def identifiers(self, key, value): self["related_identifiers"] = related_works raise IgnoreKey("identifiers") - is_aleph_number = scheme.lower() == "cercer" or not scheme and "CERCER" in id_value + is_aleph_number = scheme == "cercer" or not scheme and "CERCER" in id_value + if is_aleph_number: scheme = "aleph" - if scheme.lower() == "cds": + elif scheme == "cds": scheme = "cds" - if scheme.lower() == "inspire": + elif scheme == "inspire": validate_inspire_identifier(id_value, key) - rel_id = {"scheme": scheme.lower(), "identifier": id_value} - if scheme.lower() == "admbul": - legacy_scheme = scheme + rel_id = {"scheme": scheme, "identifier": id_value} + + if scheme == "admbul": scheme = "other" - rel_id = {"scheme": scheme, "identifier": f"{legacy_scheme}_{id_value}"} - if scheme.lower() == "agendamaker": + rel_id = {"scheme": scheme, "identifier": f"{original_scheme}_{id_value}"} + if scheme == "agendamaker": indico_id = get_new_indico_id(id_value) scheme = "indico" rel_id = {"scheme": scheme, "identifier": str(indico_id)} - if scheme.lower() == "zentralblatt math": + if scheme == "zentralblatt math": scheme = "url" rel_id = { "scheme": scheme, "identifier": f"https://zbmath.org/?q=an:{id_value}", } + if id_value: if rel_id["scheme"] in RDM_RECORDS_RELATED_IDENTIFIERS_SCHEMES: rel_id.update( { - "relation_type": {"id": 
"isreferencedby"}, + "relation_type": {"id": "isvariantformof"}, "resource_type": {"id": "publication-other"}, } ) @@ -459,7 +462,7 @@ def _pids(self, key, value): else: new_id.update( { - "relation_type": {"id": "isversionof"}, + "relation_type": {"id": "isvariantformof"}, "resource_type": {"id": qualifier}, } ) @@ -779,6 +782,10 @@ def related_identifiers_787(self, key, value): recid = recid.replace("https://cds.cern.ch/record/", "") relation_map = { + "periodical": { + "relation_type": {"id": "ispublishedin"}, + "resource_type": {"id": "publication-periodical"}, + }, "issue": { "relation_type": {"id": "ispublishedin"}, "resource_type": {"id": "publication-periodicalissue"}, diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py index 35d12201..bfcbdf70 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py @@ -45,7 +45,7 @@ def isbn(self, key, value): new_id = { "identifier": _isbn, "scheme": "isbn", - "relation_type": {"id": "isversionof"}, + "relation_type": {"id": "isvariantformof"}, "resource_type": {"id": "publication-book"}, } else: @@ -53,7 +53,7 @@ def isbn(self, key, value): new_id = { "identifier": _isbn, "scheme": "isbn", - "relation_type": {"id": "isversionof"}, + "relation_type": {"id": "isvariantformof"}, "resource_type": {"id": "publication-book"}, } ids = self.get(destination, []) diff --git a/tests/cds-rdm/conftest.py b/tests/cds-rdm/conftest.py index faa8b4ac..11925a4c 100644 --- a/tests/cds-rdm/conftest.py +++ b/tests/cds-rdm/conftest.py @@ -1472,6 +1472,16 @@ def relation_type_v(app, relation_type): }, ) + vocab = vocabulary_service.create( + system_identity, + { + "id": "isvariantformof", + "props": {"datacite": "Is variant form of"}, + "title": {"en": "Is variant form of"}, + "type": "relationtypes", + }, + ) + 
Vocabulary.index.refresh() return vocab diff --git a/tests/cds-rdm/test_thesis_migration.py b/tests/cds-rdm/test_thesis_migration.py index cb2ff5aa..f2d387e7 100644 --- a/tests/cds-rdm/test_thesis_migration.py +++ b/tests/cds-rdm/test_thesis_migration.py @@ -96,7 +96,7 @@ def test_full_thesis_stream( { "identifier": "978-3-030-90375-6", "scheme": "isbn", - "relation_type": {"id": "isversionof", "title": {"en": "Is version of"}}, + "relation_type": {"id": "isvariantformof", "title": {"en": "Is variant form of"}}, "resource_type": { "id": "publication-book", "title": { @@ -108,7 +108,7 @@ def test_full_thesis_stream( { "identifier": "978-3-030-90376-3", "scheme": "isbn", - "relation_type": {"id": "isversionof", "title": {"en": "Is version of"}}, + "relation_type": {"id": "isvariantformof", "title": {"en": "Is variant form of"}}, "resource_type": { "id": "publication-book", "title": { @@ -121,7 +121,7 @@ def test_full_thesis_stream( { "identifier": "10.1007/978-3-030-90376-3", "scheme": "doi", - "relation_type": {"id": "isversionof", "title": {"en": "Is version of"}}, + "relation_type": {"id": "isvariantformof", "title": {"en": "Is variant form of"}}, "resource_type": { "id": "publication", "title": {"en": "Publication", "de": "Publikation"}, @@ -130,8 +130,8 @@ def test_full_thesis_stream( { "identifier": "1807850", "relation_type": { - "id": "isreferencedby", - "title": {"de": "Wird referenziert von", "en": "Is referenced by"}, + "id": "isvariantformof", + "title": {"en": "Is variant form of"}, }, "resource_type": { "id": "publication-other", From 8798ba160900140aa262f3815832549999e67683 Mon Sep 17 00:00:00 2001 From: Karolina Przerwa Date: Wed, 10 Dec 2025 11:19:19 +0100 Subject: [PATCH 2/2] add(hr): new resource types assignment --- .../rdm/records/transform/models/hr.py | 1 + .../transform/xml_processing/rules/hr.py | 24 ++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/cds_migrator_kit/rdm/records/transform/models/hr.py 
b/cds_migrator_kit/rdm/records/transform/models/hr.py index ef5e7fae..90f3935f 100644 --- a/cds_migrator_kit/rdm/records/transform/models/hr.py +++ b/cds_migrator_kit/rdm/records/transform/models/hr.py @@ -35,6 +35,7 @@ class HrModel(CdsOverdo): "100__m", "300__a", # number of pages "591__b", # + "6531_9", # keyword scheme "700__m", "7870_r", # detailed description of record relation (2862345) "8564_8", diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/hr.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/hr.py index 1596452e..cb2c182b 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/hr.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/hr.py @@ -74,10 +74,11 @@ def additional_desc(self, key, value): raise IgnoreKey("additional_descriptions_hr") -@model.over("subjects", "(^6931_)|(^650[12_][7_])|(^653[12_]_)|(^695__)|(^694__)") +@model.over("subjects", "(^6931_)|(^650[12_][7_])|(^653[12_]_)|(^695__)|(^694__)", override=True) @require(["a"]) @for_each_value def hr_subjects(self, key, value): + keyword = value.get("a") if key == "6531_": keyword = value.get("a") if "," in keyword: @@ -87,14 +88,17 @@ def hr_subjects(self, key, value): _subjects.append({"subject": key}) self["subjects"] = _subjects raise IgnoreKey("subjects") - else: - resource_type_map = { - "Presentation": {"id": "presentation"}, - } - resource_type = resource_type_map.get(keyword) - if resource_type: - self["resource_type"] = resource_type - raise IgnoreKey("subjects") + + resource_type_map = { + "Presentation": {"id": "presentation"}, + "Mémos": {"id": "publication-memorandum"}, + "Formulaires": {"id": "publication-form"}, + "Form": {"id": "publication-form"}, + "Modèles de documents": {"id": "publication-doctemplate"}, + } + resource_type = resource_type_map.get(keyword) + if resource_type: + self["resource_type"] = resource_type subjects(self, key, value) @@ -153,6 +157,8 @@ def resource_type(self, key, value): 
self["subjects"] = subjects if value == "administrativenote": raise IgnoreKey("resource_type") + if value == "cern-admin-e-guide" and self.get("resource_type"): + raise IgnoreKey("resource_type") map = { "annualstats": {"id": "publication-report"}, "cern-admin-e-guide": {"id": "publication-other"},