Skip to content
17 changes: 16 additions & 1 deletion osf/metadata/osf_gathering.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from django.contrib.contenttypes.models import ContentType
from django import db
from mimetypes import MimeTypes
import rdflib

from api.caching.tasks import get_storage_usage_total
Expand Down Expand Up @@ -44,6 +45,8 @@

logger = logging.getLogger(__name__)

# shared MimeTypes instance; `gather_file_mediatype` uses it to guess a
# media type from a file's name
mime = MimeTypes()


##### BEGIN "public" api #####

Expand Down Expand Up @@ -373,7 +376,7 @@ def osf_iri(guid_or_model):
return OSFIO[guid._id]


def osfguid_from_iri(iri: str) -> str:
    """Strip the OSFIO namespace from an iri.

    Returns whatever `without_namespace(iri, OSFIO)` gives back
    (presumably the bare osf guid -- confirm against `without_namespace`).

    Raises:
        ValueError: if `iri` does not start with the OSFIO namespace.
    """
    if not iri.startswith(OSFIO):
        raise ValueError(f'expected iri starting with "{OSFIO}" (got "{iri}")')
    return without_namespace(iri, OSFIO)
Expand Down Expand Up @@ -702,6 +705,18 @@ def gather_files(focus):
yield (DCTERMS.requires, file_focus)


@gather.er(DCAT.mediaType)
def gather_file_mediatype(focus):
    """Yield one `DCAT.mediaType` pair, guessed from the file's name.

    The media type is guessed from `focus.dbmodel.name` alone (file contents
    are not inspected); when nothing can be guessed, the generic
    'application/octet-stream' is used.
    """
    # guess_type returns (type, encoding); only the type is wanted here
    _guessed_type = mime.guess_type(focus.dbmodel.name)[0]
    if _guessed_type is None:
        _guessed_type = 'application/octet-stream'
    # yield exactly once -- a second, identical yield would add nothing
    yield (DCAT.mediaType, _guessed_type)


@gather.er(DCTERMS.hasPart, DCTERMS.isPartOf)
def gather_parts(focus):
if isinstance(focus.dbmodel, osfdb.AbstractNode):
Expand Down
3 changes: 3 additions & 0 deletions osf/metadata/serializers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@
from .datacite import DataciteJsonMetadataSerializer, DataciteXmlMetadataSerializer
from .google_dataset_json_ld import GoogleDatasetJsonLdSerializer
from .turtle import TurtleMetadataSerializer
from .linkset import SignpostLinkset, SignpostLinksetJSON


# maps a format key to the serializer class for that format.
# the linkset serializers iterate this registry to emit one "describedby"
# link per entry, using the key as the `format` query param and the
# serializer's `mediatype` as the link's `type` attribute
METADATA_SERIALIZER_REGISTRY = {
    'turtle': TurtleMetadataSerializer,
    'datacite-json': DataciteJsonMetadataSerializer,
    'datacite-xml': DataciteXmlMetadataSerializer,
    'google-dataset-json-ld': GoogleDatasetJsonLdSerializer,
    'linkset': SignpostLinkset,
    'linkset-json': SignpostLinksetJSON
}


Expand Down
148 changes: 148 additions & 0 deletions osf/metadata/serializers/linkset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""osf.metadata.serializers.signpost_linkset: FAIR signposting with osf metadata
FAIR signposting: https://signposting.org/FAIR/
definition of linkset mediatypes: https://www.rfc-editor.org/rfc/rfc9264.html
"""
from __future__ import annotations
import abc
from collections.abc import (
Iterable,
Iterator
)
from collections import defaultdict
import dataclasses
import json
from urllib.parse import urljoin, urlsplit, urlencode, urlunsplit

import rdflib

from ._base import MetadataSerializer
from osf.metadata.osf_gathering import osfguid_from_iri
from osf.metadata.rdfutils import DOI, DCTERMS, OWL, RDF, OSF, DCAT
from website.settings import DOMAIN
from website.util import web_url_for


@dataclasses.dataclass
class SignpostLink:
    """A single link to be written into a signposting linkset."""

    # serialized as the link's `anchor`
    anchor_uri: str
    # serialized as the link's `rel` (or as the relation key in linkset json)
    relation: str
    # serialized as the link target (`<...>` or `href`)
    target_uri: str
    # extra (key, value) pairs attached to the target, e.g. ('type', mediatype)
    target_attrs: Iterable[tuple[str, str]] = ()


class BaseSignpostLinkset(MetadataSerializer, abc.ABC):
    """Shared link-gathering logic for the signposting linkset serializers.

    Subclasses decide how the links from `_each_link` are written out.
    """

    def _each_link(self) -> Iterator[SignpostLink]:
        """Yield every signposting link for the focus of `self.basket`.

        Links are yielded in a fixed order of relations: collection (files
        only), author, type, cite-as, describes, describedby, license, item.
        """
        focus_iri = self.basket.focus.iri
        _focus_is_file = (self.basket.focus.rdftype == OSF.File)

        # collection (file's containing obj)
        if _focus_is_file:
            for _collection_uri in self.basket[OSF.isContainedBy]:
                yield SignpostLink(focus_iri, 'collection', str(_collection_uri))

        # author
        for _creator_iri in self.basket[DCTERMS.creator]:
            yield SignpostLink(focus_iri, 'author', str(_creator_iri))

        # type
        # for a file, skip any type it shares with its containing
        # project / registry / preprint; for anything else, skip nothing
        _container_types = (
            set(self.basket[OSF.isContainedBy / (DCTERMS.type | RDF.type)])
            if _focus_is_file
            else set()
        )
        for _type_iri in self.basket[DCTERMS.type | RDF.type]:
            if _type_iri not in _container_types:
                yield SignpostLink(focus_iri, 'type', str(_type_iri))

        # cite-as: prefer a DOI among the owl:sameAs iris, else the focus itself
        _citeas_iri = next(
            (
                _sameas_iri
                for _sameas_iri in self.basket[OWL.sameAs]
                if _sameas_iri.startswith(DOI)
            ),
            focus_iri,
        )
        yield SignpostLink(focus_iri, 'cite-as', str(_citeas_iri))

        base_metadata_url = urljoin(DOMAIN, web_url_for(
            'metadata_download',  # name of a view function mapped in website/routes.py
            guid=osfguid_from_iri(self.basket.focus.iri),
        ))
        split_base_metadata_url = urlsplit(base_metadata_url)

        # describes (anchored on the metadata url, pointing back at the focus)
        yield SignpostLink(
            base_metadata_url,
            'describes',
            focus_iri,
        )

        # describedby: one link per registered metadata format
        # (imported here, not at module top, because the `serializers`
        # package itself imports this module)
        from osf.metadata.serializers import METADATA_SERIALIZER_REGISTRY
        for _format_key, _serializer in METADATA_SERIALIZER_REGISTRY.items():
            _metadata_url = urlunsplit(split_base_metadata_url._replace(
                query=urlencode({'format': _format_key}),
            ))
            yield SignpostLink(
                focus_iri,
                'describedby',
                _metadata_url,
                [('type', _serializer.mediatype)]
            )

        # license (blank nodes have no iri to link to, so they are skipped)
        for _license_uri in self.basket[DCTERMS.rights]:
            if not isinstance(_license_uri, rdflib.BNode):
                yield SignpostLink(focus_iri, 'license', str(_license_uri))

        # item
        for _file_iri in self.basket[OSF.contains]:
            # a bare `next()` here would raise StopIteration for a file with
            # no media type, which a generator turns into RuntimeError
            # (PEP 479) -- so default to None and omit the `type` attr instead
            _mediatype = next(iter(self.basket[_file_iri:DCAT.mediaType]), None)
            _item_attrs = (
                [('type', str(_mediatype))]
                if _mediatype is not None
                else ()
            )
            yield SignpostLink(focus_iri, 'item', str(_file_iri), _item_attrs)


class SignpostLinkset(BaseSignpostLinkset):
    """Serializer for the text linkset format ('application/linkset')."""

    mediatype = 'application/linkset'

    def filename_for_itemid(self, itemid: str):
        """Return the download filename for the given item id."""
        return f'{itemid}-metadata.linkset'

    def serialize(self) -> str | bytes:
        """serialize a linkset for FAIR signposting

        one link per line, lines separated by a comma, ending with a newline
        see example https://www.rfc-editor.org/rfc/rfc9264.html#section-7.1
        FAIR signposting: https://signposting.org/FAIR/
        """
        _link_lines = [
            self._serialize_link(_link)
            for _link in self._each_link()
        ]
        return ',\n'.join(_link_lines) + '\n'

    def _serialize_link(self, link: SignpostLink) -> str:
        """Render one link as `<target> ; rel="..." ; anchor="..."` plus attrs."""
        _attr_segments = (
            f'{_attr_name}="{_attr_value}"'
            for _attr_name, _attr_value in link.target_attrs
        )
        return ' ; '.join((
            f'<{link.target_uri}>',
            f'rel="{link.relation}"',
            f'anchor="{link.anchor_uri}"',
            *_attr_segments,
        ))

class SignpostLinksetJSON(BaseSignpostLinkset):
    """Serializer for the json linkset format ('application/linkset+json')."""

    mediatype = 'application/linkset+json'

    def filename_for_itemid(self, itemid: str):
        """Return the download filename for the given item id."""
        return f'{itemid}-metadata.linkset.json'

    def serialize(self) -> str | bytes:
        """serialize linkset json

        links are grouped first by anchor, then by relation
        definition: https://www.rfc-editor.org/rfc/rfc9264.html#section-4.2
        example: https://www.rfc-editor.org/rfc/rfc9264.html#section-7.2
        """
        # anchor -> relation -> list of target objects (insertion-ordered)
        _by_anchor: dict = {}
        for _link in self._each_link():
            _target = {'href': _link.target_uri, **dict(_link.target_attrs)}
            _by_relation = _by_anchor.setdefault(_link.anchor_uri, {})
            _by_relation.setdefault(_link.relation, []).append(_target)

        _linkset = [
            {'anchor': _anchor_uri, **_by_relation}
            for _anchor_uri, _by_relation in _by_anchor.items()
        ]
        return json.dumps({'linkset': _linkset}, indent=2)
10 changes: 10 additions & 0 deletions osf_tests/metadata/expected_metadata_files/file_basic.linkset
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<http://localhost:5000/w2ibb> ; rel="collection" ; anchor="http://localhost:5000/w3ibb",
<https://osf.io/vocab/2022/File> ; rel="type" ; anchor="http://localhost:5000/w3ibb",
<http://localhost:5000/w3ibb> ; rel="cite-as" ; anchor="http://localhost:5000/w3ibb",
<http://localhost:5000/w3ibb> ; rel="describes" ; anchor="http://localhost:5000/metadata/w3ibb/",
<http://localhost:5000/metadata/w3ibb/?format=turtle> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="text/turtle; charset=utf-8",
<http://localhost:5000/metadata/w3ibb/?format=datacite-json> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/json",
<http://localhost:5000/metadata/w3ibb/?format=datacite-xml> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/xml",
<http://localhost:5000/metadata/w3ibb/?format=google-dataset-json-ld> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/ld+json",
<http://localhost:5000/metadata/w3ibb/?format=linkset> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/linkset",
<http://localhost:5000/metadata/w3ibb/?format=linkset-json> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/linkset+json"
56 changes: 56 additions & 0 deletions osf_tests/metadata/expected_metadata_files/file_basic.linkset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"linkset": [
{
"anchor": "http://localhost:5000/w3ibb",
"collection": [
{
"href": "http://localhost:5000/w2ibb"
}
],
"type": [
{
"href": "https://osf.io/vocab/2022/File"
}
],
"cite-as": [
{
"href": "http://localhost:5000/w3ibb"
}
],
"describedby": [
{
"href": "http://localhost:5000/metadata/w3ibb/?format=turtle",
"type": "text/turtle; charset=utf-8"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=datacite-json",
"type": "application/json"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=datacite-xml",
"type": "application/xml"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=google-dataset-json-ld",
"type": "application/ld+json"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=linkset",
"type": "application/linkset"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=linkset-json",
"type": "application/linkset+json"
}
]
},
{
"anchor": "http://localhost:5000/metadata/w3ibb/",
"describes": [
{
"href": "http://localhost:5000/w3ibb"
}
]
}
]
}
10 changes: 10 additions & 0 deletions osf_tests/metadata/expected_metadata_files/file_full.linkset
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<http://localhost:5000/w2ibb> ; rel="collection" ; anchor="http://localhost:5000/w3ibb",
<https://osf.io/vocab/2022/File> ; rel="type" ; anchor="http://localhost:5000/w3ibb",
<http://localhost:5000/w3ibb> ; rel="cite-as" ; anchor="http://localhost:5000/w3ibb",
<http://localhost:5000/w3ibb> ; rel="describes" ; anchor="http://localhost:5000/metadata/w3ibb/",
<http://localhost:5000/metadata/w3ibb/?format=turtle> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="text/turtle; charset=utf-8",
<http://localhost:5000/metadata/w3ibb/?format=datacite-json> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/json",
<http://localhost:5000/metadata/w3ibb/?format=datacite-xml> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/xml",
<http://localhost:5000/metadata/w3ibb/?format=google-dataset-json-ld> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/ld+json",
<http://localhost:5000/metadata/w3ibb/?format=linkset> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/linkset",
<http://localhost:5000/metadata/w3ibb/?format=linkset-json> ; rel="describedby" ; anchor="http://localhost:5000/w3ibb" ; type="application/linkset+json"
56 changes: 56 additions & 0 deletions osf_tests/metadata/expected_metadata_files/file_full.linkset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{
"linkset": [
{
"anchor": "http://localhost:5000/w3ibb",
"collection": [
{
"href": "http://localhost:5000/w2ibb"
}
],
"type": [
{
"href": "https://osf.io/vocab/2022/File"
}
],
"cite-as": [
{
"href": "http://localhost:5000/w3ibb"
}
],
"describedby": [
{
"href": "http://localhost:5000/metadata/w3ibb/?format=turtle",
"type": "text/turtle; charset=utf-8"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=datacite-json",
"type": "application/json"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=datacite-xml",
"type": "application/xml"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=google-dataset-json-ld",
"type": "application/ld+json"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=linkset",
"type": "application/linkset"
},
{
"href": "http://localhost:5000/metadata/w3ibb/?format=linkset-json",
"type": "application/linkset+json"
}
]
},
{
"anchor": "http://localhost:5000/metadata/w3ibb/",
"describes": [
{
"href": "http://localhost:5000/w3ibb"
}
]
}
]
}
11 changes: 11 additions & 0 deletions osf_tests/metadata/expected_metadata_files/preprint_basic.linkset
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<http://localhost:5000/w1ibb> ; rel="author" ; anchor="http://localhost:5000/w4ibb",
<https://schema.datacite.org/meta/kernel-4/#Preprint> ; rel="type" ; anchor="http://localhost:5000/w4ibb",
<https://osf.io/vocab/2022/Preprint> ; rel="type" ; anchor="http://localhost:5000/w4ibb",
<https://doi.org/11.pp/FK2osf.io/w4ibb_v1> ; rel="cite-as" ; anchor="http://localhost:5000/w4ibb",
<http://localhost:5000/w4ibb> ; rel="describes" ; anchor="http://localhost:5000/metadata/w4ibb/",
<http://localhost:5000/metadata/w4ibb/?format=turtle> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="text/turtle; charset=utf-8",
<http://localhost:5000/metadata/w4ibb/?format=datacite-json> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/json",
<http://localhost:5000/metadata/w4ibb/?format=datacite-xml> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/xml",
<http://localhost:5000/metadata/w4ibb/?format=google-dataset-json-ld> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/ld+json",
<http://localhost:5000/metadata/w4ibb/?format=linkset> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/linkset",
<http://localhost:5000/metadata/w4ibb/?format=linkset-json> ; rel="describedby" ; anchor="http://localhost:5000/w4ibb" ; type="application/linkset+json"
Loading
Loading