From 898db00f2fcd6f99157d3aaf937ed5bb3f69cdc0 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 2 Jul 2026 15:32:48 -0400 Subject: [PATCH 1/3] add some sequence diagrams to ARCHITECTURE.md --- ARCHITECTURE.md | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index f27608ffc..8163f80ba 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -9,7 +9,8 @@ In short, SHARE/trove holds metadata records that describe things and makes thos ## Parts -a look at the tangles of communication between different parts of the system: +a slightly simplified look at the tangles of communication between different parts of the system, +as currently implemented: ```mermaid graph LR; @@ -48,6 +49,49 @@ graph LR; subscribers-->oaipmh; ``` +### /trove/ingest +```mermaid +sequenceDiagram + participant ms as metadata source + box shtrove + participant ss as web server + participant sd as db (postgres) + participant sw as worker (celery) + participant sq as queues (rabbitmq) + participant si as indexer + participant se as elasticsearch + end + ms ->> ss: POST /trove/ingest + ss ->> sd: save ResourceIdentifier(s) + ss ->> sd: save Indexcard + ss ->> sd: save ResourceDescription(s) + ss ->> sq: enqueue derive task + ss ->> ms: 201 CREATED (success!) 
+ sq -->> sw: receive derive task + sd <<-->> sw: load metadata record + sw ->> sd: save DerivedIndexcards + sw ->> sq: enqueue indexer message + sq -->> si: bulk receive indexer messages + sd <<-->> si: bulk load metadata + si ->> se: bulk index +``` + +### /trove/index-card-search +```mermaid +sequenceDiagram + participant c as client + box shtrove + participant ss as web server + participant sd as db (postgres) + participant se as elasticsearch + end + c ->> ss: GET /trove/index-card-search + ss ->> se: query (via index strategy) + se ->> ss: result ids (plus context) + ss <<-->> sd: load metadata records + ss ->> c: respond/stream search results (formatted as requested) +``` + ## Code map A brief look at important areas of code as they happen to exist now. From 7336bdd3f7a6542966ababa1405203da54a702e4 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 2 Jul 2026 15:33:14 -0400 Subject: [PATCH 2/3] fill some gaps in trove search api docs --- trove/openapi.py | 2 +- trove/vocab/trove.py | 26 ++++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/trove/openapi.py b/trove/openapi.py index 89c0bee67..43af9ac1c 100644 --- a/trove/openapi.py +++ b/trove/openapi.py @@ -168,7 +168,7 @@ def _openapi_path(path_iri: str, api_graph: primitive_rdf.RdfGraph) -> Tuple[str def _concept_markdown_blocks(concept_iri: str, api_graph: primitive_rdf.RdfGraph) -> Generator[str, None, None]: for _label in api_graph.q(concept_iri, RDFS.label): - yield f'## {_label.unicode_value}' + yield f'## concept: {_label.unicode_value}' for _comment in api_graph.q(concept_iri, RDFS.comment): yield f'' for _desc in api_graph.q(concept_iri, DCTERMS.description): diff --git a/trove/vocab/trove.py b/trove/vocab/trove.py index 5649db6b8..a402563d4 100644 --- a/trove/vocab/trove.py +++ b/trove/vocab/trove.py @@ -502,6 +502,24 @@ def _literal_markdown(text: str, *, language: str) -> literal: the response will have the http header `Content-Disposition: attachment` 
with a filename based on the query param value, current date, and response content mediatype +''', language='en')}, + }, + TROVE.iriShorthand: { + RDF.type: {RDF.Property, TROVE.QueryParameter}, + JSONAPI_MEMBERNAME: {literal('iriShorthand', language='en')}, + RDFS.label: {literal('iriShorthand', language='en')}, + RDFS.comment: {literal('define a shorthand namespace or alias for IRIs in this query string', language='en')}, + TROVE.jsonSchema: {literal_json({'type': 'string'})}, + DCTERMS.description: {_literal_markdown('''**iriShorthand** is +a query parameter to define a shorthand name used for parsing IRIs in other query parameters + +for example, a request to `/trove/index-card-search` with these query parameters: +- `iriShorthand[blarg]=https://blarg.example/vocab/` +- `iriShorthand[foo]=https://another.example/vocab/foo` +- `cardSearchFilter[blarg:prop]=foo` + +will find cards with the IRI value `<https://another.example/vocab/foo>` +at the property `<https://blarg.example/vocab/prop>` ''', language='en')}, }, TROVE.cardSearchText: { @@ -709,11 +727,15 @@ def _literal_markdown(text: str, *, language: str) -> literal: DCTERMS.description: {_literal_markdown(f'''a **property-path** is a dot-separated path of short-hand IRIs, used in several api parameters -currently the only supported shorthand is defined by [OSFMAP]({osfmap.OSFMAP_LINK}) - for example, `creator.name` is parsed as a two-step path that follows `creator` (aka `dcterms:creator`, `<http://purl.org/dc/terms/creator>`) and then `name` (aka `foaf:name`, `<http://xmlns.com/foaf/0.1/name>`) +currently, the only implied shorthand is that defined by [OSFMAP]({osfmap.OSFMAP_LINK}) +-- to search on other properties, use an `iriShorthand` query param to provide an explicit +alias or namespace (e.g. 
with `iriShorthand[blarg]=https://blarg.example/vocab/`, +`blarg:prop1.blarg:prop2` in another param will be parsed as a two-step property-path +following `<https://blarg.example/vocab/prop1>` then `<https://blarg.example/vocab/prop2>`) + most places that allow one property-path also accept a comma-separated set of paths, like `title,description` (which is parsed as two paths: `title` and `description`) or `affiliation,creator.affiliation,funder` (which is parsed as three paths: `affiliation`, From 4a823e161ca82b62aadd932458b463a7b23dfe88 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 2 Jul 2026 15:59:12 -0400 Subject: [PATCH 3/3] clarify --- ARCHITECTURE.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 8163f80ba..60463d703 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -50,6 +50,7 @@ graph LR; ``` ### /trove/ingest +a slightly simplified look at how metadata records are ingested, as currently implemented: ```mermaid sequenceDiagram participant ms as metadata source @@ -68,15 +69,16 @@ sequenceDiagram ss ->> sq: enqueue derive task ss ->> ms: 201 CREATED (success!) 
sq -->> sw: receive derive task - sd <<-->> sw: load metadata record + sd <<-->> sw: load ResourceDescription(s) sw ->> sd: save DerivedIndexcards sw ->> sq: enqueue indexer message - sq -->> si: bulk receive indexer messages - sd <<-->> si: bulk load metadata + sq -->> si: bulk receive messages + sd <<-->> si: bulk load metadata records si ->> se: bulk index ``` ### /trove/index-card-search +a slightly simplified look at how search requests are served, as currently implemented: ```mermaid sequenceDiagram participant c as client @@ -86,10 +88,9 @@ sequenceDiagram participant se as elasticsearch end c ->> ss: GET /trove/index-card-search - ss ->> se: query (via index strategy) - se ->> ss: result ids (plus context) + ss <<-->> se: query for result ids (and context) ss <<-->> sd: load metadata records - ss ->> c: respond/stream search results (formatted as requested) + ss ->> c: respond/stream search results ``` ## Code map @@ -135,9 +136,7 @@ Uses the [resource description framework](https://www.w3.org/TR/rdf11-primer/#se ### Identifiers -Whenever feasible, use full URI strings to identify resources, concepts, types, and properties that may be exposed outwardly. - -Prefer using open, standard, well-defined namespaces wherever possible ([DCAT](https://www.w3.org/TR/vocab-dcat-3/) is a good place to start; see `trove.vocab.namespaces` for others already in use). When app-specific concepts must be defined, use the `TROVE` namespace (`https://share.osf.io/vocab/2023/trove/`). +Whenever feasible, use full [IRI](https://www.rfc-editor.org/rfc/rfc3987.html) strings (utf-8) to identify resources, concepts, types, and properties that may be exposed outwardly (without converting to URI or using them to send requests). Prefer using open, standard, well-defined namespaces wherever possible ([DCAT](https://www.w3.org/TR/vocab-dcat-3/) is a good place to start; see `trove.vocab.namespaces` for others already in use). 
When app-specific concepts must be defined, use the `TROVE` namespace (`https://share.osf.io/vocab/2023/trove/`). A notable exception (non-URI identifier) is the "source-unique identifier" or "suid" -- essentially a two-tuple `(source, identifier)` that uniquely and persistently identifies a metadata record in a source repository. This `identifier` may be any string value, provided by the external source.