Skip to content

Commit 8a58ae8

Browse files
authored
fix: issue with trig reference counting across graphs (#2085)
The TriG serializer was only considering BNode references inside a single graph and was not counting BNode subjects as references when deciding whether a BNode should be serialized as an unlabeled blank node (i.e. `[ ]`). As a result, it was serializing BNodes as unlabeled even if they were in fact referencing BNodes in other graphs. One caveat of this change is that some RDF Datasets may be serialized less succinctly, in that unlabeled blank nodes will not be used in some places where it is technically possible to use them. This can be trivially fixed, but a trivial fix increases the computational complexity of serialization significantly. Other changes: - Removed the roundtrip xfail that this change fixed. - Added another roundtrip test which has various combinations of BNode references across graphs in a dataset; this test fails for JSON-LD, however, so while this change removes one xfail it also adds another. - Set the default indent_size and indent_style in `.editorconfig` so as to avoid relying on undefined system defaults.
1 parent 04bf774 commit 8a58ae8

File tree

6 files changed

+116
-32
lines changed

6 files changed

+116
-32
lines changed

.editorconfig

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ root = true
66

77
# Unix-style newlines with a newline ending every file
88
[*]
9+
indent_size = 4
10+
indent_style = space
911
end_of_line = lf
1012
insert_final_newline = true
1113
trim_trailing_whitespace = true

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,27 @@ and will be removed for release.
309309
<!-- -->
310310
<!-- -->
311311

312+
313+
<!-- -->
314+
<!-- -->
315+
<!-- CHANGE BARRIER: START PR #2085 -->
316+
<!-- -->
317+
<!-- -->
318+
319+
- Fixed serialization of BNodes in TriG.
320+
The TriG serializer was only considering BNode references inside a single
321+
graph and not counting BNode subjects as references when considering if a
322+
BNode should be serialized as an unlabeled blank node (i.e. `[ ]`), and as a
323+
result it was serializing BNodes as unlabeled if they were in fact referencing
324+
BNodes in other graphs.
325+
[PR #2085](https://github.com/RDFLib/rdflib/pull/2085).
326+
327+
<!-- -->
328+
<!-- -->
329+
<!-- CHANGE BARRIER: END PR #2085 -->
330+
<!-- -->
331+
<!-- -->
332+
312333
<!-- -->
313334
<!-- -->
314335
<!-- CHANGE BARRIER: START -->

Taskfile.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,11 @@ tasks:
282282
- task: install:flake8
283283
- task: flake8
284284

285+
cmd:rdfpipe:
286+
desc: Run rdfpipe
287+
cmds:
288+
- cmd: "{{._PYTHON | shellQuote}} -m rdflib.tools.rdfpipe {{.CLI_ARGS}}"
289+
285290
_rimraf:
286291
# This task is a utility task for recursively removing directories, it is
287292
# similar to rm -rf but not identical and it should work wherever there is

rdflib/plugins/serializers/trig.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
See <http://www.w3.org/TR/trig/> for syntax specification.
44
"""
55

6-
from collections import defaultdict
76
from typing import IO, TYPE_CHECKING, Optional, Union
87

98
from rdflib.graph import ConjunctiveGraph, Graph
@@ -37,17 +36,15 @@ def preprocess(self):
3736
for context in self.contexts:
3837
self.store = context
3938
self.getQName(context.identifier)
40-
self._references = defaultdict(int)
4139
self._subjects = {}
4240

4341
for triple in context:
4442
self.preprocessTriple(triple)
4543

46-
self._contexts[context] = (
47-
self.orderSubjects(),
48-
self._subjects,
49-
self._references,
50-
)
44+
for subject in self._subjects.keys():
45+
self._references[subject] += 1
46+
47+
self._contexts[context] = (self.orderSubjects(), self._subjects)
5148

5249
def reset(self):
5350
super(TrigSerializer, self).reset()
@@ -77,11 +74,10 @@ def serialize(
7774
self.startDocument()
7875

7976
firstTime = True
80-
for store, (ordered_subjects, subjects, ref) in self._contexts.items():
77+
for store, (ordered_subjects, subjects) in self._contexts.items():
8178
if not ordered_subjects:
8279
continue
8380

84-
self._references = ref
8581
self._serialized = {}
8682
self.store = store
8783
self._subjects = subjects
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
2+
@prefix dc: <http://purl.org/dc/terms/> .
3+
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
4+
@prefix egdo: <http://example.org/> .
5+
6+
{
7+
egdo:bob dc:publisher "Bob" .
8+
egdo:alice dc:publisher "Alice" .
9+
}
10+
11+
12+
egdo:alice {
13+
_:alice foaf:name "Alice";
14+
foaf:mbox <mailto:alice@work.example.org> .
15+
}
16+
17+
egdo:bob {
18+
_:bob foaf:name "Bob";
19+
foaf:mbox <mailto:bob@oldcorp.example.org>;
20+
foaf:knows _:alice .
21+
}
22+
23+
24+
egdo:blake {
25+
[] foaf:name "Blake" ;
26+
foaf:mbox <mailto:blake@oldcorp.example.org>;
27+
foaf:knows [
28+
foaf:name "Taylor";
29+
foaf:mbox <mailto:taylor@work.example.org>
30+
] .
31+
}
32+
33+
egdo:austin {
34+
_:austin foaf:name "Austin" ;
35+
foaf:mbox <mailto:austin@oldcorp.example.org>;
36+
foaf:knows _:carson.
37+
38+
39+
_:carson foaf:name "Carson" ;
40+
foaf:mbox <mailto:carson@oldcorp.example.org>;
41+
foaf:knows _:austin.
42+
}
43+
44+
egdo:charlie {
45+
_:charlie foaf:name "Charlie" ;
46+
foaf:mbox <mailto:charlie@oldcorp.example.org>;
47+
foaf:knows _:dylan;
48+
foaf:knows _:greer.
49+
50+
_:dylan foaf:name "Dylan" ;
51+
foaf:mbox <mailto:dylan@oldcorp.example.org>;
52+
foaf:knows _:charlie;
53+
foaf:knows _:greer.
54+
55+
_:greer foaf:name "Greer" ;
56+
foaf:mbox <mailto:greer@oldcorp.example.org>;
57+
foaf:knows _:charlie;
58+
foaf:knows _:dylan.
59+
}
60+
61+
egdo:jaime {
62+
_:jaime foaf:name "Jaime";
63+
foaf:mbox <mailto:jaime@work.example.org>;
64+
foaf:knows _:keaton.
65+
}
66+
67+
egdo:kennedy {
68+
_:kennedy foaf:name "Kennedy";
69+
foaf:mbox <mailto:kennedy@oldcorp.example.org>;
70+
foaf:knows _:keaton .
71+
}

test/test_roundtrip.py

Lines changed: 12 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -144,28 +144,6 @@
144144
reason="results in invalid xml element name: <ns1:name(s)/>",
145145
raises=SAXParseException,
146146
),
147-
("trig", "rdf11trig_eg2.trig"): pytest.mark.xfail(
148-
reason="""
149-
Something is going wrong here with blank node serialization. In the second
150-
graph below bob knows someone who does not exist, while in first he knows
151-
someone that does exist and has the name Alice.
152-
153-
AssertionError: in both:
154-
(rdflib.term.BNode('cbb5eb12b5dcf688537b0298cce144c6dd68cf047530d0b4a455a8f31f314244fd'), rdflib.term.URIRef('http://xmlns.com/foaf/0.1/mbox'), rdflib.term.URIRef('mailto:alice@work.example.org'))
155-
(rdflib.term.BNode('cbb5eb12b5dcf688537b0298cce144c6dd68cf047530d0b4a455a8f31f314244fd'), rdflib.term.URIRef('http://xmlns.com/foaf/0.1/name'), rdflib.term.Literal('Alice'))
156-
(rdflib.term.URIRef('http://example.org/alice'), rdflib.term.URIRef('http://purl.org/dc/terms/publisher'), rdflib.term.Literal('Alice'))
157-
(rdflib.term.URIRef('http://example.org/bob'), rdflib.term.URIRef('http://purl.org/dc/terms/publisher'), rdflib.term.Literal('Bob'))
158-
only in first:
159-
(rdflib.term.BNode('cb0'), rdflib.term.URIRef('http://xmlns.com/foaf/0.1/knows'), rdflib.term.BNode('cbb5eb12b5dcf688537b0298cce144c6dd68cf047530d0b4a455a8f31f314244fd'))
160-
(rdflib.term.BNode('cb0'), rdflib.term.URIRef('http://xmlns.com/foaf/0.1/mbox'), rdflib.term.URIRef('mailto:bob@oldcorp.example.org'))
161-
(rdflib.term.BNode('cb0'), rdflib.term.URIRef('http://xmlns.com/foaf/0.1/name'), rdflib.term.Literal('Bob'))
162-
only in second:
163-
(rdflib.term.BNode('cb7be1d0397a49ddd4ae8aa96acc7b6135903c5f3fa5e47bf619c0e4b438aafcc1'), rdflib.term.URIRef('http://xmlns.com/foaf/0.1/knows'), rdflib.term.BNode('cb0'))
164-
(rdflib.term.BNode('cb7be1d0397a49ddd4ae8aa96acc7b6135903c5f3fa5e47bf619c0e4b438aafcc1'), rdflib.term.URIRef('http://xmlns.com/foaf/0.1/mbox'), rdflib.term.URIRef('mailto:bob@oldcorp.example.org'))
165-
(rdflib.term.BNode('cb7be1d0397a49ddd4ae8aa96acc7b6135903c5f3fa5e47bf619c0e4b438aafcc1'), rdflib.term.URIRef('http://xmlns.com/foaf/0.1/name'), rdflib.term.Literal('Bob'))
166-
""",
167-
raises=AssertionError,
168-
),
169147
("json-ld", "diverse_quads.trig"): pytest.mark.xfail(
170148
reason="""
171149
jsonld serializer is dropping datatype:
@@ -204,6 +182,10 @@
204182
"n3",
205183
"data/suites/w3c/n3/N3Tests/cwm_syntax/neg-single-quote.n3",
206184
): pytest.mark.xfail(raises=BadSyntax, reason="no support for single quotes"),
185+
("json-ld", "bnode_refs.trig"): pytest.mark.xfail(
186+
reason="a whole bunch of triples with bnode as subject is not in the reconstituted graph",
187+
raises=AssertionError,
188+
),
207189
}
208190

209191
# This is for files which can only be represented properly in one format
@@ -253,7 +235,13 @@ def roundtrip(
253235
s = g1.serialize(format=testfmt)
254236

255237
if logger.isEnabledFor(logging.DEBUG):
256-
logger.debug("source = %s, serailized = \n%s", source, s)
238+
logger.debug(
239+
"infmt = %s, testfmt = %s, source = %s, serailized = \n%s",
240+
infmt,
241+
testfmt,
242+
source,
243+
s,
244+
)
257245

258246
g2 = graph_type()
259247
if same_public_id:
@@ -525,6 +513,7 @@ def test_n3_suite(
525513
(TEST_DATA_DIR / "variants" / "diverse_triples.nt", "ntriples"),
526514
(TEST_DATA_DIR / "variants" / "diverse_quads.nq", "nquads"),
527515
(TEST_DATA_DIR / "variants" / "diverse_quads.trig", "trig"),
516+
(TEST_DATA_DIR / "roundtrip" / "bnode_refs.trig", "trig"),
528517
(TEST_DATA_DIR / "example-lots_of_graphs.n3", "n3"),
529518
(TEST_DATA_DIR / "issue156.n3", "n3"),
530519
]

0 commit comments

Comments
 (0)