Skip to content

Commit 670620b

Browse files
Use streaming when downloading from DOI (#14384)
* Rewrite step 1 * Fix typo * Use Markdown JavaDoc * Re-ordering of methods * Introduce constants for formatters * Add helper file * Add removal of whitespaces * Syntaxfix * Try to workaround broken reformatter * test(entry): add unit test for BibEntry.trimLeft() (#14422) Co-authored-by: jetbrains-junie[bot] <201638009+jetbrains-junie[bot]@users.noreply.github.com> --------- Co-authored-by: jetbrains-junie[bot] <201638009+jetbrains-junie[bot]@users.noreply.github.com>
1 parent bc9868f commit 670620b

File tree

7 files changed

+151
-103
lines changed

7 files changed

+151
-103
lines changed

jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java

Lines changed: 61 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package org.jabref.logic.importer.fetcher;
22

3-
import java.io.IOException;
3+
import java.io.InputStream;
44
import java.net.HttpURLConnection;
55
import java.net.MalformedURLException;
66
import java.net.URL;
@@ -32,6 +32,7 @@
3232
import org.jabref.model.entry.field.StandardField;
3333
import org.jabref.model.entry.identifier.DOI;
3434
import org.jabref.model.entry.types.StandardEntryType;
35+
import org.jabref.model.util.DummyFileUpdateMonitor;
3536
import org.jabref.model.util.OptionalUtil;
3637

3738
import com.google.common.util.concurrent.RateLimiter;
@@ -64,6 +65,10 @@ public class DoiFetcher implements IdBasedFetcher, EntryBasedFetcher {
6465
*/
6566
private static final RateLimiter CROSSREF_DCN_RATE_LIMITER = RateLimiter.create(50.0);
6667

68+
private static final FieldFormatterCleanup NORMALIZE_PAGES = new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter());
69+
private static final FieldFormatterCleanup CLEAR_URL = new FieldFormatterCleanup(StandardField.URL, new ClearFormatter());
70+
private static final FieldFormatterCleanup HTML_TO_LATEX_TITLE = new FieldFormatterCleanup(StandardField.TITLE, new HtmlToLatexFormatter());
71+
6772
private final ImportFormatPreferences preferences;
6873

6974
public DoiFetcher(ImportFormatPreferences preferences) {
@@ -116,75 +121,73 @@ protected CompletableFuture<Optional<BibEntry>> asyncPerformSearchById(String id
116121

117122
@Override
118123
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
119-
Optional<DOI> doi = DOI.parse(identifier);
120-
121-
if (doi.isEmpty()) {
122-
throw new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier));
123-
}
124+
DOI doi = DOI.parse(identifier)
125+
.orElseThrow(() -> new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier)));
124126

125127
URL doiURL;
126128
try {
127-
doiURL = URLUtil.create(doi.get().getURIAsASCIIString());
129+
doiURL = URLUtil.create(doi.getURIAsASCIIString());
128130
} catch (MalformedURLException e) {
129131
throw new FetcherException("Malformed URL", e);
130132
}
131133

132-
try {
133-
Optional<BibEntry> fetchedEntry;
134+
Optional<BibEntry> fetchedEntry;
134135

135-
// mEDRA does not return a parsable bibtex string
136-
Optional<String> agency = getAgency(doi.get());
137-
if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) {
138-
return new Medra().performSearchById(identifier);
139-
}
140-
141-
// BibTeX data
142-
URLDownload download = getUrlDownload(doiURL);
143-
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
136+
// mEDRA does not return a parsable bibtex string
137+
Optional<String> agency;
138+
try {
139+
agency = getAgency(doi);
140+
} catch (MalformedURLException e) {
141+
throw new FetcherException("Invalid URL", e);
142+
}
143+
if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) {
144+
return new Medra().performSearchById(identifier);
145+
}
144146

145-
String bibtexString;
146-
URLConnection openConnection;
147+
URLDownload download = getUrlDownload(doiURL);
148+
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
149+
HttpURLConnection connection = (HttpURLConnection) download.openConnection();
150+
InputStream inputStream = download.asInputStream(connection);
147151

148-
openConnection = download.openConnection();
149-
bibtexString = URLDownload.asString(openConnection).trim();
152+
BibtexParser bibtexParser = new BibtexParser(preferences, new DummyFileUpdateMonitor());
153+
try {
154+
fetchedEntry = bibtexParser.parseEntries(inputStream).stream().findFirst();
155+
} catch (ParseException e) {
156+
throw new FetcherException(doiURL, "Could not parse BibTeX entry", e);
157+
}
158+
// Crossref has a dynamic API rate limit
159+
if (agency.isPresent() && "crossref".equalsIgnoreCase(agency.get())) {
160+
updateCrossrefAPIRate(connection);
161+
}
162+
connection.disconnect();
150163

151-
// BibTeX entry
152-
fetchedEntry = BibtexParser.singleFromString(bibtexString, preferences);
153-
fetchedEntry.ifPresent(this::doPostCleanup);
164+
fetchedEntry.ifPresent(entry -> {
165+
doPostCleanup(entry);
154166

155-
// Crossref has a dynamic API rate limit
156-
if (agency.isPresent() && "crossref".equalsIgnoreCase(agency.get())) {
157-
updateCrossrefAPIRate(openConnection);
167+
// Output warnings in case of inconsistencies
168+
entry.getField(StandardField.DOI)
169+
.filter(entryDoi -> !entryDoi.equals(doi.asString()))
170+
.ifPresent(entryDoi -> LOGGER.warn("Fetched entry's DOI {} is different from requested DOI {}", entryDoi, identifier));
171+
if (entry.getField(StandardField.DOI).isEmpty()) {
172+
LOGGER.warn("Fetched entry does not contain doi field {}", identifier);
158173
}
159174

160-
// Check if the entry is an APS journal and add the article id as the page count if page field is missing
161-
if (fetchedEntry.isPresent() && fetchedEntry.get().hasField(StandardField.DOI)) {
162-
BibEntry entry = fetchedEntry.get();
163-
if (isAPSJournal(entry, entry.getField(StandardField.DOI).get()) && !entry.hasField(StandardField.PAGES)) {
164-
setPageCountToArticleId(entry, entry.getField(StandardField.DOI).get());
165-
}
175+
if (isAPSJournal(entry, doi) && !entry.hasField(StandardField.PAGES)) {
176+
setPageNumbersBasedOnDoi(entry, doi);
166177
}
178+
});
167179

168-
if (openConnection instanceof HttpURLConnection connection) {
169-
connection.disconnect();
170-
}
171-
return fetchedEntry;
172-
} catch (IOException e) {
173-
throw new FetcherException(doiURL, Localization.lang("Connection error"), e);
174-
} catch (ParseException e) {
175-
throw new FetcherException(doiURL, "Could not parse BibTeX entry", e);
176-
} catch (JSONException e) {
177-
throw new FetcherException(doiURL, "Could not retrieve Registration Agency", e);
178-
}
180+
return fetchedEntry;
179181
}
180182

181183
private void doPostCleanup(BibEntry entry) {
182-
new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter()).cleanup(entry);
183-
new FieldFormatterCleanup(StandardField.URL, new ClearFormatter()).cleanup(entry);
184-
new FieldFormatterCleanup(StandardField.TITLE, new HtmlToLatexFormatter()).cleanup(entry);
184+
NORMALIZE_PAGES.cleanup(entry);
185+
CLEAR_URL.cleanup(entry);
186+
HTML_TO_LATEX_TITLE.cleanup(entry);
187+
entry.trimLeft();
185188
}
186189

187-
private void updateCrossrefAPIRate(URLConnection existingConnection) {
190+
private synchronized void updateCrossrefAPIRate(URLConnection existingConnection) {
188191
try {
189192
// Assuming this field is given in seconds
190193
String xRateLimitInterval = existingConnection.getHeaderField("X-Rate-Limit-Interval").replaceAll("[^\\.0123456789]", "");
@@ -221,8 +224,9 @@ public List<BibEntry> performSearch(@NonNull BibEntry entry) throws FetcherExcep
221224
public Optional<String> getAgency(DOI doi) throws FetcherException, MalformedURLException {
222225
Optional<String> agency = Optional.empty();
223226
try {
224-
URLDownload download = getUrlDownload(URLUtil.create(DOI.AGENCY_RESOLVER + "/" + URLEncoder.encode(doi.asString(),
225-
StandardCharsets.UTF_8)));
227+
URLDownload download = getUrlDownload(
228+
URLUtil.create(DOI.AGENCY_RESOLVER + "/" + URLEncoder.encode(doi.asString(),
229+
StandardCharsets.UTF_8)));
226230
JSONObject response = new JSONArray(download.asString()).getJSONObject(0);
227231
if (response != null) {
228232
agency = Optional.ofNullable(response.optString("RA"));
@@ -235,18 +239,20 @@ public Optional<String> getAgency(DOI doi) throws FetcherException, MalformedURL
235239
return agency;
236240
}
237241

238-
private void setPageCountToArticleId(BibEntry entry, String doiAsString) {
242+
private void setPageNumbersBasedOnDoi(BibEntry entry, DOI doi) {
243+
String doiAsString = doi.asString();
239244
String articleId = doiAsString.substring(doiAsString.lastIndexOf('.') + 1);
240245
entry.setField(StandardField.PAGES, articleId);
241246
}
242247

243248
// checks if the entry is an APS journal by comparing the organization id and the suffix format
244-
private boolean isAPSJournal(BibEntry entry, String doiAsString) {
249+
private boolean isAPSJournal(BibEntry entry, DOI doi) {
245250
if (!entry.getType().equals(StandardEntryType.Article)) {
246251
return false;
247252
}
248-
String suffix = doiAsString.substring(doiAsString.lastIndexOf('/') + 1);
249-
String organizationId = doiAsString.substring(doiAsString.indexOf('.') + 1, doiAsString.indexOf('/'));
253+
String doiString = doi.asString();
254+
String suffix = doiString.substring(doiString.lastIndexOf('/') + 1);
255+
String organizationId = doiString.substring(doiString.indexOf('.') + 1, doiString.indexOf('/'));
250256
return APS_JOURNAL_ORG_DOI_ID.equals(organizationId) && APS_SUFFIX_PATTERN.matcher(suffix).matches();
251257
}
252258
}

jablib/src/main/java/org/jabref/logic/net/URLDownload.java

Lines changed: 23 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,20 @@
5454
import org.slf4j.Logger;
5555
import org.slf4j.LoggerFactory;
5656

57-
/**
58-
* URL download to a string.
59-
* <p>
60-
* Example:
61-
* <code>
62-
* URLDownload dl = new URLDownload(URL);
63-
* String content = dl.asString(ENCODING);
64-
* dl.toFile(Path); // available in FILE
65-
* String contentType = dl.getMimeType();
66-
* </code>
67-
* <br/><br/>
68-
* Almost each call to a public method creates a new HTTP connection (except for {@link #asString(Charset, URLConnection) asString},
69-
* which uses an already opened connection). Nothing is cached.
70-
*/
57+
/// ## Example
58+
///
59+
/// ```java
60+
/// URLDownload dl = new URLDownload(URL);
61+
/// String content = dl.asString(ENCODING);
62+
/// dl.toFile(Path); // available in FILE
63+
/// String contentType = dl.getMimeType();
64+
/// ```
65+
///
66+
/// Almost every call to a public method creates a new HTTP connection
67+
/// (except for {@link #asString(Charset, URLConnection) asString},
68+
/// which uses an already opened connection).
69+
///
70+
/// Nothing is cached.
7171
public class URLDownload {
7272

7373
public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0";
@@ -139,7 +139,7 @@ public Optional<String> getMimeType() {
139139
// @formatter:on
140140
retries++;
141141
HttpResponse<String> response = Unirest.head(urlToCheck).asString();
142-
// Check if we have redirects, e.g. arxiv will give otherwise content type html for the original url
142+
// Check if we have redirects, e.g. arxiv will give otherwise content type HTML for the original url
143143
// We need to do it "manually", because ".followRedirects(true)" only works for GET not for HEAD
144144
locationHeader = response.getHeaders().getFirst("location");
145145
if (!StringUtil.isNullOrEmpty(locationHeader)) {
@@ -218,34 +218,14 @@ public String asString() throws FetcherException {
218218
return asString(StandardCharsets.UTF_8, this.openConnection());
219219
}
220220

221-
/**
222-
* Downloads the web resource to a String.
223-
*
224-
* @param encoding the desired String encoding
225-
* @return the downloaded string
226-
*/
227-
public String asString(Charset encoding) throws FetcherException {
228-
return asString(encoding, this.openConnection());
229-
}
230-
231-
/**
232-
* Downloads the web resource to a String from an existing connection. Uses UTF-8 as encoding.
233-
*
234-
* @param existingConnection an existing connection
235-
* @return the downloaded string
236-
*/
237-
public static String asString(URLConnection existingConnection) throws FetcherException {
238-
return asString(StandardCharsets.UTF_8, existingConnection);
239-
}
240-
241221
/**
242222
* Downloads the web resource to a String.
243223
*
244224
* @param encoding the desired String encoding
245225
* @param connection an existing connection
246226
* @return the downloaded string
247227
*/
248-
public static String asString(Charset encoding, URLConnection connection) throws FetcherException {
228+
private static String asString(Charset encoding, URLConnection connection) throws FetcherException {
249229
try (InputStream input = new BufferedInputStream(connection.getInputStream());
250230
Writer output = new StringWriter()) {
251231
copy(input, output, encoding);
@@ -285,12 +265,16 @@ public void toFile(Path destination) throws FetcherException {
285265
}
286266
}
287267

288-
/**
289-
* Takes the web resource as the source for a monitored input stream.
290-
*/
268+
/// Uses the web resource as source and creates a monitored input stream.
291269
public ProgressInputStream asInputStream() throws FetcherException {
292270
HttpURLConnection urlConnection = (HttpURLConnection) this.openConnection();
271+
return asInputStream(urlConnection);
272+
}
293273

274+
/// Uses the web resource as source and creates a monitored input stream.
275+
///
276+
/// Exposing the urlConnection is required for dynamic API limiting of CrossRef
277+
public ProgressInputStream asInputStream(HttpURLConnection urlConnection) throws FetcherException {
294278
int responseCode;
295279
try {
296280
responseCode = urlConnection.getResponseCode();

jablib/src/main/java/org/jabref/model/entry/BibEntry.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1268,4 +1268,10 @@ public boolean isEmpty() {
12681268
}
12691269
return StandardField.AUTOMATIC_FIELDS.containsAll(this.getFields());
12701270
}
1271+
1272+
/// Trims whitespace at the beginning of the BibEntry
1273+
public void trimLeft() {
1274+
this.parsedSerialization = parsedSerialization.trim(); // we should do "trimLeft", but currently, it is OK as is.
1275+
this.commentsBeforeEntry = commentsBeforeEntry.trim(); // we should do "trimLeft", but currently, it is OK as is.
1276+
}
12711277
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
GET https://doi.org/10.1109/ICWS.2007.59
2+
Accept: application/x-bibtex

jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ class DoiFetcherTest {
5757
.withField(StandardField.JOURNAL, "Chemical Engineering Transactions")
5858
.withField(StandardField.PAGES, "871–876")
5959
.withField(StandardField.VOLUME, "77");
60+
61+
// APS Journal
6062
private final BibEntry bibEntryStenzel2020 = new BibEntry(StandardEntryType.Article)
6163
.withCitationKey("Stenzel_2020")
6264
.withField(StandardField.AUTHOR, "Stenzel, L. and Hayward, A. L. C. and Schollwöck, U. and Heidrich-Meisner, F.")
@@ -68,8 +70,9 @@ class DoiFetcherTest {
6870
.withField(StandardField.DOI, "10.1103/physreva.102.023315")
6971
.withField(StandardField.ISSN, "2469-9934")
7072
.withField(StandardField.PUBLISHER, "American Physical Society (APS)")
71-
.withField(StandardField.PAGES, "023315")
73+
.withField(StandardField.PAGES, "023315") // This is the last part of the DOI
7274
.withField(StandardField.NUMBER, "2");
75+
7376
private final BibEntry bibBenedetto2000 = new BibEntry(StandardEntryType.Article)
7477
.withCitationKey("Benedetto_2000")
7578
.withField(StandardField.AUTHOR, "Benedetto, D. and Caglioti, E. and Marchioro, C.")

jablib/src/test/java/org/jabref/logic/net/URLDownloadTest.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import java.io.File;
44
import java.io.IOException;
55
import java.net.MalformedURLException;
6-
import java.nio.charset.StandardCharsets;
76
import java.nio.file.Path;
87

98
import org.jabref.logic.importer.FetcherClientException;
@@ -34,13 +33,6 @@ void stringDownloadWithSetEncoding() throws MalformedURLException, FetcherExcept
3433
assertTrue(dl.asString().contains("Google"), "google.com should contain google");
3534
}
3635

37-
@Test
38-
void stringDownload() throws MalformedURLException, FetcherException {
39-
URLDownload dl = new URLDownload(URLUtil.create("http://www.google.com"));
40-
41-
assertTrue(dl.asString(StandardCharsets.UTF_8).contains("Google"), "google.com should contain google");
42-
}
43-
4436
@Test
4537
void fileDownload() throws IOException, FetcherException {
4638
File destination = File.createTempFile("jabref-test", ".html");
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package org.jabref.model.entry;
2+
3+
import org.junit.jupiter.api.Test;
4+
5+
import static org.junit.jupiter.api.Assertions.assertEquals;
6+
import static org.junit.jupiter.api.Assertions.assertNotNull;
7+
8+
class BibEntryTrimLeftTest {
9+
10+
@Test
11+
void trimLeftRemovesLeadingWhitespaceFromParsedSerializationAndComments() {
12+
BibEntry entry = new BibEntry();
13+
// include leading and trailing whitespace
14+
entry.setParsedSerialization(" \t\n Some serialization \n\t ");
15+
entry.setCommentsBeforeEntry(" % a comment \n");
16+
17+
// sanity
18+
assertNotNull(entry.getParsedSerialization());
19+
assertNotNull(entry.getUserComments());
20+
21+
entry.trimLeft();
22+
23+
// Current implementation uses String.trim(), which removes leading and trailing whitespace.
24+
// The important bit to test here is that leading whitespace is gone.
25+
assertEquals("Some serialization", entry.getParsedSerialization());
26+
assertEquals("% a comment", entry.getUserComments());
27+
}
28+
29+
@Test
30+
void trimLeftIsIdempotent() {
31+
BibEntry entry = new BibEntry();
32+
entry.setParsedSerialization(" Text ");
33+
entry.setCommentsBeforeEntry(" % c ");
34+
35+
entry.trimLeft();
36+
String afterFirst = entry.getParsedSerialization();
37+
String commentsAfterFirst = entry.getUserComments();
38+
39+
entry.trimLeft();
40+
assertEquals(afterFirst, entry.getParsedSerialization());
41+
assertEquals(commentsAfterFirst, entry.getUserComments());
42+
}
43+
44+
@Test
45+
void trimLeftWithEmptyStringsKeepsEmpty() {
46+
BibEntry entry = new BibEntry();
47+
entry.setParsedSerialization("");
48+
entry.setCommentsBeforeEntry("");
49+
50+
entry.trimLeft();
51+
52+
assertEquals("", entry.getParsedSerialization());
53+
assertEquals("", entry.getUserComments());
54+
}
55+
}

0 commit comments

Comments
 (0)