Skip to content

Commit 670620b

Browse files
Use streaming when downloading from DOI (#14384)
* Rewrite step 1 * Fix typo * Use Markdown JavaDoc * Re-ordering of methods * Introduce constants for formatters * Add helper file * Add removal of whitespaces * Syntaxfix * Try to workaround broken reformatter * test(entry): add unit test for BibEntry.trimLeft() (#14422) Co-authored-by: jetbrains-junie[bot] <201638009+jetbrains-junie[bot]@users.noreply.github.com> --------- Co-authored-by: jetbrains-junie[bot] <201638009+jetbrains-junie[bot]@users.noreply.github.com>
1 parent bc9868f commit 670620b

File tree

7 files changed

+151
-103
lines changed

7 files changed

+151
-103
lines changed

jablib/src/main/java/org/jabref/logic/importer/fetcher/DoiFetcher.java

Lines changed: 61 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
package org.jabref.logic.importer.fetcher;
22

3-
import java.io.IOException;
3+
import java.io.InputStream;
44
import java.net.HttpURLConnection;
55
import java.net.MalformedURLException;
66
import java.net.URL;
@@ -32,6 +32,7 @@
3232
import org.jabref.model.entry.field.StandardField;
3333
import org.jabref.model.entry.identifier.DOI;
3434
import org.jabref.model.entry.types.StandardEntryType;
35+
import org.jabref.model.util.DummyFileUpdateMonitor;
3536
import org.jabref.model.util.OptionalUtil;
3637

3738
import com.google.common.util.concurrent.RateLimiter;
@@ -64,6 +65,10 @@ public class DoiFetcher implements IdBasedFetcher, EntryBasedFetcher {
6465
*/
6566
private static final RateLimiter CROSSREF_DCN_RATE_LIMITER = RateLimiter.create(50.0);
6667

68+
private static final FieldFormatterCleanup NORMALIZE_PAGES = new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter());
69+
private static final FieldFormatterCleanup CLEAR_URL = new FieldFormatterCleanup(StandardField.URL, new ClearFormatter());
70+
private static final FieldFormatterCleanup HTML_TO_LATEX_TITLE = new FieldFormatterCleanup(StandardField.TITLE, new HtmlToLatexFormatter());
71+
6772
private final ImportFormatPreferences preferences;
6873

6974
public DoiFetcher(ImportFormatPreferences preferences) {
@@ -116,75 +121,73 @@ protected CompletableFuture<Optional<BibEntry>> asyncPerformSearchById(String id
116121

117122
@Override
118123
public Optional<BibEntry> performSearchById(String identifier) throws FetcherException {
119-
Optional<DOI> doi = DOI.parse(identifier);
120-
121-
if (doi.isEmpty()) {
122-
throw new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier));
123-
}
124+
DOI doi = DOI.parse(identifier)
125+
.orElseThrow(() -> new FetcherException(Localization.lang("Invalid DOI: '%0'.", identifier)));
124126

125127
URL doiURL;
126128
try {
127-
doiURL = URLUtil.create(doi.get().getURIAsASCIIString());
129+
doiURL = URLUtil.create(doi.getURIAsASCIIString());
128130
} catch (MalformedURLException e) {
129131
throw new FetcherException("Malformed URL", e);
130132
}
131133

132-
try {
133-
Optional<BibEntry> fetchedEntry;
134+
Optional<BibEntry> fetchedEntry;
134135

135-
// mEDRA does not return a parsable bibtex string
136-
Optional<String> agency = getAgency(doi.get());
137-
if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) {
138-
return new Medra().performSearchById(identifier);
139-
}
140-
141-
// BibTeX data
142-
URLDownload download = getUrlDownload(doiURL);
143-
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
136+
// mEDRA does not return a parsable bibtex string
137+
Optional<String> agency;
138+
try {
139+
agency = getAgency(doi);
140+
} catch (MalformedURLException e) {
141+
throw new FetcherException("Invalid URL", e);
142+
}
143+
if (agency.isPresent() && "medra".equalsIgnoreCase(agency.get())) {
144+
return new Medra().performSearchById(identifier);
145+
}
144146

145-
String bibtexString;
146-
URLConnection openConnection;
147+
URLDownload download = getUrlDownload(doiURL);
148+
download.addHeader("Accept", MediaTypes.APPLICATION_BIBTEX);
149+
HttpURLConnection connection = (HttpURLConnection) download.openConnection();
150+
InputStream inputStream = download.asInputStream(connection);
147151

148-
openConnection = download.openConnection();
149-
bibtexString = URLDownload.asString(openConnection).trim();
152+
BibtexParser bibtexParser = new BibtexParser(preferences, new DummyFileUpdateMonitor());
153+
try {
154+
fetchedEntry = bibtexParser.parseEntries(inputStream).stream().findFirst();
155+
} catch (ParseException e) {
156+
throw new FetcherException(doiURL, "Could not parse BibTeX entry", e);
157+
}
158+
// Crossref has a dynamic API rate limit
159+
if (agency.isPresent() && "crossref".equalsIgnoreCase(agency.get())) {
160+
updateCrossrefAPIRate(connection);
161+
}
162+
connection.disconnect();
150163

151-
// BibTeX entry
152-
fetchedEntry = BibtexParser.singleFromString(bibtexString, preferences);
153-
fetchedEntry.ifPresent(this::doPostCleanup);
164+
fetchedEntry.ifPresent(entry -> {
165+
doPostCleanup(entry);
154166

155-
// Crossref has a dynamic API rate limit
156-
if (agency.isPresent() && "crossref".equalsIgnoreCase(agency.get())) {
157-
updateCrossrefAPIRate(openConnection);
167+
// Output warnings in case of inconsistencies
168+
entry.getField(StandardField.DOI)
169+
.filter(entryDoi -> !entryDoi.equals(doi.asString()))
170+
.ifPresent(entryDoi -> LOGGER.warn("Fetched entry's DOI {} is different from requested DOI {}", entryDoi, identifier));
171+
if (entry.getField(StandardField.DOI).isEmpty()) {
172+
LOGGER.warn("Fetched entry does not contain doi field {}", identifier);
158173
}
159174

160-
// Check if the entry is an APS journal and add the article id as the page count if page field is missing
161-
if (fetchedEntry.isPresent() && fetchedEntry.get().hasField(StandardField.DOI)) {
162-
BibEntry entry = fetchedEntry.get();
163-
if (isAPSJournal(entry, entry.getField(StandardField.DOI).get()) && !entry.hasField(StandardField.PAGES)) {
164-
setPageCountToArticleId(entry, entry.getField(StandardField.DOI).get());
165-
}
175+
if (isAPSJournal(entry, doi) && !entry.hasField(StandardField.PAGES)) {
176+
setPageNumbersBasedOnDoi(entry, doi);
166177
}
178+
});
167179

168-
if (openConnection instanceof HttpURLConnection connection) {
169-
connection.disconnect();
170-
}
171-
return fetchedEntry;
172-
} catch (IOException e) {
173-
throw new FetcherException(doiURL, Localization.lang("Connection error"), e);
174-
} catch (ParseException e) {
175-
throw new FetcherException(doiURL, "Could not parse BibTeX entry", e);
176-
} catch (JSONException e) {
177-
throw new FetcherException(doiURL, "Could not retrieve Registration Agency", e);
178-
}
180+
return fetchedEntry;
179181
}
180182

181183
private void doPostCleanup(BibEntry entry) {
182-
new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter()).cleanup(entry);
183-
new FieldFormatterCleanup(StandardField.URL, new ClearFormatter()).cleanup(entry);
184-
new FieldFormatterCleanup(StandardField.TITLE, new HtmlToLatexFormatter()).cleanup(entry);
184+
NORMALIZE_PAGES.cleanup(entry);
185+
CLEAR_URL.cleanup(entry);
186+
HTML_TO_LATEX_TITLE.cleanup(entry);
187+
entry.trimLeft();
185188
}
186189

187-
private void updateCrossrefAPIRate(URLConnection existingConnection) {
190+
private synchronized void updateCrossrefAPIRate(URLConnection existingConnection) {
188191
try {
189192
// Assuming this field is given in seconds
190193
String xRateLimitInterval = existingConnection.getHeaderField("X-Rate-Limit-Interval").replaceAll("[^\\.0123456789]", "");
@@ -221,8 +224,9 @@ public List<BibEntry> performSearch(@NonNull BibEntry entry) throws FetcherExcep
221224
public Optional<String> getAgency(DOI doi) throws FetcherException, MalformedURLException {
222225
Optional<String> agency = Optional.empty();
223226
try {
224-
URLDownload download = getUrlDownload(URLUtil.create(DOI.AGENCY_RESOLVER + "/" + URLEncoder.encode(doi.asString(),
225-
StandardCharsets.UTF_8)));
227+
URLDownload download = getUrlDownload(
228+
URLUtil.create(DOI.AGENCY_RESOLVER + "/" + URLEncoder.encode(doi.asString(),
229+
StandardCharsets.UTF_8)));
226230
JSONObject response = new JSONArray(download.asString()).getJSONObject(0);
227231
if (response != null) {
228232
agency = Optional.ofNullable(response.optString("RA"));
@@ -235,18 +239,20 @@ public Optional<String> getAgency(DOI doi) throws FetcherException, MalformedURL
235239
return agency;
236240
}
237241

238-
private void setPageCountToArticleId(BibEntry entry, String doiAsString) {
242+
private void setPageNumbersBasedOnDoi(BibEntry entry, DOI doi) {
243+
String doiAsString = doi.asString();
239244
String articleId = doiAsString.substring(doiAsString.lastIndexOf('.') + 1);
240245
entry.setField(StandardField.PAGES, articleId);
241246
}
242247

243248
// checks if the entry is an APS journal by comparing the organization id and the suffix format
244-
private boolean isAPSJournal(BibEntry entry, String doiAsString) {
249+
private boolean isAPSJournal(BibEntry entry, DOI doi) {
245250
if (!entry.getType().equals(StandardEntryType.Article)) {
246251
return false;
247252
}
248-
String suffix = doiAsString.substring(doiAsString.lastIndexOf('/') + 1);
249-
String organizationId = doiAsString.substring(doiAsString.indexOf('.') + 1, doiAsString.indexOf('/'));
253+
String doiString = doi.asString();
254+
String suffix = doiString.substring(doiString.lastIndexOf('/') + 1);
255+
String organizationId = doiString.substring(doiString.indexOf('.') + 1, doiString.indexOf('/'));
250256
return APS_JOURNAL_ORG_DOI_ID.equals(organizationId) && APS_SUFFIX_PATTERN.matcher(suffix).matches();
251257
}
252258
}

jablib/src/main/java/org/jabref/logic/net/URLDownload.java

Lines changed: 23 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,20 @@
5454
import org.slf4j.Logger;
5555
import org.slf4j.LoggerFactory;
5656

57-
/**
58-
* URL download to a string.
59-
* <p>
60-
* Example:
61-
* <code>
62-
* URLDownload dl = new URLDownload(URL);
63-
* String content = dl.asString(ENCODING);
64-
* dl.toFile(Path); // available in FILE
65-
* String contentType = dl.getMimeType();
66-
* </code>
67-
* <br/><br/>
68-
* Almost each call to a public method creates a new HTTP connection (except for {@link #asString(Charset, URLConnection) asString},
69-
* which uses an already opened connection). Nothing is cached.
70-
*/
57+
/// ## Example
58+
///
59+
/// ```java
60+
/// URLDownload dl = new URLDownload(URL);
61+
/// String content = dl.asString(ENCODING);
62+
/// dl.toFile(Path); // available in FILE
63+
/// String contentType = dl.getMimeType();
64+
/// ```
65+
///
66+
/// Almost every call to a public method creates a new HTTP connection
67+
/// (except for {@link #asString(Charset, URLConnection) asString},
68+
/// which uses an already opened connection).
69+
///
70+
/// Nothing is cached.
7171
public class URLDownload {
7272

7373
public static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0";
@@ -139,7 +139,7 @@ public Optional<String> getMimeType() {
139139
// @formatter:on
140140
retries++;
141141
HttpResponse<String> response = Unirest.head(urlToCheck).asString();
142-
// Check if we have redirects, e.g. arxiv will give otherwise content type html for the original url
142+
// Check if we have redirects, e.g. arxiv will give otherwise content type HTML for the original url
143143
// We need to do it "manually", because ".followRedirects(true)" only works for GET not for HEAD
144144
locationHeader = response.getHeaders().getFirst("location");
145145
if (!StringUtil.isNullOrEmpty(locationHeader)) {
@@ -218,34 +218,14 @@ public String asString() throws FetcherException {
218218
return asString(StandardCharsets.UTF_8, this.openConnection());
219219
}
220220

221-
/**
222-
* Downloads the web resource to a String.
223-
*
224-
* @param encoding the desired String encoding
225-
* @return the downloaded string
226-
*/
227-
public String asString(Charset encoding) throws FetcherException {
228-
return asString(encoding, this.openConnection());
229-
}
230-
231-
/**
232-
* Downloads the web resource to a String from an existing connection. Uses UTF-8 as encoding.
233-
*
234-
* @param existingConnection an existing connection
235-
* @return the downloaded string
236-
*/
237-
public static String asString(URLConnection existingConnection) throws FetcherException {
238-
return asString(StandardCharsets.UTF_8, existingConnection);
239-
}
240-
241221
/**
242222
* Downloads the web resource to a String.
243223
*
244224
* @param encoding the desired String encoding
245225
* @param connection an existing connection
246226
* @return the downloaded string
247227
*/
248-
public static String asString(Charset encoding, URLConnection connection) throws FetcherException {
228+
private static String asString(Charset encoding, URLConnection connection) throws FetcherException {
249229
try (InputStream input = new BufferedInputStream(connection.getInputStream());
250230
Writer output = new StringWriter()) {
251231
copy(input, output, encoding);
@@ -285,12 +265,16 @@ public void toFile(Path destination) throws FetcherException {
285265
}
286266
}
287267

288-
/**
289-
* Takes the web resource as the source for a monitored input stream.
290-
*/
268+
/// Uses the web resource as source and creates a monitored input stream.
291269
public ProgressInputStream asInputStream() throws FetcherException {
292270
HttpURLConnection urlConnection = (HttpURLConnection) this.openConnection();
271+
return asInputStream(urlConnection);
272+
}
293273

274+
/// Uses the web resource as source and creates a monitored input stream.
275+
///
276+
/// Exposing the urlConnection is required for dynamic API limiting of CrossRef
277+
public ProgressInputStream asInputStream(HttpURLConnection urlConnection) throws FetcherException {
294278
int responseCode;
295279
try {
296280
responseCode = urlConnection.getResponseCode();

jablib/src/main/java/org/jabref/model/entry/BibEntry.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1268,4 +1268,10 @@ public boolean isEmpty() {
12681268
}
12691269
return StandardField.AUTOMATIC_FIELDS.containsAll(this.getFields());
12701270
}
1271+
1272+
/// Trims whitespace at the beginning of the BibEntry
1273+
public void trimLeft() {
1274+
this.parsedSerialization = parsedSerialization.trim(); // we should do "trimLeft", but currently, it is OK as is.
1275+
this.commentsBeforeEntry = commentsBeforeEntry.trim(); // we should do "trimLeft", but currently, it is OK as is.
1276+
}
12711277
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
GET https://doi.org/10.1109/ICWS.2007.59
2+
Accept: application/x-bibtex

jablib/src/test/java/org/jabref/logic/importer/fetcher/DoiFetcherTest.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ class DoiFetcherTest {
5757
.withField(StandardField.JOURNAL, "Chemical Engineering Transactions")
5858
.withField(StandardField.PAGES, "871–876")
5959
.withField(StandardField.VOLUME, "77");
60+
61+
// APS Journal
6062
private final BibEntry bibEntryStenzel2020 = new BibEntry(StandardEntryType.Article)
6163
.withCitationKey("Stenzel_2020")
6264
.withField(StandardField.AUTHOR, "Stenzel, L. and Hayward, A. L. C. and Schollwöck, U. and Heidrich-Meisner, F.")
@@ -68,8 +70,9 @@ class DoiFetcherTest {
6870
.withField(StandardField.DOI, "10.1103/physreva.102.023315")
6971
.withField(StandardField.ISSN, "2469-9934")
7072
.withField(StandardField.PUBLISHER, "American Physical Society (APS)")
71-
.withField(StandardField.PAGES, "023315")
73+
.withField(StandardField.PAGES, "023315") // This is the last part of the DOI
7274
.withField(StandardField.NUMBER, "2");
75+
7376
private final BibEntry bibBenedetto2000 = new BibEntry(StandardEntryType.Article)
7477
.withCitationKey("Benedetto_2000")
7578
.withField(StandardField.AUTHOR, "Benedetto, D. and Caglioti, E. and Marchioro, C.")

jablib/src/test/java/org/jabref/logic/net/URLDownloadTest.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import java.io.File;
44
import java.io.IOException;
55
import java.net.MalformedURLException;
6-
import java.nio.charset.StandardCharsets;
76
import java.nio.file.Path;
87

98
import org.jabref.logic.importer.FetcherClientException;
@@ -34,13 +33,6 @@ void stringDownloadWithSetEncoding() throws MalformedURLException, FetcherExcept
3433
assertTrue(dl.asString().contains("Google"), "google.com should contain google");
3534
}
3635

37-
@Test
38-
void stringDownload() throws MalformedURLException, FetcherException {
39-
URLDownload dl = new URLDownload(URLUtil.create("http://www.google.com"));
40-
41-
assertTrue(dl.asString(StandardCharsets.UTF_8).contains("Google"), "google.com should contain google");
42-
}
43-
4436
@Test
4537
void fileDownload() throws IOException, FetcherException {
4638
File destination = File.createTempFile("jabref-test", ".html");
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package org.jabref.model.entry;
2+
3+
import org.junit.jupiter.api.Test;
4+
5+
import static org.junit.jupiter.api.Assertions.assertEquals;
6+
import static org.junit.jupiter.api.Assertions.assertNotNull;
7+
8+
class BibEntryTrimLeftTest {
9+
10+
@Test
11+
void trimLeftRemovesLeadingWhitespaceFromParsedSerializationAndComments() {
12+
BibEntry entry = new BibEntry();
13+
// include leading and trailing whitespace
14+
entry.setParsedSerialization(" \t\n Some serialization \n\t ");
15+
entry.setCommentsBeforeEntry(" % a comment \n");
16+
17+
// sanity
18+
assertNotNull(entry.getParsedSerialization());
19+
assertNotNull(entry.getUserComments());
20+
21+
entry.trimLeft();
22+
23+
// Current implementation uses String.trim(), which removes leading and trailing whitespace.
24+
// The important bit to test here is that leading whitespace is gone.
25+
assertEquals("Some serialization", entry.getParsedSerialization());
26+
assertEquals("% a comment", entry.getUserComments());
27+
}
28+
29+
@Test
30+
void trimLeftIsIdempotent() {
31+
BibEntry entry = new BibEntry();
32+
entry.setParsedSerialization(" Text ");
33+
entry.setCommentsBeforeEntry(" % c ");
34+
35+
entry.trimLeft();
36+
String afterFirst = entry.getParsedSerialization();
37+
String commentsAfterFirst = entry.getUserComments();
38+
39+
entry.trimLeft();
40+
assertEquals(afterFirst, entry.getParsedSerialization());
41+
assertEquals(commentsAfterFirst, entry.getUserComments());
42+
}
43+
44+
@Test
45+
void trimLeftWithEmptyStringsKeepsEmpty() {
46+
BibEntry entry = new BibEntry();
47+
entry.setParsedSerialization("");
48+
entry.setCommentsBeforeEntry("");
49+
50+
entry.trimLeft();
51+
52+
assertEquals("", entry.getParsedSerialization());
53+
assertEquals("", entry.getUserComments());
54+
}
55+
}

0 commit comments

Comments
 (0)