Skip to content

Commit 47792f5

Browse files
committed
OAK-9752 Migrate to Tika 3.2.3
1 parent f228cd5 commit 47792f5

File tree

9 files changed

+39
-56
lines changed

9 files changed

+39
-56
lines changed

oak-examples/standalone/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@
129129
</dependency>
130130
<dependency>
131131
<groupId>org.apache.tika</groupId>
132-
<artifactId>tika-parsers</artifactId>
132+
<artifactId>tika-parsers-standard-package</artifactId>
133133
<version>${tika.version}</version>
134134
<exclusions>
135135
<exclusion>

oak-examples/webapp/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
</dependency>
5252
<dependency>
5353
<groupId>org.apache.tika</groupId>
54-
<artifactId>tika-parsers</artifactId>
54+
<artifactId>tika-parsers-standard-package</artifactId>
5555
<version>${tika.version}</version>
5656
<exclusions>
5757
<exclusion>

oak-lucene/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@
389389
</dependency>
390390
<dependency>
391391
<groupId>org.apache.tika</groupId>
392-
<artifactId>tika-parsers</artifactId>
392+
<artifactId>tika-parsers-standard-package</artifactId>
393393
<version>${tika.version}</version>
394394
<scope>test</scope>
395395
<exclusions>

oak-parent/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@
6262
<slf4j.version>1.7.36</slf4j.version> <!-- sync with logback version -->
6363
<logback.version>1.2.13</logback.version>
6464
<h2.version>2.1.214</h2.version>
65-
<tika.version>1.28.5</tika.version>
65+
<tika.version>3.2.3</tika.version>
6666
<derby.version>10.15.2.0</derby.version>
6767
<jackson.version>2.17.3</jackson.version>
6868
<testcontainers.version>1.21.1</testcontainers.version>

oak-pojosr/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@
192192
</dependency>
193193
<dependency>
194194
<groupId>org.apache.tika</groupId>
195-
<artifactId>tika-parsers</artifactId>
195+
<artifactId>tika-parsers-standard-package</artifactId>
196196
<version>${tika.version}</version>
197197
<scope>test</scope>
198198
<exclusions>

oak-run/pom.xml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
<jetty.version>9.4.53.v20231009</jetty.version>
3535
<!--
3636
Size History:
37+
+ 89 MB (Tika 3.2.3, OAK-9752)
3738
+ 87 MB (Aws Sdk 2.x, OAK-11935)
3839
+ 84 MB (RDB/Tomcat, OAK-10752)
3940
+ 80 MB (Java 17, OAK-10638)
@@ -52,7 +53,7 @@
5253
+ 41 MB build failing on the release profile (OAK-6250)
5354
+ 38 MB. Initial value. Current 35MB plus a 10%
5455
-->
55-
<max.jar.size>91226112</max.jar.size>
56+
<max.jar.size>93585333</max.jar.size>
5657
</properties>
5758

5859
<build>
@@ -364,7 +365,7 @@
364365
</dependency>
365366
<dependency>
366367
<groupId>org.apache.tika</groupId>
367-
<artifactId>tika-parsers</artifactId>
368+
<artifactId>tika-parsers-standard-package</artifactId>
368369
<version>${tika.version}</version>
369370
</dependency>
370371
<dependency>

oak-run/src/main/assembly/oak-run.xml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
<excludes>
3333
<exclude>org.apache.lucene</exclude>
3434
<exclude>org.apache.tika:tika-core:*</exclude>
35-
<exclude>org.apache.tika:tika-parsers:*</exclude>
35+
<exclude>org.apache.tika:tika-parsers-standard-package:*</exclude>
3636
<exclude>org.apache.jackrabbit:jackrabbit-aws-ext:*</exclude>
3737
<exclude>io.prometheus:simpleclient*:*</exclude>
3838
</excludes>
@@ -60,7 +60,7 @@
6060
<outputDirectory>/</outputDirectory>
6161
<includes>
6262
<include>org.apache.tika:tika-core</include>
63-
<include>org.apache.tika:tika-parsers</include>
63+
<include>org.apache.tika:tika-parsers-standard-package</include>
6464
<include>commons-logging:commons-logging</include>
6565
</includes>
6666
<useStrictFiltering>true</useStrictFiltering>

oak-search-elastic/pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@
146146
</dependency>
147147
<dependency>
148148
<groupId>org.apache.tika</groupId>
149-
<artifactId>tika-parsers</artifactId>
149+
<artifactId>tika-parsers-standard-package</artifactId>
150150
<version>${tika.version}</version>
151151
<scope>test</scope>
152152
<exclusions>

oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TikaParserConfig.java

Lines changed: 28 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,22 @@
2222
import java.io.IOException;
2323
import java.io.InputStream;
2424
import java.util.HashSet;
25+
import java.util.Optional;
2526
import java.util.Set;
2627

27-
import javax.xml.parsers.DocumentBuilder;
28-
29-
import org.apache.jackrabbit.oak.commons.StringUtils;
28+
import org.apache.tika.config.TikaConfig;
3029
import org.apache.tika.exception.TikaException;
3130
import org.apache.tika.mime.MediaType;
3231
import org.apache.tika.parser.ParseContext;
33-
import org.w3c.dom.Element;
34-
import org.w3c.dom.Node;
35-
import org.w3c.dom.NodeList;
32+
import org.apache.tika.parser.Parser;
33+
import org.apache.tika.parser.ParserDecorator;
34+
import org.slf4j.Logger;
35+
import org.slf4j.LoggerFactory;
3636
import org.xml.sax.SAXException;
3737

3838
public class TikaParserConfig {
3939

40-
private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser";
40+
private static final Logger log = LoggerFactory.getLogger(TikaParserConfig.class);
4141

4242
/**
4343
* Determines the set of MediaTypes that have been configured with an EmptyParser.
@@ -48,50 +48,32 @@ public class TikaParserConfig {
4848
public static Set<MediaType> getNonIndexedMediaTypes(InputStream configStream) throws
4949
TikaException, IOException, SAXException {
5050
Set<MediaType> result = new HashSet<>();
51-
Element element = getBuilder().parse(configStream).getDocumentElement();
52-
NodeList nodes = element.getElementsByTagName("parsers");
53-
if (nodes.getLength() == 1) {
54-
Node parentNode = nodes.item(0);
55-
NodeList parsersNodes = parentNode.getChildNodes();
56-
for (int i = 0; i < parsersNodes.getLength(); i++) {
57-
Node node = parsersNodes.item(i);
58-
if (node instanceof Element) {
59-
String className = ((Element) node).getAttribute("class");
60-
if (EMPTY_PARSER.equals(className)) {
61-
NodeList mimes = ((Element) node).getElementsByTagName("mime");
62-
parseMimeTypes(result, mimes);
63-
}
64-
}
51+
TikaConfig config = new TikaConfig(configStream);
52+
if (config.getParser() instanceof org.apache.tika.parser.CompositeParser) {
53+
// pick the (decorated) empty parser
54+
Optional<Parser> emptyParser = ((org.apache.tika.parser.CompositeParser) config.getParser()).getAllComponentParsers().stream()
55+
.filter(p -> isEmptyParser(p))
56+
.findFirst();
57+
if (emptyParser.isPresent()) {
58+
emptyParser.get().getSupportedTypes(new ParseContext()).forEach(result::add);
6559
}
60+
} else {
61+
log.debug("Tika CompositeParser not used, no parsers configured via custom tika config");
6662
}
6763
return result;
6864
}
6965

70-
71-
private static void parseMimeTypes(Set<MediaType> result, NodeList mimes) {
72-
/*
73-
<parser class="org.apache.tika.parser.EmptyParser">
74-
<mime>application/x-archive</mime>
75-
<mime>application/x-bzip</mime>
76-
<mime>application/x-bzip2</mime>
77-
</parser>
78-
*/
79-
for (int j = 0; j < mimes.getLength(); j++) {
80-
Node mime = mimes.item(j);
81-
if (mime instanceof Element) {
82-
String mimeValue = mime.getTextContent();
83-
mimeValue = StringUtils.emptyToNull(mimeValue);
84-
if (mimeValue != null) {
85-
MediaType mediaType = MediaType.parse(mimeValue.trim());
86-
if (mediaType != null) {
87-
result.add(mediaType);
88-
}
89-
}
90-
}
66+
/**
67+
* Returns true if the given parser is an EmptyParser or decorates an EmptyParser.
68+
* @param parser the parser to inspect; may be a ParserDecorator wrapping another parser
69+
* @return {@code true} if the given parser is an EmptyParser or decorates an EmptyParser
70+
*/
71+
private static boolean isEmptyParser(Parser parser) {
72+
if (parser instanceof org.apache.tika.parser.EmptyParser) {
73+
return true;
74+
} else if (parser instanceof org.apache.tika.parser.ParserDecorator) {
75+
return isEmptyParser(((ParserDecorator) parser).getWrappedParser());
9176
}
92-
}
93-
94-
private static DocumentBuilder getBuilder() throws TikaException {
95-
return new ParseContext().getDocumentBuilder();
77+
return false;
9678
}
9779
}

0 commit comments

Comments
 (0)