2222import java .io .IOException ;
2323import java .io .InputStream ;
2424import java .util .HashSet ;
25+ import java .util .Optional ;
2526import java .util .Set ;
2627
27- import javax .xml .parsers .DocumentBuilder ;
28-
29- import org .apache .jackrabbit .oak .commons .StringUtils ;
28+ import org .apache .tika .config .TikaConfig ;
3029import org .apache .tika .exception .TikaException ;
3130import org .apache .tika .mime .MediaType ;
3231import org .apache .tika .parser .ParseContext ;
33- import org .w3c .dom .Element ;
34- import org .w3c .dom .Node ;
35- import org .w3c .dom .NodeList ;
32+ import org .apache .tika .parser .Parser ;
33+ import org .apache .tika .parser .ParserDecorator ;
34+ import org .slf4j .Logger ;
35+ import org .slf4j .LoggerFactory ;
3636import org .xml .sax .SAXException ;
3737
3838public class TikaParserConfig {
3939
40- private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser" ;
40+ private static final Logger log = LoggerFactory . getLogger ( TikaParserConfig . class ) ;
4141
4242 /**
4343 * Determines the set of MediaType which have been configured with an EmptyParser.
@@ -48,50 +48,32 @@ public class TikaParserConfig {
4848 public static Set <MediaType > getNonIndexedMediaTypes (InputStream configStream ) throws
4949 TikaException , IOException , SAXException {
5050 Set <MediaType > result = new HashSet <>();
51- Element element = getBuilder ().parse (configStream ).getDocumentElement ();
52- NodeList nodes = element .getElementsByTagName ("parsers" );
53- if (nodes .getLength () == 1 ) {
54- Node parentNode = nodes .item (0 );
55- NodeList parsersNodes = parentNode .getChildNodes ();
56- for (int i = 0 ; i < parsersNodes .getLength (); i ++) {
57- Node node = parsersNodes .item (i );
58- if (node instanceof Element ) {
59- String className = ((Element ) node ).getAttribute ("class" );
60- if (EMPTY_PARSER .equals (className )) {
61- NodeList mimes = ((Element ) node ).getElementsByTagName ("mime" );
62- parseMimeTypes (result , mimes );
63- }
64- }
51+ TikaConfig config = new TikaConfig (configStream );
52+ if (config .getParser () instanceof org .apache .tika .parser .CompositeParser ) {
53+ // pick the (decorated) empty parser
54+ Optional <Parser > emptyParser = ((org .apache .tika .parser .CompositeParser ) config .getParser ()).getAllComponentParsers ().stream ()
55+ .filter (p -> isEmptyParser (p ))
56+ .findFirst ();
57+ if (emptyParser .isPresent ()) {
58+ emptyParser .get ().getSupportedTypes (new ParseContext ()).forEach (result ::add );
6559 }
60+ } else {
61+ log .debug ("Tika CompositeParser not used, no parsers configured via custom tika config" );
6662 }
6763 return result ;
6864 }
6965
70-
71- private static void parseMimeTypes (Set <MediaType > result , NodeList mimes ) {
72- /*
73- <parser class="org.apache.tika.parser.EmptyParser">
74- <mime>application/x-archive</mime>
75- <mime>application/x-bzip</mime>
76- <mime>application/x-bzip2</mime>
77- </parser>
78- */
79- for (int j = 0 ; j < mimes .getLength (); j ++) {
80- Node mime = mimes .item (j );
81- if (mime instanceof Element ) {
82- String mimeValue = mime .getTextContent ();
83- mimeValue = StringUtils .emptyToNull (mimeValue );
84- if (mimeValue != null ) {
85- MediaType mediaType = MediaType .parse (mimeValue .trim ());
86- if (mediaType != null ) {
87- result .add (mediaType );
88- }
89- }
90- }
66+ /**
67+ * Returns true if the given parser is an EmptyParser or decorates an EmptyParser.
68+ * @param parser
69+ * @return {@code true} if the given parser is an EmptyParser or decorates an EmptyParser
70+ */
71+ private static boolean isEmptyParser (Parser parser ) {
72+ if (parser instanceof org .apache .tika .parser .EmptyParser ) {
73+ return true ;
74+ } else if (parser instanceof org .apache .tika .parser .ParserDecorator ) {
75+ return isEmptyParser (((ParserDecorator ) parser ).getWrappedParser ());
9176 }
92- }
93-
94- private static DocumentBuilder getBuilder () throws TikaException {
95- return new ParseContext ().getDocumentBuilder ();
77+ return false ;
9678 }
9779}
0 commit comments