Skip to content

Commit df93565

Browse files
committed
✨ Add tika fallback for text extraction
1 parent 5d985d0 commit df93565

File tree

12 files changed

+327
-186
lines changed

12 files changed

+327
-186
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ data/archive
33
data/model_type_prediction.ftz
44
debug.sqlite3
55
data/servicelayer-archive
6+
# tika
7+
contrib/tika-server.jar
68
# documentation
79
site
810
# Byte-compiled / optimized / DLL files

ingestors/ingestor.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
import logging
2+
import typing
23

34
from rigour.mime import normalize_extension, normalize_mimetype
45

6+
if typing.TYPE_CHECKING:
7+
from ingestors.manager import Manager
8+
59
log = logging.getLogger(__name__)
610

711

@@ -12,7 +16,7 @@ class Ingestor(object):
1216
EXTENSIONS = []
1317
SCORE = 3
1418

15-
def __init__(self, manager):
19+
def __init__(self, manager: "Manager"):
1620
self.manager = manager
1721

1822
def ingest(self, file_path, entity):

ingestors/manager.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from ingestors import __version__
2727
from ingestors.directory import DirectoryIngestor
2828
from ingestors.exc import ENCRYPTED_MSG, ProcessingException
29+
from ingestors.ingestor import Ingestor
30+
from ingestors.misc.tika import TikaIngestor
2931
from ingestors.settings import Settings
3032
from ingestors.util import filter_text, remove_directory
3133

@@ -146,7 +148,7 @@ def emit_text_fragment(self, entity, texts, fragment):
146148
doc.add("indexText", texts)
147149
self.emit_entity(doc, fragment=safe_fragment(fragment))
148150

149-
def auction(self, file_path, entity):
151+
def auction(self, file_path, entity) -> type[Ingestor]:
150152
if not entity.has("mimeType"):
151153
if file_path.is_dir():
152154
entity.add("mimeType", DirectoryIngestor.MIME_TYPE)
@@ -163,6 +165,10 @@ def auction(self, file_path, entity):
163165
best_score = score
164166
best_cls = cls
165167

168+
settings = Settings()
169+
if settings.tika_fallback:
170+
best_cls = TikaIngestor
171+
166172
if best_cls is None:
167173
raise ProcessingException("Format not supported")
168174
return best_cls

ingestors/misc/tika.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from typing import Any
2+
3+
from followthemoney import EntityProxy, model
4+
from rigour.mime import normalize_mimetype
5+
6+
from ingestors.exc import ProcessingException
7+
from ingestors.ingestor import Ingestor
8+
from ingestors.support.tika import TikaSupport
9+
10+
11+
def extract_mimetype(result: dict[str, Any]) -> str | None:
    """Return the normalized MIME type reported in a Tika parse result.

    The ``Content-Type`` header is looked up on the top level of ``result``
    first (the historic behaviour) and, if absent there, in the nested
    ``metadata`` mapping, which is where tika-python places document headers.

    Args:
        result: The dict produced by the Tika parser.

    Returns:
        Whatever ``normalize_mimetype`` yields for the media type once any
        parameters (everything from the first ``;``) have been removed.
    """
    mime = result.get("Content-Type")
    if mime is None:
        metadata = result.get("metadata")
        if isinstance(metadata, dict):
            mime = metadata.get("Content-Type")
    # tika-python may hand back a list when a header carries several values;
    # the first entry is used in that case.
    if isinstance(mime, (list, tuple)):
        mime = mime[0] if mime else None
    if isinstance(mime, str):
        # Drop parameters such as "; charset=UTF-8".
        mime = mime.split(";", 1)[0]
    return normalize_mimetype(mime)
16+
17+
18+
class TikaIngestor(Ingestor, TikaSupport):
    """Ingestor that passes a file to Apache Tika and emits the text."""

    SCORE = 1

    def ingest(self, file_path: str, entity: EntityProxy):
        """Run Tika on ``file_path`` and emit a ``tika`` fragment.

        The emitted fragment shares the schema and id of ``entity`` and
        carries the ``mimeType`` and ``bodyText`` taken from the Tika result.
        Nothing is emitted when the extraction result is empty.

        Raises:
            ProcessingException: if extraction or emitting the fragment
                raises any exception; the original error is chained.
        """
        with open(file_path, "rb") as fh:
            try:
                checksum = entity.first("contentHash")
                extracted = self.extract_tika(fh, cache_key=checksum)
                if not extracted:
                    return
                fragment = model.make_entity(entity.schema)
                fragment.id = entity.id
                fragment.add("mimeType", extract_mimetype(extracted))
                fragment.add("bodyText", extracted["content"])
                self.manager.emit_entity(fragment, fragment="tika")
            except Exception as exc:
                message = "Cannot extract tika text: %s" % exc
                raise ProcessingException(message) from exc

ingestors/settings.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ class Settings(OpenAlephSettings):
2323
convert_timeout: int = 300
2424
"""Headless libreoffice document convert timeout in seconds"""
2525

26+
tika_fallback: bool = False
27+
"""Use Apache Tika as a text extraction fallback"""
28+
2629

2730
_settings = OpenAlephSettings()
2831

ingestors/support/tika.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import logging
2+
from io import BytesIO
3+
from typing import Any
4+
5+
from normality import collapse_spaces
6+
from tika import parser
7+
8+
from ingestors.support.cache import CacheSupport
9+
10+
log = logging.getLogger(__name__)
11+
12+
13+
class TikaSupport(CacheSupport):
    """Mixin that extracts text through Apache Tika with optional caching."""

    def extract_tika(
        self, fh: BytesIO, cache_key: str | None = None
    ) -> dict[str, Any] | None:
        """Parse an open binary file with Tika and return the parser result.

        Args:
            fh: File object handed to ``tika.parser.from_file``.
            cache_key: Optional checksum. When given, a previously stored
                result is looked up in ``self.tags`` under a key derived
                from it, and a fresh result with text is stored there.

        Returns:
            The parser result. When it is a dict with non-empty ``content``,
            that text has been passed through ``collapse_spaces``.
        """
        _cache_key = None
        if cache_key:
            _cache_key = self.cache_key("tika", cache_key)
            cached = self.tags.get(_cache_key)
            if cached is not None:
                # Lazy %-args: the message is only built if INFO is enabled.
                log.info("Tika: cached result for checksum %s", cache_key)
                return cached

        result = parser.from_file(fh)
        if isinstance(result, dict):
            text = result.get("content")
            if text:
                result["content"] = collapse_spaces(text)
                # Only results that actually contain text are cached, so an
                # empty extraction is retried on the next run.
                if _cache_key:
                    self.tags.set(_cache_key, result)
        return result

poetry.lock

Lines changed: 222 additions & 181 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ dependencies = [
4141
"boto3 (>=1.11.9,<2.0.0)",
4242
"openaleph-procrastinate (>=5.0.0,<6.0.0)",
4343
"openaleph-servicelayer (>=1.23.3, <1.24.0)",
44+
"tika (>=3.1.0,<4.0.0)",
4445
]
4546

4647
[project.scripts]
@@ -87,6 +88,7 @@ svg = "ingestors.media.svg:SVGIngestor"
8788
audio = "ingestors.media.audio:AudioIngestor"
8889
video = "ingestors.media.video:VideoIngestor"
8990
json = "ingestors.misc.jsonfile:JSONIngestor"
91+
tika = "ingestors.misc.tika:TikaIngestor"
9092

9193
[project.optional-dependencies]
9294
ocr = ["tesserocr (==2.6.2)"]
@@ -116,3 +118,5 @@ OPENALEPH_ANALYZE_DEFER = 0
116118
OPENALEPH_INDEX_DEFER = 0
117119
PROCRASTINATE_DB_URI = "memory://"
118120
FTM_FRAGMENTS_URI = "sqlite:///ingest_test.db"
121+
TIKA_PATH = "./contrib"
122+
INGESTORS_TIKA_FALLBACK = 1

requirements-dev.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ bump2version==1.0.1 ; python_version >= "3.11" and python_version < "3.14"
44
cfgv==3.4.0 ; python_version >= "3.11" and python_version < "3.14"
55
click==8.2.1 ; python_version >= "3.11" and python_version < "3.14"
66
colorama==0.4.6 ; python_version >= "3.11" and python_version < "3.14" and (platform_system == "Windows" or sys_platform == "win32")
7-
coverage==7.10.5 ; python_version >= "3.11" and python_version < "3.14"
7+
coverage==7.10.6 ; python_version >= "3.11" and python_version < "3.14"
88
distlib==0.4.0 ; python_version >= "3.11" and python_version < "3.14"
99
fakeredis==2.30.3 ; python_version >= "3.11" and python_version < "3.14"
1010
filelock==3.19.1 ; python_version >= "3.11" and python_version < "3.14"

requirements.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ openaleph-servicelayer==1.23.3 ; python_version >= "3.11" and python_version < "
7575
openpyxl==3.1.5 ; python_version >= "3.11" and python_version < "3.14"
7676
orjson==3.11.3 ; python_version >= "3.11" and python_version < "3.14"
7777
pathspec==0.12.1 ; python_version >= "3.11" and python_version < "3.14"
78-
phonenumbers==9.0.12 ; python_version >= "3.11" and python_version < "3.14"
78+
phonenumbers==9.0.13 ; python_version >= "3.11" and python_version < "3.14"
7979
pillow==11.3.0 ; python_version >= "3.11" and python_version < "3.14"
8080
platformdirs==4.4.0 ; python_version >= "3.11" and python_version < "3.14"
8181
prefixdate==0.5.0 ; python_version >= "3.11" and python_version < "3.14"
@@ -111,7 +111,7 @@ rapidfuzz==3.14.0 ; python_version >= "3.11" and python_version < "3.14"
111111
rarfile==4.2 ; python_version >= "3.11" and python_version < "3.14"
112112
rdflib==7.1.4 ; python_version >= "3.11" and python_version < "3.14"
113113
redis==6.4.0 ; python_version >= "3.11" and python_version < "3.14"
114-
regex==2025.7.34 ; python_version >= "3.11" and python_version < "3.14"
114+
regex==2025.8.29 ; python_version >= "3.11" and python_version < "3.14"
115115
requests==2.32.5 ; python_version >= "3.11" and python_version < "3.14"
116116
rich==14.1.0 ; python_version >= "3.11" and python_version < "3.14"
117117
rigour==1.3.0 ; python_version >= "3.11" and python_version < "3.14"
@@ -120,6 +120,7 @@ s3transfer==0.13.1 ; python_version >= "3.11" and python_version < "3.14"
120120
scikit-learn==1.7.1 ; python_version >= "3.11" and python_version < "3.14"
121121
scipy==1.16.1 ; python_version >= "3.11" and python_version < "3.14"
122122
sentry-sdk==2.0.1 ; python_version >= "3.11" and python_version < "3.14"
123+
setuptools==80.9.0 ; python_version >= "3.11" and python_version < "3.14"
123124
shellingham==1.5.4 ; python_version >= "3.11" and python_version < "3.14"
124125
shortuuid==1.0.13 ; python_version >= "3.11" and python_version < "3.14"
125126
six==1.17.0 ; python_version >= "3.11" and python_version < "3.14"
@@ -130,6 +131,7 @@ structlog==24.4.0 ; python_version >= "3.11" and python_version < "3.14"
130131
texttable==1.7.0 ; python_version >= "3.11" and python_version < "3.14"
131132
textual==4.0.0 ; python_version >= "3.11" and python_version < "3.14"
132133
threadpoolctl==3.6.0 ; python_version >= "3.11" and python_version < "3.14"
134+
tika==3.1.0 ; python_version >= "3.11" and python_version < "3.14"
133135
tqdm==4.67.1 ; python_version >= "3.11" and python_version < "3.14"
134136
typer==0.17.3 ; python_version >= "3.11" and python_version < "3.14"
135137
typing-extensions==4.15.0 ; python_version >= "3.11" and python_version < "3.14"

0 commit comments

Comments
 (0)