Skip to content

Commit 84c761d

Browse files
committed
✨ now possible to read file contents at any time
1 parent 4275f79 commit 84c761d

File tree

4 files changed

+33
-19
lines changed

4 files changed

+33
-19
lines changed

mindee/http.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import os
2-
from typing import Optional, Tuple
2+
from typing import Optional
33

44
import requests
55

@@ -82,19 +82,6 @@ def set_api_key_from_env(self) -> None:
8282
self.api_key = env_key
8383
logger.debug("Set from environment: %s", self.envvar_key_name)
8484

85-
@staticmethod
86-
def _read_document(
87-
input_file: InputDocument, close_file: bool
88-
) -> Tuple[str, bytes]:
89-
logger.debug("Reading data from: %s", input_file.filename)
90-
input_file.file_object.seek(0)
91-
data = input_file.file_object.read()
92-
if close_file:
93-
input_file.file_object.close()
94-
else:
95-
input_file.file_object.seek(0)
96-
return input_file.filename, data
97-
9885
def predict_request(
9986
self,
10087
input_file: InputDocument,
@@ -109,7 +96,7 @@ def predict_request(
10996
:param close_file: Whether to `close()` the file after parsing it.
11097
:return: requests response
11198
"""
112-
files = {"document": self._read_document(input_file, close_file)}
99+
files = {"document": input_file.read_contents(close_file)}
113100
headers = {"Authorization": self.api_key, "User-Agent": USER_AGENT}
114101
data = {}
115102
if include_words:
@@ -132,7 +119,7 @@ def training_request(
132119
:return: requests response
133120
:param close_file: Whether to `close()` the file after parsing it.
134121
"""
135-
files = {"document": self._read_document(input_file, close_file)}
122+
files = {"document": input_file.read_contents(close_file)}
136123
headers = {"Authorization": self.api_key, "User-Agent": USER_AGENT}
137124
params = {"training": True, "with_candidates": True}
138125

mindee/inputs.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import io
33
import os
44
from mimetypes import guess_type
5-
from typing import BinaryIO, Optional
5+
from typing import BinaryIO, Optional, Tuple
66

77
import pikepdf
88

@@ -96,7 +96,7 @@ def is_pdf_empty(self) -> bool:
9696
"""
9797
Check if the PDF is empty.
9898
99-
:return: (void) Check if the document contain only empty pages
99+
:return: ``True`` if the PDF is empty
100100
"""
101101
self.file_object.seek(0)
102102
with pikepdf.open(self.file_object) as pdf:
@@ -131,6 +131,22 @@ def check_pdf_open(self) -> None:
131131
except Exception as err:
132132
raise RuntimeError("Couldn't open PDF file") from err
133133

134+
def read_contents(self, close_file: bool) -> Tuple[str, bytes]:
135+
"""
136+
Read the contents of the input file.
137+
138+
:param close_file: whether to close the file after reading
139+
:return: a Tuple with the file name and binary data
140+
"""
141+
logger.debug("Reading data from: %s", self.filename)
142+
self.file_object.seek(0)
143+
data = self.file_object.read()
144+
if close_file:
145+
self.file_object.close()
146+
else:
147+
self.file_object.seek(0)
148+
return self.filename, data
149+
134150

135151
class FileDocument(InputDocument):
136152
def __init__(

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ lxml==4.8.0
1414
# via pikepdf
1515
packaging==21.3
1616
# via pikepdf
17-
pikepdf==5.0.1
17+
pikepdf==5.1.2
1818
# via mindee (setup.py)
1919
pillow==9.0.1
2020
# via pikepdf

tests/test_inputs.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,17 @@ def test_pdf_reconstruct_ok():
1919
assert isinstance(input_file.file_object, io.BytesIO)
2020

2121

22+
def test_read_contents():
23+
input_doc = PathDocument("./tests/data/invoices/invoice.pdf")
24+
contents = input_doc.read_contents(close_file=False)
25+
assert contents[0] == "invoice.pdf"
26+
assert isinstance(contents[1], bytes)
27+
assert not input_doc.file_object.closed
28+
29+
input_doc.read_contents(close_file=True)
30+
assert input_doc.file_object.closed
31+
32+
2233
def test_pdf_reconstruct_no_cut():
2334
input_file = PathDocument("./tests/data/invoices/invoice_10p.pdf", cut_pdf=False)
2435
assert input_file.count_pdf_pages() == 10

0 commit comments

Comments
 (0)