Skip to content

Commit 4275f79

Browse files
committed
the MIME type is not the extension
1 parent f2bd51d commit 4275f79

File tree

4 files changed

+51
-46
lines changed

4 files changed

+51
-46
lines changed

mindee/documents/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def __init__(
3838
if input_file:
3939
self.filepath = input_file.filepath
4040
self.filename = input_file.filename
41-
self.file_extension = input_file.file_extension
41+
self.file_extension = input_file.file_mimetype
4242

4343
self.type = document_type
4444

mindee/documents/financial_document.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ def request(
165165
:param include_words: Include Mindee vision words in http_response
166166
:param close_file: Whether to `close()` the file after parsing it.
167167
"""
168-
if "pdf" in input_file.file_extension:
168+
if "pdf" in input_file.file_mimetype:
169169
# invoices is index 0, receipts 1 (this should be cleaned up)
170170
index = 0
171171
else:

mindee/inputs.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from mindee.logger import logger
1010

11-
ALLOWED_EXTENSIONS = [
11+
ALLOWED_MIME_TYPES = [
1212
"image/png",
1313
"image/jpg",
1414
"image/jpeg",
@@ -20,12 +20,12 @@
2020
INPUT_TYPE_BASE64 = "base64"
2121
INPUT_TYPE_BYTES = "bytes"
2222
INPUT_TYPE_PATH = "path"
23-
INPUT_TYPE_DUMMY = "dummy"
2423

2524

2625
class InputDocument:
2726
file_object: BinaryIO
2827
filename: str
28+
file_mimetype: str
2929
input_type: str
3030
filepath: Optional[str] = None
3131
cut_pdf: bool
@@ -39,17 +39,9 @@ def __init__(
3939
):
4040
assert 0 < n_pdf_pages <= 3
4141
self.input_type = input_type
42-
self.file_extension = guess_type(self.filename)[0]
42+
self._check_mimetype()
4343

44-
if (
45-
self.file_extension not in ALLOWED_EXTENSIONS
46-
and self.input_type != INPUT_TYPE_DUMMY
47-
):
48-
raise AssertionError(
49-
"File type not allowed, must be in {%s}" % ", ".join(ALLOWED_EXTENSIONS)
50-
)
51-
52-
if self.file_extension == "application/pdf":
44+
if self.file_mimetype == "application/pdf":
5345
self.check_pdf_open()
5446
count_pages = self.count_pdf_pages()
5547
if cut_pdf is True:
@@ -61,6 +53,19 @@ def __init__(
6153
raise AssertionError(f"PDF pages are empty in: {self.filename}")
6254
logger.debug("Loaded new document '%s' from %s", self.filename, self.input_type)
6355

56+
def _check_mimetype(self) -> None:
    """
    Guess the MIME type from ``filename`` and make sure it is accepted.

    On success the guessed value is stored in ``file_mimetype``.

    :raises AssertionError: when no MIME type can be guessed from the
        filename, or when the guessed type is not in ``ALLOWED_MIME_TYPES``.
    :return: None
    """
    # guess_type returns a (type, encoding) pair; only the type matters here.
    guessed_type, _encoding = guess_type(self.filename)
    if not guessed_type:
        raise AssertionError(f"Could not determine MIME type of '{self.filename}'")
    self.file_mimetype = guessed_type

    # Guard clause: nothing more to do for an accepted type.
    if self.file_mimetype in ALLOWED_MIME_TYPES:
        return
    allowed_listing = ", ".join(ALLOWED_MIME_TYPES)
    raise AssertionError(
        "File type not allowed, must be one of {%s}" % allowed_listing
    )
68+
6469
def count_pdf_pages(self) -> int:
6570
"""
6671
Count the pages in the PDF.
@@ -73,10 +78,10 @@ def count_pdf_pages(self) -> int:
7378

7479
def merge_pdf_pages(self, page_numbers: list) -> None:
7580
"""
76-
Create a new PDF from pages.
81+
Create a new PDF from pages and set it to ``file_object``.
7782
7883
:param page_numbers: List of page numbers to use for merging in the original PDF.
79-
:return: (void) Set the Input.file with the reconstructed pdf stream
84+
:return: None
8085
"""
8186
self.file_object.seek(0)
8287
new_pdf = pikepdf.Pdf.new()

mindee/response.py

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,37 @@
55
from mindee.logger import logger
66

77

8+
class DocumentResponse:
    """Hold the raw API response together with the parsed document objects."""

    http_response: Dict[str, Any]
    """Raw HTTP response JSON"""
    document_type: str
    """Document type"""

    # NOTE(review): ``TypeDocument`` is given as the *default value* of
    # ``document`` rather than as its annotation -- confirm this is intended.
    def __init__(
        self,
        doc_config: DocumentConfig,
        http_response: dict,
        pages: List[Document],
        document_type: str,
        document=TypeDocument,
    ):
        """
        Store the response data and expose the parsed results as attributes.

        The document-level object and the page-level list are attached under
        attribute names taken from ``doc_config``.

        :param doc_config: Supplies ``singular_name`` and ``plural_name``, the
            attribute names used for ``document`` and ``pages`` respectively
        :param http_response: Raw HTTP response object
        :param pages: List of document objects, page level
        :param document_type: Document class
        :param document: reconstructed object from all pages
        """
        self.http_response = http_response
        self.document_type = document_type
        # Same assignment order as before: singular (document) first, then
        # plural (pages), so a name collision resolves identically.
        dynamic_attributes = (
            (doc_config.singular_name, document),
            (doc_config.plural_name, pages),
        )
        for attribute_name, attribute_value in dynamic_attributes:
            setattr(self, attribute_name, attribute_value)
35+
836
def format_response(
937
doc_config: DocumentConfig, http_response: dict, document_type: str, input_file
10-
):
38+
) -> DocumentResponse:
1139
"""
1240
Create a `DocumentResponse`.
1341
@@ -21,7 +49,7 @@ def format_response(
2149
http_response["input_type"] = input_file.input_type
2250
http_response["filename"] = input_file.filename
2351
http_response["filepath"] = input_file.filepath
24-
http_response["file_extension"] = input_file.file_extension
52+
http_response["file_extension"] = input_file.file_mimetype
2553
pages = []
2654

2755
logger.debug("Handling API response")
@@ -50,31 +78,3 @@ def format_response(
5078
document_type=document_type,
5179
document=document_level,
5280
)
53-
54-
55-
class DocumentResponse:
56-
http_response: Dict[str, Any]
57-
"""Raw HTTP response JSON"""
58-
document_type: str
59-
"""Document type"""
60-
61-
def __init__(
62-
self,
63-
doc_config: DocumentConfig,
64-
http_response: dict,
65-
pages: List[Document],
66-
document_type: str,
67-
document=TypeDocument,
68-
):
69-
"""
70-
Container for the raw API response and the parsed document.
71-
72-
:param http_response: Raw HTTP response object
73-
:param pages: List of document objects, page level
74-
:param document: reconstructed object from all pages
75-
:param document_type: Document class
76-
"""
77-
self.http_response = http_response
78-
self.document_type = document_type
79-
setattr(self, doc_config.singular_name, document)
80-
setattr(self, doc_config.plural_name, pages)

0 commit comments

Comments
 (0)