Skip to content

Commit 26cb4c8

Browse files
✨ Add support for Invoice Splitter V1 (#130)
1 parent 04887ed commit 26cb4c8

File tree

14 files changed

+645
-30
lines changed

14 files changed

+645
-30
lines changed

docs/client.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,10 @@ PredictResponse
2121
---------------
2222
.. autoclass:: mindee.response.PredictResponse
2323
:members:
24+
25+
AsyncPredictResponse
26+
--------------------
27+
.. autoclass:: mindee.response.AsyncPredictResponse
28+
:members:
29+
.. autoclass:: mindee.response.Job
30+
:members:
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from time import sleep

from mindee import Client, documents

# Init a new client
mindee_client = Client(api_key="my-api-key")

# Load a file from disk
input_doc = mindee_client.doc_from_path("/path/to/the/file.ext")

# Put the document class in a local variable to keep the code DRY
doc_class = documents.TypeInvoiceSplitterV1

# Limit the number of API calls made to retrieve your document
MAX_RETRIES = 10

# How many seconds to wait in-between tries
INTERVAL_SECS = 6

# Send the document to the asynchronous queue
queue_result = input_doc.enqueue(doc_class)

# Get the id of the queue (job)
queue_id = queue_result.job.job_id


def get_doc_from_async_queue(queue_id, times_tried=0):
    """
    Poll the queue until the document is parsed, then print it.

    :param queue_id: Id of the job, as returned when the document was enqueued.
    :param times_tried: Number of attempts already made; polling stops once
        this reaches ``MAX_RETRIES``.
    :raises Exception: If the job is still not "completed" after the
        maximum number of attempts.
    """
    # A plain loop replaces the recursive retry: same number of attempts and
    # same waits, without growing the call stack on every retry.
    while times_tried < MAX_RETRIES:
        # Wait for a few seconds before fetching
        sleep(INTERVAL_SECS)

        # Fetch and parse the result, using the same type
        parsed_result = input_doc.parse_queued(doc_class, queue_id)

        # Check whether the result is ready
        if parsed_result.job.status == "completed":
            # Print a brief summary of the parsed data
            print(parsed_result.document.document)
            return

        # Otherwise, try again...
        times_tried += 1

    # We have exceeded our retry count
    raise Exception(f"Maximum retries reached {times_tried}")


# Start polling...
get_doc_from_async_queue(queue_id)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
Invoice Splitter V1
2+
-------------------
3+
4+
**Sample Code:**
5+
6+
.. literalinclude:: /extras/code_samples/invoice_splitter_v1_async.txt
7+
:language: Python
8+
9+
.. autoclass:: mindee.documents.InvoiceSplitterV1
10+
:members:

mindee/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
from mindee.client import Client, PageOptions
2-
from mindee.response import PredictResponse
2+
from mindee.response import AsyncPredictResponse, Job, PredictResponse

mindee/client.py

Lines changed: 184 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
UrlInputSource,
1616
)
1717
from mindee.logger import logger
18-
from mindee.response import PredictResponse
18+
from mindee.response import AsyncPredictResponse, PredictResponse
1919

2020

2121
def get_bound_classname(type_var) -> str:
@@ -84,41 +84,105 @@ def parse(
8484

8585
logger.debug("Parsing document as '%s'", endpoint_name)
8686

87-
found = []
88-
for k in self.doc_configs.keys():
89-
if k[1] == endpoint_name:
90-
found.append(k)
87+
doc_config = self._check_config(endpoint_name, account_name)
88+
if not isinstance(self.input_doc, UrlInputSource):
89+
if page_options and self.input_doc.is_pdf():
90+
self.input_doc.process_pdf(
91+
page_options.operation,
92+
page_options.on_min_pages,
93+
page_options.page_indexes,
94+
)
95+
return self._make_request(
96+
document_class, doc_config, include_words, close_file, cropper
97+
)
9198

92-
if len(found) == 0:
93-
raise RuntimeError(f"Document type not configured: {endpoint_name}")
99+
def enqueue(
100+
self,
101+
document_class: TypeDocument,
102+
endpoint_name: Optional[str] = None,
103+
account_name: Optional[str] = None,
104+
include_words: bool = False,
105+
close_file: bool = True,
106+
page_options: Optional[PageOptions] = None,
107+
cropper: bool = False,
108+
) -> AsyncPredictResponse[TypeDocument]:
109+
"""
110+
Enqueueing to an async endpoint.
94111
95-
if account_name:
96-
config_key = (account_name, endpoint_name)
97-
elif len(found) == 1:
98-
config_key = found[0]
99-
else:
100-
usernames = [k[0] for k in found]
112+
:param document_class: The document class to use.
113+
The response object will be instantiated based on this parameter.
114+
115+
:param endpoint_name: For custom endpoints, the "API name" field in the "Settings" page of the API Builder.
116+
Do not set for standard (off the shelf) endpoints.
117+
118+
:param account_name: For custom endpoints, your account or organization username on the API Builder.
119+
This is normally not required unless you have a custom endpoint which has the
120+
same name as standard (off the shelf) endpoint.
121+
Do not set for standard (off the shelf) endpoints.
122+
123+
:param include_words: Whether to include the full text for each page.
124+
This performs a full OCR operation on the server and will increase response time.
125+
126+
:param close_file: Whether to ``close()`` the file after parsing it.
127+
Set to ``False`` if you need to access the file after this operation.
128+
129+
:param page_options: If set, remove pages from the document as specified.
130+
This is done before sending the file to the server and is useful to avoid page limitations.
131+
132+
:param cropper: Whether to include cropper results for each page.
133+
This performs a cropping operation on the server and will increase response time.
134+
"""
135+
bound_classname = get_bound_classname(document_class)
136+
if bound_classname != documents.CustomV1.__name__:
137+
endpoint_name = get_bound_classname(document_class)
138+
elif endpoint_name is None:
101139
raise RuntimeError(
102-
(
103-
"Duplicate configuration detected.\n"
104-
f"You specified a document_type '{endpoint_name}' in your custom config.\n"
105-
"To avoid confusion, please add the 'account_name' attribute to "
106-
f"the parse method, one of {usernames}."
107-
)
140+
f"endpoint_name is required when using {bound_classname} class"
108141
)
109142

110-
doc_config = self.doc_configs[config_key]
111-
doc_config.check_api_keys()
143+
logger.debug("Enqueuing document as '%s'", endpoint_name)
144+
145+
doc_config = self._check_config(endpoint_name, account_name)
112146
if not isinstance(self.input_doc, UrlInputSource):
113147
if page_options and self.input_doc.is_pdf():
114148
self.input_doc.process_pdf(
115149
page_options.operation,
116150
page_options.on_min_pages,
117151
page_options.page_indexes,
118152
)
119-
return self._make_request(
120-
document_class, doc_config, include_words, close_file, cropper
121-
)
153+
return self._predict_async(doc_config, include_words, close_file, cropper)
154+
155+
def parse_queued(
    self,
    document_class: TypeDocument,
    queue_id: str,
    endpoint_name: Optional[str] = None,
    account_name: Optional[str] = None,
) -> AsyncPredictResponse[TypeDocument]:
    """
    Parses a queued document.

    :param document_class: The document class to use.
        For any class other than ``CustomV1`` the endpoint name is taken from
        this class and the ``endpoint_name`` argument is ignored.
    :param queue_id: queue_id received from the API
    :param endpoint_name: For custom endpoints, the "API name" field in the "Settings" page of the API Builder.
        Do not set for standard (off the shelf) endpoints.
    :param account_name: For custom endpoints, your account or organization username on the API Builder.
        This is normally not required unless you have a custom endpoint which has the
        same name as a standard (off the shelf) endpoint.
        Do not set for standard (off the shelf) endpoints.
    :return: The response built by ``_get_queued_document`` for this queue id.
    :raises RuntimeError: If ``document_class`` is ``CustomV1`` and no
        ``endpoint_name`` is given.
    """
    bound_classname = get_bound_classname(document_class)
    if bound_classname != documents.CustomV1.__name__:
        # Standard endpoints are keyed by their document class name; reuse the
        # name computed above rather than resolving it a second time.
        endpoint_name = bound_classname
    elif endpoint_name is None:
        raise RuntimeError(
            f"endpoint_name is required when using {bound_classname} class"
        )

    logger.debug("Fetching queued document as '%s'", endpoint_name)

    doc_config = self._check_config(endpoint_name, account_name)

    return self._get_queued_document(doc_config, queue_id)
122186

123187
def _make_request(
124188
self,
@@ -145,18 +209,108 @@ def _make_request(
145209
raise HTTPException(
146210
f"API {response.status_code} HTTP error: {json.dumps(dict_response)}"
147211
)
212+
148213
return PredictResponse[TypeDocument](
149214
http_response=dict_response,
150215
doc_config=doc_config,
151216
input_source=self.input_doc,
152217
response_ok=response.ok,
153218
)
154219

220+
def _predict_async(
    self,
    doc_config: DocumentConfig,
    include_words: bool = False,
    close_file: bool = True,
    cropper: bool = False,
) -> AsyncPredictResponse[TypeDocument]:
    """
    Post the input document to the async endpoint and wrap the API's reply.

    :param doc_config: Configuration of the document; its first endpoint
        is the one the request is sent to.
    :param include_words: Forwarded as-is to ``predict_async_req_post``.
    :param close_file: Forwarded as-is to ``predict_async_req_post``.
    :param cropper: Forwarded as-is to ``predict_async_req_post``.
    :return: An asynchronous predict response built from the decoded JSON body.
    :raises HTTPException: If the HTTP response is not OK and
        ``raise_on_error`` is set on this object.
    """
    endpoint = doc_config.endpoints[0]
    http_reply = endpoint.predict_async_req_post(
        self.input_doc, include_words, close_file, cropper
    )
    payload = http_reply.json()

    request_failed = not http_reply.ok
    if request_failed and self.raise_on_error:
        raise HTTPException(
            f"API {http_reply.status_code} HTTP error: {json.dumps(payload)}"
        )

    return AsyncPredictResponse[TypeDocument](
        http_response=payload,
        doc_config=doc_config,
        input_source=self.input_doc,
        response_ok=http_reply.ok,
    )
249+
250+
def _get_queued_document(
    self,
    doc_config: DocumentConfig,
    queue_id: str,
) -> AsyncPredictResponse[TypeDocument]:
    """
    Fetches a document or a Job from a given queue.

    :param doc_config: Pre-checked document configuration; its first endpoint
        is the one that is queried.
    :param queue_id: Queue_id received from the API
    :return: An asynchronous predict response built from the decoded JSON body.
    :raises HTTPException: If the status code is missing or outside
        the 200-302 range.
    """
    queue_response = doc_config.endpoints[0].document_queue_req_get(
        queue_id=queue_id
    )

    status_code = queue_response.status_code
    if not status_code or status_code < 200 or status_code > 302:
        # Serialize the decoded body, not the response object itself: passing
        # the response object to json.dumps() raises TypeError, which would
        # hide the HTTP error we actually want to report.
        raise HTTPException(
            f"API {status_code} HTTP error: {json.dumps(queue_response.json())}"
        )

    return AsyncPredictResponse[TypeDocument](
        http_response=queue_response.json(),
        doc_config=doc_config,
        input_source=self.input_doc,
        response_ok=queue_response.ok,
    )
280+
155281
def close(self) -> None:
    """Close the input document's file object; URL inputs have none to close."""
    if isinstance(self.input_doc, UrlInputSource):
        return
    self.input_doc.file_object.close()
159285

286+
def _check_config(
    self, endpoint_name: Optional[str], account_name: Optional[str]
) -> DocumentConfig:
    """
    Find the document configuration for an endpoint and check its API keys.

    Configurations are keyed by ``(account_name, endpoint_name)`` tuples.

    :param endpoint_name: Endpoint name to look up (second item of the key).
    :param account_name: Account owning the endpoint (first item of the key).
        When empty, the lookup only succeeds if exactly one configured
        account provides ``endpoint_name``.
    :return: The matching configuration, after ``check_api_keys()`` was called on it.
    :raises RuntimeError: If no configuration exists for ``endpoint_name``, or
        if several accounts provide it and ``account_name`` is not given.
    """
    matches = [key for key in self.doc_configs if key[1] == endpoint_name]

    if not matches:
        raise RuntimeError(f"Document type not configured: {endpoint_name}")

    if account_name:
        config_key = (account_name, endpoint_name)
    elif len(matches) == 1:
        config_key = matches[0]
    else:
        usernames = [key[0] for key in matches]
        raise RuntimeError(
            (
                "Duplicate configuration detected.\n"
                f"You specified a document_type '{endpoint_name}' in your custom config.\n"
                "To avoid confusion, please add the 'account_name' attribute to "
                f"the parse method, one of {usernames}."
            )
        )

    # NOTE(review): an explicit account_name that is not configured for this
    # endpoint is not validated above; presumably it surfaces as a KeyError
    # from this lookup - confirm whether callers rely on that.
    doc_config = self.doc_configs[config_key]
    doc_config.check_api_keys()
    return doc_config
313+
160314

161315
class ConfigSpec(NamedTuple):
162316
doc_class: Type[Document]
@@ -281,7 +435,13 @@ def _init_default_endpoints(self) -> None:
281435
url_name="license_plates",
282436
version="1",
283437
),
438+
ConfigSpec(
439+
doc_class=documents.InvoiceSplitterV1,
440+
url_name="invoice_splitter",
441+
version="1",
442+
),
284443
]
444+
285445
for config in configs:
286446
config_key = (OTS_OWNER, config.doc_class.__name__)
287447
self._doc_configs[config_key] = self._standard_doc_config(

mindee/documents/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
TypeFinancialV1,
99
)
1010
from mindee.documents.invoice import InvoiceV3, InvoiceV4, TypeInvoiceV3, TypeInvoiceV4
11+
from mindee.documents.invoice_splitter import InvoiceSplitterV1, TypeInvoiceSplitterV1
1112
from mindee.documents.passport import PassportV1, TypePassportV1
1213
from mindee.documents.proof_of_address import ProofOfAddressV1, TypeProofOfAddressV1
1314
from mindee.documents.receipt import (

mindee/documents/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def check_api_keys(self) -> None:
2828
raise RuntimeError(
2929
(
3030
f"Missing API key for '{endpoint.url_name} v{endpoint.version}',"
31-
"check your Client configuration.\n"
31+
" check your Client configuration.\n"
3232
"You can set this using the "
3333
f"'{API_KEY_ENV_NAME}' environment variable."
3434
)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .invoice_splitter_v1 import InvoiceSplitterV1, TypeInvoiceSplitterV1

0 commit comments

Comments
 (0)