1515 UrlInputSource ,
1616)
1717from mindee .logger import logger
18- from mindee .response import PredictResponse
18+ from mindee .response import AsyncPredictResponse , PredictResponse
1919
2020
2121def get_bound_classname (type_var ) -> str :
@@ -84,41 +84,105 @@ def parse(
8484
8585 logger .debug ("Parsing document as '%s'" , endpoint_name )
8686
87- found = []
88- for k in self .doc_configs .keys ():
89- if k [1 ] == endpoint_name :
90- found .append (k )
87+ doc_config = self ._check_config (endpoint_name , account_name )
88+ if not isinstance (self .input_doc , UrlInputSource ):
89+ if page_options and self .input_doc .is_pdf ():
90+ self .input_doc .process_pdf (
91+ page_options .operation ,
92+ page_options .on_min_pages ,
93+ page_options .page_indexes ,
94+ )
95+ return self ._make_request (
96+ document_class , doc_config , include_words , close_file , cropper
97+ )
9198
92- if len (found ) == 0 :
93- raise RuntimeError (f"Document type not configured: { endpoint_name } " )
def enqueue(
    self,
    document_class: TypeDocument,
    endpoint_name: Optional[str] = None,
    account_name: Optional[str] = None,
    include_words: bool = False,
    close_file: bool = True,
    page_options: Optional[PageOptions] = None,
    cropper: bool = False,
) -> AsyncPredictResponse[TypeDocument]:
    """
    Send the input document to an asynchronous endpoint's queue.

    :param document_class: The document class to use.
        The response object will be instantiated based on this parameter.

    :param endpoint_name: For custom endpoints, the "API name" field in the
        "Settings" page of the API Builder.
        Ignored for standard (off the shelf) endpoints: the name bound to
        ``document_class`` is used instead.

    :param account_name: For custom endpoints, your account or organization
        username on the API Builder.
        Only needed when a custom endpoint shares its name with a standard
        (off the shelf) endpoint.

    :param include_words: Whether to include the full text for each page.
        This performs a full OCR operation on the server and will increase
        response time.

    :param close_file: Whether to ``close()`` the file after sending it.
        Set to ``False`` if you need to access the file after this operation.

    :param page_options: If set, remove pages from the document as specified.
        Applied locally, to PDF inputs only, before the file is sent.

    :param cropper: Whether to include cropper results for each page.
        This performs a cropping operation on the server and will increase
        response time.

    :raises RuntimeError: if ``document_class`` is the custom class and no
        ``endpoint_name`` was given.
    """
    class_name = get_bound_classname(document_class)
    uses_custom_class = class_name == documents.CustomV1.__name__
    if not uses_custom_class:
        # Standard endpoints are always addressed by their class name.
        endpoint_name = class_name
    elif endpoint_name is None:
        raise RuntimeError(
            f"endpoint_name is required when using {class_name} class"
        )

    logger.debug("Enqueuing document as '%s'", endpoint_name)

    config = self._check_config(endpoint_name, account_name)

    # URL sources are sent as-is; only local PDF inputs can be cut down.
    source = self.input_doc
    if (
        not isinstance(source, UrlInputSource)
        and page_options
        and source.is_pdf()
    ):
        source.process_pdf(
            page_options.operation,
            page_options.on_min_pages,
            page_options.page_indexes,
        )

    return self._predict_async(config, include_words, close_file, cropper)
154+
def parse_queued(
    self,
    document_class: TypeDocument,
    queue_id: str,
    endpoint_name: Optional[str] = None,
    account_name: Optional[str] = None,
) -> AsyncPredictResponse[TypeDocument]:
    """
    Fetch a previously enqueued document.

    :param document_class: The document class to use. For anything other than
        the custom class, its bound name is used as the endpoint name.
    :param queue_id: queue_id received from the API
    :param endpoint_name: For custom endpoints, the "API name" field in the
        "Settings" page of the API Builder.
        Ignored for standard (off the shelf) endpoints.
    :param account_name: For custom endpoints, your account or organization
        username on the API Builder.
        Only needed when a custom endpoint shares its name with a standard
        (off the shelf) endpoint.
    :raises RuntimeError: if ``document_class`` is the custom class and no
        ``endpoint_name`` was given.
    """
    class_name = get_bound_classname(document_class)
    uses_custom_class = class_name == documents.CustomV1.__name__
    if not uses_custom_class:
        # Standard endpoints are always addressed by their class name.
        endpoint_name = class_name
    elif endpoint_name is None:
        raise RuntimeError(
            f"endpoint_name is required when using {class_name} class"
        )

    logger.debug("Fetching queued document as '%s'", endpoint_name)

    config = self._check_config(endpoint_name, account_name)
    return self._get_queued_document(config, queue_id)
122186
123187 def _make_request (
124188 self ,
@@ -145,18 +209,108 @@ def _make_request(
145209 raise HTTPException (
146210 f"API { response .status_code } HTTP error: { json .dumps (dict_response )} "
147211 )
212+
148213 return PredictResponse [TypeDocument ](
149214 http_response = dict_response ,
150215 doc_config = doc_config ,
151216 input_source = self .input_doc ,
152217 response_ok = response .ok ,
153218 )
154219
def _predict_async(
    self,
    doc_config: DocumentConfig,
    include_words: bool = False,
    close_file: bool = True,
    cropper: bool = False,
) -> AsyncPredictResponse[TypeDocument]:
    """
    Post the input document to the async queue and wrap the API's reply.

    :param doc_config: Configuration of the document; its first endpoint
        receives the request.
    :param include_words: Forwarded to the endpoint's async predict request.
    :param close_file: Forwarded to the endpoint's async predict request.
    :param cropper: Forwarded to the endpoint's async predict request.
    :raises HTTPException: if the reply is not OK and ``self.raise_on_error``
        is set.
    """
    endpoint = doc_config.endpoints[0]
    http_reply = endpoint.predict_async_req_post(
        self.input_doc, include_words, close_file, cropper
    )
    payload = http_reply.json()

    request_failed = not http_reply.ok
    if request_failed and self.raise_on_error:
        raise HTTPException(
            f"API {http_reply.status_code} HTTP error: {json.dumps(payload)}"
        )

    return AsyncPredictResponse[TypeDocument](
        http_response=payload,
        doc_config=doc_config,
        input_source=self.input_doc,
        response_ok=http_reply.ok,
    )
249+
def _get_queued_document(
    self,
    doc_config: DocumentConfig,
    queue_id: str,
) -> AsyncPredictResponse[TypeDocument]:
    """
    Fetch a document or a job from a given queue.

    :param doc_config: Pre-checked document configuration; its first endpoint
        is queried.
    :param queue_id: queue_id received from the API
    :return: The API's reply wrapped in an asynchronous predict response.
    :raises HTTPException: if the reply has no status code, or one outside
        the 200-302 range.
    """
    queue_response = doc_config.endpoints[0].document_queue_req_get(
        queue_id=queue_id
    )

    status_code = queue_response.status_code
    # NOTE(review): the upper bound of 302 presumably lets a redirect to the
    # finished document through -- confirm against the API behaviour.
    if not status_code or status_code < 200 or status_code > 302:
        # Serialize the decoded body, not the response object: the object
        # itself is not JSON-serializable and would raise TypeError here,
        # hiding the HTTP error from the caller.
        try:
            error_body = json.dumps(queue_response.json())
        except ValueError:
            # The body is not valid JSON; fall back to its raw text if any.
            error_body = str(getattr(queue_response, "text", ""))
        raise HTTPException(f"API {status_code} HTTP error: {error_body}")

    return AsyncPredictResponse[TypeDocument](
        http_response=queue_response.json(),
        doc_config=doc_config,
        input_source=self.input_doc,
        response_ok=queue_response.ok,
    )
280+
def close(self) -> None:
    """Close the input's file object; does nothing for URL input sources."""
    if isinstance(self.input_doc, UrlInputSource):
        return
    self.input_doc.file_object.close()
159285
def _check_config(
    self, endpoint_name: str, account_name: Optional[str]
) -> "DocumentConfig":
    """
    Find the document configuration for an endpoint and check its API keys.

    ``self.doc_configs`` is looked up by ``(account_name, endpoint_name)`` keys.

    :param endpoint_name: Name of the endpoint to look for.
    :param account_name: Owner of the endpoint. Only required when several
        accounts have a configuration for the same ``endpoint_name``.
    :return: The matching configuration, after ``check_api_keys()`` has been
        called on it.
    :raises RuntimeError: if no configuration exists for ``endpoint_name``,
        if none exists for the given ``account_name``, or if several exist
        and no ``account_name`` was given.
    """
    matches = [key for key in self.doc_configs if key[1] == endpoint_name]
    if not matches:
        raise RuntimeError(f"Document type not configured: {endpoint_name}")

    if account_name:
        config_key = (account_name, endpoint_name)
        if config_key not in self.doc_configs:
            # Fail like the other lookup errors here rather than letting a
            # bare KeyError on the tuple escape from the lookup below.
            raise RuntimeError(
                f"Document type not configured: {endpoint_name} "
                f"for account '{account_name}'"
            )
    elif len(matches) == 1:
        config_key = matches[0]
    else:
        usernames = [key[0] for key in matches]
        raise RuntimeError(
            (
                "Duplicate configuration detected.\n"
                f"You specified a document_type '{endpoint_name}' in your custom config.\n"
                "To avoid confusion, please add the 'account_name' attribute to "
                f"the parse method, one of {usernames}."
            )
        )

    doc_config = self.doc_configs[config_key]
    doc_config.check_api_keys()
    return doc_config
313+
160314
161315class ConfigSpec (NamedTuple ):
162316 doc_class : Type [Document ]
@@ -281,7 +435,13 @@ def _init_default_endpoints(self) -> None:
281435 url_name = "license_plates" ,
282436 version = "1" ,
283437 ),
438+ ConfigSpec (
439+ doc_class = documents .InvoiceSplitterV1 ,
440+ url_name = "invoice_splitter" ,
441+ version = "1" ,
442+ ),
284443 ]
444+
285445 for config in configs :
286446 config_key = (OTS_OWNER , config .doc_class .__name__ )
287447 self ._doc_configs [config_key ] = self ._standard_doc_config (
0 commit comments