✨ add OCR output in CLI (#144)

ianardee · web-flow · commit ea5c154d0082 · 2023-06-23T10:33:41.000+02:00
diff --git a/docs/guide/python-cli.md b/docs/guide/python-cli.md
@@ -44,12 +44,18 @@ python3 -m mindee invoice parse /path/to/invoice.pdf
 python3 -m mindee custom -u pikachu -k xxxxxxx pokemon_card /path/to/card.jpg
 ```
 
-### You can get the full parsed output as well
+### Printing the full parsed data as JSON instead of the summary
 
 ```shell
 python3 -m mindee invoice parse -o parsed /path/to/invoice.pdf
 ```
 
+### Extracting all the words using OCR
+
+```shell
+python3 -m mindee invoice parse -t /path/to/invoice.pdf
+```
+
 ### In the Git repo, there's a helper script for it
 
 ```shell
diff --git a/mindee/cli.py b/mindee/cli.py
@@ -140,7 +140,7 @@ def process_parse(args: Namespace, client: Client, doc_class) -> None:
         parsed_data = input_doc.parse(
             doc_class, include_words=args.include_words, page_options=page_options
         )
-    display_doc(args.output_type, parsed_data)
+    display_doc(args.output_type, args.include_words, parsed_data)
 
 
 def process_parse_queued(args: Namespace, client: Client, doc_class) -> None:
@@ -158,20 +158,25 @@ def process_parse_queued(args: Namespace, client: Client, doc_class) -> None:
             document_class=doc_class, queue_id=args.queue_id
         )
     if parsed_data.job.status == "completed" and parsed_data.document is not None:
-        display_doc(args.output_type, parsed_data.document)
+        display_doc(args.output_type, args.include_words, parsed_data.document)
     else:
         print(parsed_data.job)
 
 
-def display_doc(output_type: str, document_response: PredictResponse):
+def display_doc(output_type: str, include_words: bool, response: PredictResponse):
     """Display the parsed document."""
     if output_type == "raw":
-        print(json.dumps(document_response.http_response, indent=2))
+        print(json.dumps(response.http_response, indent=2))
     elif output_type == "parsed":
-        doc = document_response.document
-        print(json.dumps(doc, indent=2, default=serialize_for_json))
+        if include_words:
+            print(json.dumps(response.ocr, indent=2, default=serialize_for_json))
+        print(json.dumps(response.document, indent=2, default=serialize_for_json))
     else:
-        print(document_response.document)
+        if include_words:
+            print("OCR Begin >>>>>>>>>>\n")
+            print(response.ocr)
+            print("<<<<<<<<<< OCR End\n")
+        print(response.document)
 
 
 def process_parse_enqueue(args: Namespace, client: Client, doc_class) -> None: