Skip to content

Commit 7eb2775

Browse files
✨ add support for invoice splitter auto-extraction (#253)
1 parent eda5fba commit 7eb2775

File tree

19 files changed

+355
-12
lines changed

19 files changed

+355
-12
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""Example: split a multi-invoice PDF, then parse each invoice separately."""

import os

from mindee import Client
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
from mindee.input import PathInput
from mindee.product import InvoiceSplitterV1, InvoiceV4

api_key = os.getenv("MINDEE_API_KEY")
mindee_client = Client(api_key=api_key)

input_path = "path/to/your/file.ext"
input_source = PathInput(input_path)

# The extractor is only built for PDF inputs; anything else goes straight to
# the single-invoice parsing at the bottom of the script.
pdf_extractor = PdfExtractor(input_source) if input_source.is_pdf() else None

if pdf_extractor is not None and pdf_extractor.get_page_count() > 1:
    # NOTE(review): close_file=False presumably keeps the source readable for
    # the extraction step below -- confirm against the client documentation.
    invoice_splitter_response = mindee_client.enqueue_and_parse(
        InvoiceSplitterV1, input_source, close_file=False
    )
    page_groups = (
        invoice_splitter_response.document.inference.prediction.invoice_page_groups
    )
    extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict=False)

    for extracted_pdf in extracted_pdfs:
        # Optional: Save the files locally
        # extracted_pdf.write_to_file("output/path")

        invoice_result = mindee_client.parse(
            InvoiceV4, extracted_pdf.as_input_source()
        )
        print(invoice_result.document)
else:
    # Non-PDF input or a single-page PDF: parse the file as one invoice.
    invoice_result = mindee_client.parse(InvoiceV4, input_source)
    print(invoice_result.document)

examples/multi_receipts_tutorial.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from mindee import Client, PredictResponse, product
2-
from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
2+
from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import (
33
extract_receipts,
44
)
55

mindee/extraction/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from mindee.extraction.common.extracted_image import ExtractedImage
2+
from mindee.extraction.common.image_extractor import (
3+
attach_image_as_new_file,
4+
extract_multiple_images_from_source,
5+
)
6+
from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor
7+
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
8+
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from mindee.extraction.common.extracted_image import ExtractedImage
2+
from mindee.extraction.common.image_extractor import (
3+
attach_image_as_new_file,
4+
extract_multiple_images_from_source,
5+
)

mindee/image_extraction/common/image_extractor.py renamed to mindee/extraction/common/image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
from PIL import Image
66

77
from mindee.error import MindeeError
8+
from mindee.extraction.common import ExtractedImage
89
from mindee.geometry import Point, get_min_max_x, get_min_max_y
9-
from mindee.image_extraction.common import ExtractedImage
1010
from mindee.input import BytesInput, LocalInputSource
1111

1212

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor

mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py renamed to mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from typing import List
22

33
from mindee.error import MindeeError
4-
from mindee.image_extraction.common.extracted_image import ExtractedImage
5-
from mindee.image_extraction.common.image_extractor import (
4+
from mindee.extraction.common.extracted_image import ExtractedImage
5+
from mindee.extraction.common.image_extractor import (
66
extract_multiple_images_from_source,
77
)
88
from mindee.input import LocalInputSource
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
2+
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from pathlib import Path
2+
from typing import BinaryIO
3+
4+
import pypdfium2 as pdfium
5+
6+
from mindee.error import MindeeError
7+
from mindee.input import BytesInput
8+
9+
10+
class ExtractedPdf:
    """An extracted sub-PDF, kept in memory as a binary stream."""

    # Binary stream holding the PDF content; it is rewound with seek(0)
    # before being read in full, so it must be seekable.
    pdf_bytes: BinaryIO
    # Name passed to the BytesInput built by as_input_source().
    filename: str

    def __init__(self, pdf_bytes: BinaryIO, filename: str):
        """
        Store the PDF stream and its file name.

        :param pdf_bytes: Binary stream containing the PDF data.
        :param filename: Name associated with the PDF.
        """
        self.pdf_bytes = pdf_bytes
        self.filename = filename

    def get_page_count(self) -> int:
        """
        Get the number of pages in the PDF file.

        :return: Number of pages reported by pdfium for the stream.
        :raises MindeeError: If the stream cannot be opened or counted.
        """
        try:
            pdf = pdfium.PdfDocument(self.pdf_bytes)
            return len(pdf)
        except Exception as exc:
            raise MindeeError(
                "Could not retrieve page count from Extracted PDF object."
            ) from exc

    def write_to_file(self, output_path: str):
        """
        Writes the contents of the current PDF object to a file.

        :param output_path: Path of the destination file. If the path does not
            already end in ``.pdf`` (case-insensitive), its last extension is
            replaced by ``.pdf``, or ``.pdf`` is appended when it has none.
        :raises MindeeError: If the path is a directory, is empty, or its
            parent directory does not exist.
        """
        out_path = Path(output_path)
        if out_path.resolve().is_dir():
            raise MindeeError("Provided path is not a file.")
        if not output_path or not out_path.parent.exists():
            raise MindeeError(f"Invalid save path provided {output_path}.")
        # Path.suffix includes the leading dot, so compare against ".pdf".
        if out_path.suffix.lower() != ".pdf":
            out_path = out_path.parent / (out_path.stem + ".pdf")
        # Rewind first: an earlier read (e.g. as_input_source) leaves the
        # stream at its end, which would otherwise produce an empty file.
        self.pdf_bytes.seek(0)
        with open(out_path, "wb") as out_file:
            out_file.write(self.pdf_bytes.read())

    def as_input_source(self) -> BytesInput:
        """
        Returns the current PDF object as a usable BytesInput source.

        :return: A BytesInput built from the full stream content and filename.
        """
        self.pdf_bytes.seek(0)
        return BytesInput(self.pdf_bytes.read(), self.filename)

0 commit comments

Comments
 (0)