|
| 1 | +from pathlib import Path |
| 2 | +from typing import BinaryIO |
| 3 | + |
| 4 | +import pypdfium2 as pdfium |
| 5 | + |
| 6 | +from mindee.error import MindeeError |
| 7 | +from mindee.input import BytesInput |
| 8 | + |
| 9 | + |
class ExtractedPdf:
    """
    An extracted sub-PDF, wrapping a binary stream and its file name.

    The stream must be seekable: it is rewound before being read back out.
    """

    # Binary stream holding the PDF data.
    pdf_bytes: BinaryIO
    # File name associated with the PDF data.
    filename: str

    def __init__(self, pdf_bytes: BinaryIO, filename: str):
        """
        :param pdf_bytes: Binary stream holding the PDF data.
        :param filename: File name associated with the PDF data.
        """
        self.pdf_bytes = pdf_bytes
        self.filename = filename

    def get_page_count(self) -> int:
        """
        Get the number of pages in the PDF file.

        :return: Number of pages reported by pdfium for the stream.
        :raises MindeeError: If the stream cannot be opened or read as a PDF.
        """
        try:
            pdf = pdfium.PdfDocument(self.pdf_bytes)
            try:
                return len(pdf)
            finally:
                # Release the pdfium document handle instead of waiting for
                # garbage collection.
                pdf.close()
        except Exception as exc:
            raise MindeeError(
                "Could not retrieve page count from Extracted PDF object."
            ) from exc

    def write_to_file(self, output_path: str):
        """
        Write the contents of the current PDF object to a file.

        :param output_path: Path of the destination file. The file is always
            saved with a ``.pdf`` extension: it is appended when no extension
            is provided and replaces any other extension.
        :raises MindeeError: If the path points to a directory, is empty, or
            its parent directory does not exist.
        """
        out_path = Path(output_path)
        if out_path.resolve().is_dir():
            raise MindeeError("Provided path is not a file.")
        if not output_path or not out_path.parent.exists():
            raise MindeeError(f"Invalid save path provided {output_path}.")
        # Path.suffix includes the leading dot, so the former comparison
        # against "pdf" was always true and the extension was always forced.
        # Keep that behaviour, but state it explicitly.
        out_path = out_path.with_suffix(".pdf")
        # Rewind first: an earlier read (e.g. as_input_source) may have left
        # the stream at its end, which would produce an empty file.
        self.pdf_bytes.seek(0)
        with open(out_path, "wb") as out_file:
            out_file.write(self.pdf_bytes.read())

    def as_input_source(self) -> "BytesInput":
        """
        Return the current PDF object as a usable BytesInput source.

        The stream is rewound before reading so its full content is used.
        """
        self.pdf_bytes.seek(0)
        return BytesInput(self.pdf_bytes.read(), self.filename)
0 commit comments