diff --git a/pageindex/client.py b/pageindex/client.py index 894dab181..1924ad1bf 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -5,12 +5,10 @@ import concurrent.futures from pathlib import Path -import PyPDF2 - from .page_index import page_index from .page_index_md import md_to_tree from .retrieve import get_document, get_document_structure, get_page_content -from .utils import ConfigLoader, remove_fields +from .utils import ConfigLoader, read_pdf_pages, remove_fields META_INDEX = "_meta.json" @@ -32,7 +30,8 @@ class PageIndexClient: For agent-based QA, see examples/agentic_vectorless_rag_demo.py. """ - def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None): + def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, + workspace: str = None): if api_key: os.environ["OPENAI_API_KEY"] = api_key elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): @@ -74,14 +73,11 @@ def index(self, file_path: str, mode: str = "auto") -> str: if_add_node_summary='yes', if_add_node_text='yes', if_add_node_id='yes', - if_add_doc_description='yes' + if_add_doc_description='yes', ) # Extract per-page text so queries don't need the original PDF - pages = [] - with open(file_path, 'rb') as f: - pdf_reader = PyPDF2.PdfReader(f) - for i, page in enumerate(pdf_reader.pages, 1): - pages.append({'page': i, 'content': page.extract_text() or ''}) + page_texts = read_pdf_pages(file_path) + pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)] self.documents[doc_id] = { 'id': doc_id, diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 9004309fb..735f1ed1e 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1065,9 +1065,9 @@ async def tree_parser(page_list, opt, doc=None, logger=None): def page_index_main(doc, opt=None): logger = JsonLogger(doc) - + is_valid_pdf = ( - (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or + (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or isinstance(doc, BytesIO) ) if not is_valid_pdf: @@ -1112,7 +1112,7 @@ async def page_index_builder(): def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None): - + user_opt = { arg: value for arg, value in locals().items() if arg != "doc" and value is not None diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index 55c38509c..81c643eb8 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -1,10 +1,9 @@ import json -import PyPDF2 try: - from .utils import get_number_of_pages, remove_fields + from .utils import get_number_of_pages, read_pdf_pages, remove_fields except ImportError: - from utils import get_number_of_pages, remove_fields + from utils import get_number_of_pages, read_pdf_pages, remove_fields # ── Helpers ────────────────────────────────────────────────────────────────── @@ -42,15 +41,13 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: {'page': p, 'content': page_map[p]} for p in page_nums if p in page_map ] - path = doc_info['path'] - with open(path, 'rb') as f: - pdf_reader = PyPDF2.PdfReader(f) - total = len(pdf_reader.pages) - valid_pages = [p for p in page_nums if 1 <= p <= total] - return [ - {'page': p, 'content': pdf_reader.pages[p - 1].extract_text() or ''} - for p in valid_pages - ] + all_pages = read_pdf_pages(doc_info['path']) + total = len(all_pages) + valid_pages = [p for p in page_nums if 1 <= p <= total] + return [ + {'page': p, 'content': all_pages[p - 1]} + for p in valid_pages + ] def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: diff --git a/pageindex/utils.py b/pageindex/utils.py index f00ccf3a7..6563d268a 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -384,30 +384,64 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): - if pdf_parser == "PyPDF2": - pdf_reader = PyPDF2.PdfReader(pdf_path) - page_list = [] - for page_num in range(len(pdf_reader.pages)): - page = pdf_reader.pages[page_num] - page_text = page.extract_text() - token_length = litellm.token_counter(model=model, text=page_text) - page_list.append((page_text, token_length)) - return page_list - elif pdf_parser == "PyMuPDF": - if isinstance(pdf_path, BytesIO): - pdf_stream = pdf_path - doc = pymupdf.open(stream=pdf_stream, filetype="pdf") - elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"): - doc = pymupdf.open(pdf_path) - page_list = [] - for page in doc: - page_text = page.get_text() - token_length = litellm.token_counter(model=model, text=page_text) - page_list.append((page_text, token_length)) - return page_list - else: - raise ValueError(f"Unsupported PDF parser: {pdf_parser}") +SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF") + +# Module-level setting. Override by mutating this attribute or setting +# PAGEINDEX_PDF_PARSER in the environment before import. +DEFAULT_PDF_PARSER = os.getenv("PAGEINDEX_PDF_PARSER") or SUPPORTED_PDF_PARSERS[0] + + +def read_pdf_pages(doc): + """Return a list of per-page text strings using the currently configured parser.""" + parser = DEFAULT_PDF_PARSER + + if parser == "PyPDF2": + reader = PyPDF2.PdfReader(doc) + return [(p.extract_text() or "") for p in reader.pages] + + if parser == "pypdfium2": + try: + import pypdfium2 as pdfium + except ImportError as e: + raise ImportError( + "DEFAULT_PDF_PARSER='pypdfium2' requires the optional dependency. " + "Install it with: pip install pypdfium2" + ) from e + source = doc.getvalue() if isinstance(doc, BytesIO) else str(doc) + pdf = pdfium.PdfDocument(source) + try: + pages = [] + for i in range(len(pdf)): + page = pdf[i] + tp = page.get_textpage() + try: + text = (tp.get_text_bounded() or "").replace("\r\n", "\n") + finally: + tp.close() + page.close() + pages.append(text) + return pages + finally: + pdf.close() + + if parser == "PyMuPDF": + if isinstance(doc, BytesIO): + d = pymupdf.open(stream=doc, filetype="pdf") + else: + d = pymupdf.open(str(doc)) + try: + return [p.get_text() for p in d] + finally: + d.close() + + raise ValueError( + f"Unsupported DEFAULT_PDF_PARSER={parser!r}. Choose from {SUPPORTED_PDF_PARSERS}." + ) + + +def get_page_tokens(pdf_path, model=None): + pages = read_pdf_pages(pdf_path) + return [(text, litellm.token_counter(model=model, text=text)) for text in pages] diff --git a/requirements.txt b/requirements.txt index e6ad80531..e6ab06388 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ litellm==1.83.7 # openai-agents # optional: required for examples/agentic_vectorless_rag_demo.py pymupdf==1.26.4 +# pypdfium2 # optional: enables pdf_parser="pypdfium2" (cleaner text, faster, Apache 2.0) PyPDF2==3.0.1 python-dotenv==1.2.2 pyyaml==6.0.2 diff --git a/run_pageindex.py b/run_pageindex.py index 673439d89..354874771 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -2,8 +2,8 @@ import os import json from pageindex import * +import pageindex.utils as pageindex_utils from pageindex.page_index_md import md_to_tree -from pageindex.utils import ConfigLoader if __name__ == "__main__": # Set up argument parser @@ -28,7 +28,9 @@ help='Whether to add doc description to the doc') parser.add_argument('--if-add-node-text', type=str, default=None, help='Whether to add text to the node') - + parser.add_argument('--pdf-parser', type=str, default=None, + help='PDF text extractor: PyPDF2 (default), pypdfium2 (requires `pip install pypdfium2`), or PyMuPDF') + # Markdown specific arguments parser.add_argument('--if-thinning', type=str, default='no', help='Whether to apply tree thinning for markdown (markdown only)') @@ -62,7 +64,11 @@ 'if_add_doc_description': args.if_add_doc_description, 'if_add_node_text': args.if_add_node_text, } - opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) + opt = pageindex_utils.ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) + + # CLI flag overrides the module-level default (and env var PAGEINDEX_PDF_PARSER). + if args.pdf_parser: + pageindex_utils.DEFAULT_PDF_PARSER = args.pdf_parser # Process the PDF toc_with_page_number = page_index_main(args.pdf_path, opt) @@ -93,8 +99,7 @@ import asyncio # Use ConfigLoader to get consistent defaults (matching PDF behavior) - from pageindex.utils import ConfigLoader - config_loader = ConfigLoader() + config_loader = pageindex_utils.ConfigLoader() # Create options dict with user args user_opt = {