From 9539fe7513c65878494bb3f23f560d5c2f539ab1 Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:04:07 +0800 Subject: [PATCH 1/9] Add pypdfium2 as optional PDF parser Default behavior unchanged. Users can opt in via pdf_parser="pypdfium2" for cleaner text extraction (no broken words, correct Unicode) and 3-5x faster parsing. PyPDF2 remains the only required dependency; pypdfium2 is lazy-imported. --- pageindex/client.py | 23 ++++++------ pageindex/config.yaml | 3 +- pageindex/page_index.py | 7 ++-- pageindex/retrieve.py | 28 ++++++++------- pageindex/utils.py | 79 +++++++++++++++++++++++++++++------------ requirements.txt | 1 + run_pageindex.py | 5 ++- 7 files changed, 95 insertions(+), 51 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index 894dab181..e04574ce3 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -5,12 +5,10 @@ import concurrent.futures from pathlib import Path -import PyPDF2 - from .page_index import page_index from .page_index_md import md_to_tree from .retrieve import get_document, get_document_structure, get_page_content -from .utils import ConfigLoader, remove_fields +from .utils import ConfigLoader, read_pdf_pages, remove_fields META_INDEX = "_meta.json" @@ -32,7 +30,8 @@ class PageIndexClient: For agent-based QA, see examples/agentic_vectorless_rag_demo.py. """ - def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, workspace: str = None): + def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, + workspace: str = None, pdf_parser: str = None): if api_key: os.environ["OPENAI_API_KEY"] = api_key elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): @@ -43,9 +42,12 @@ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = overrides["model"] = model if retrieve_model: overrides["retrieve_model"] = retrieve_model + if pdf_parser: + overrides["pdf_parser"] = pdf_parser opt = ConfigLoader().load(overrides or None) self.model = opt.model self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model) + self.pdf_parser = opt.pdf_parser if self.workspace: self.workspace.mkdir(parents=True, exist_ok=True) self.documents = {} @@ -74,14 +76,12 @@ def index(self, file_path: str, mode: str = "auto") -> str: if_add_node_summary='yes', if_add_node_text='yes', if_add_node_id='yes', - if_add_doc_description='yes' + if_add_doc_description='yes', + pdf_parser=self.pdf_parser, ) # Extract per-page text so queries don't need the original PDF - pages = [] - with open(file_path, 'rb') as f: - pdf_reader = PyPDF2.PdfReader(f) - for i, page in enumerate(pdf_reader.pages, 1): - pages.append({'page': i, 'content': page.extract_text() or ''}) + page_texts = read_pdf_pages(file_path, pdf_parser=self.pdf_parser) + pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)] self.documents[doc_id] = { 'id': doc_id, @@ -90,6 +90,7 @@ def index(self, file_path: str, mode: str = "auto") -> str: 'doc_name': result.get('doc_name', ''), 'doc_description': result.get('doc_description', ''), 'page_count': len(pages), + 'pdf_parser': self.pdf_parser, 'structure': result['structure'], 'pages': pages, } @@ -140,6 +141,8 @@ def _make_meta_entry(doc: dict) -> dict: } if doc.get('type') == 'pdf': entry['page_count'] = doc.get('page_count') + if doc.get('pdf_parser'): + entry['pdf_parser'] = doc['pdf_parser'] elif doc.get('type') == 'md': entry['line_count'] = doc.get('line_count') return entry diff --git a/pageindex/config.yaml b/pageindex/config.yaml index 591fe9331..e7c13e53c 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -7,4 +7,5 @@ max_token_num_each_node: 20000 if_add_node_id: "yes" if_add_node_summary: "yes" if_add_doc_description: "no" -if_add_node_text: "no" \ No newline at end of file +if_add_node_text: "no" +pdf_parser: "PyPDF2" # text extractor: "PyPDF2" (default, no extra install), "pypdfium2" (pip install pypdfium2), or "PyMuPDF" \ No newline at end of file diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 9004309fb..d80896f75 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1074,7 +1074,7 @@ def page_index_main(doc, opt=None): raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') - page_list = get_page_tokens(doc, model=opt.model) + page_list = get_page_tokens(doc, model=opt.model, pdf_parser=opt.pdf_parser) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) @@ -1111,8 +1111,9 @@ async def page_index_builder(): def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, - if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None): - + if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None, + pdf_parser=None): + user_opt = { arg: value for arg, value in locals().items() if arg != "doc" and value is not None diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index 55c38509c..e4ce3397e 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -1,10 +1,9 @@ import json -import PyPDF2 try: - from .utils import get_number_of_pages, remove_fields + from .utils import get_number_of_pages, read_pdf_pages, remove_fields except ImportError: - from utils import get_number_of_pages, remove_fields + from utils import get_number_of_pages, read_pdf_pages, remove_fields # ── Helpers ────────────────────────────────────────────────────────────────── @@ -34,7 +33,11 @@ def _count_pages(doc_info: dict) -> int: def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: - """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF.""" + """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF. + + Honors the parser recorded on the document so cache-miss reads stay consistent + with the originally-indexed text. Defaults to PyPDF2 for legacy documents. + """ cached_pages = doc_info.get('pages') if cached_pages: page_map = {p['page']: p['content'] for p in cached_pages} @@ -42,15 +45,14 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: {'page': p, 'content': page_map[p]} for p in page_nums if p in page_map ] - path = doc_info['path'] - with open(path, 'rb') as f: - pdf_reader = PyPDF2.PdfReader(f) - total = len(pdf_reader.pages) - valid_pages = [p for p in page_nums if 1 <= p <= total] - return [ - {'page': p, 'content': pdf_reader.pages[p - 1].extract_text() or ''} - for p in valid_pages - ] + parser = doc_info.get('pdf_parser') or 'PyPDF2' + all_pages = read_pdf_pages(doc_info['path'], pdf_parser=parser) + total = len(all_pages) + valid_pages = [p for p in page_nums if 1 <= p <= total] + return [ + {'page': p, 'content': all_pages[p - 1]} + for p in valid_pages + ] def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: diff --git a/pageindex/utils.py b/pageindex/utils.py index f00ccf3a7..5c73a6200 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -384,30 +384,63 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): +SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF") + + +def read_pdf_pages(doc, pdf_parser="PyPDF2"): + """Return a list of per-page text strings using the selected parser. + + `doc` may be a file path (str/Path) or a BytesIO. `pdf_parser` is one of + SUPPORTED_PDF_PARSERS. PyPDF2 is the default and only required dependency; + pypdfium2 is lazy-imported so users opt in by installing it separately. + """ if pdf_parser == "PyPDF2": - pdf_reader = PyPDF2.PdfReader(pdf_path) - page_list = [] - for page_num in range(len(pdf_reader.pages)): - page = pdf_reader.pages[page_num] - page_text = page.extract_text() - token_length = litellm.token_counter(model=model, text=page_text) - page_list.append((page_text, token_length)) - return page_list - elif pdf_parser == "PyMuPDF": - if isinstance(pdf_path, BytesIO): - pdf_stream = pdf_path - doc = pymupdf.open(stream=pdf_stream, filetype="pdf") - elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"): - doc = pymupdf.open(pdf_path) - page_list = [] - for page in doc: - page_text = page.get_text() - token_length = litellm.token_counter(model=model, text=page_text) - page_list.append((page_text, token_length)) - return page_list - else: - raise ValueError(f"Unsupported PDF parser: {pdf_parser}") + reader = PyPDF2.PdfReader(doc) + return [(p.extract_text() or "") for p in reader.pages] + + if pdf_parser == "pypdfium2": + try: + import pypdfium2 as pdfium + except ImportError as e: + raise ImportError( + "pdf_parser='pypdfium2' requires the optional dependency. " + "Install it with: pip install pypdfium2" + ) from e + source = doc.getvalue() if isinstance(doc, BytesIO) else str(doc) + pdf = pdfium.PdfDocument(source) + try: + pages = [] + for i in range(len(pdf)): + page = pdf[i] + tp = page.get_textpage() + try: + text = (tp.get_text_bounded() or "").replace("\r\n", "\n") + finally: + tp.close() + page.close() + pages.append(text) + return pages + finally: + pdf.close() + + if pdf_parser == "PyMuPDF": + if isinstance(doc, BytesIO): + d = pymupdf.open(stream=doc, filetype="pdf") + else: + d = pymupdf.open(str(doc)) + try: + return [p.get_text() for p in d] + finally: + d.close() + + raise ValueError( + f"Unsupported pdf_parser={pdf_parser!r}. Choose from {SUPPORTED_PDF_PARSERS}." + ) + + +def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): + pages = read_pdf_pages(pdf_path, pdf_parser=pdf_parser) + return [(text, litellm.token_counter(model=model, text=text)) for text in pages] diff --git a/requirements.txt b/requirements.txt index e6ad80531..e6ab06388 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ litellm==1.83.7 # openai-agents # optional: required for examples/agentic_vectorless_rag_demo.py pymupdf==1.26.4 +# pypdfium2 # optional: enables pdf_parser="pypdfium2" (cleaner text, faster, Apache 2.0) PyPDF2==3.0.1 python-dotenv==1.2.2 pyyaml==6.0.2 diff --git a/run_pageindex.py b/run_pageindex.py index 673439d89..76661d814 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -28,7 +28,9 @@ help='Whether to add doc description to the doc') parser.add_argument('--if-add-node-text', type=str, default=None, help='Whether to add text to the node') - + parser.add_argument('--pdf-parser', type=str, default=None, + help='PDF text extractor: PyPDF2 (default), pypdfium2 (requires `pip install pypdfium2`), or PyMuPDF') + # Markdown specific arguments parser.add_argument('--if-thinning', type=str, default='no', help='Whether to apply tree thinning for markdown (markdown only)') @@ -61,6 +63,7 @@ 'if_add_node_summary': args.if_add_node_summary, 'if_add_doc_description': args.if_add_doc_description, 'if_add_node_text': args.if_add_node_text, + 'pdf_parser': args.pdf_parser, } opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) From 3b2ddef822bc34065484bc2327731c47c4a5ee8c Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:08:50 +0800 Subject: [PATCH 2/9] Keep pdf_parser default in code, not config.yaml --- pageindex/config.yaml | 3 +-- pageindex/utils.py | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pageindex/config.yaml b/pageindex/config.yaml index e7c13e53c..591fe9331 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -7,5 +7,4 @@ max_token_num_each_node: 20000 if_add_node_id: "yes" if_add_node_summary: "yes" if_add_doc_description: "no" -if_add_node_text: "no" -pdf_parser: "PyPDF2" # text extractor: "PyPDF2" (default, no extra install), "pypdfium2" (pip install pypdfium2), or "PyMuPDF" \ No newline at end of file +if_add_node_text: "no" \ No newline at end of file diff --git a/pageindex/utils.py b/pageindex/utils.py index 5c73a6200..a5adc5461 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -685,10 +685,14 @@ def format_structure(structure, order=None): class ConfigLoader: + # Code-side defaults for non-tuning settings (kept out of config.yaml). + # yaml entries override these if present. + _CODE_DEFAULTS = {"pdf_parser": "PyPDF2"} + def __init__(self, default_path: str = None): if default_path is None: default_path = Path(__file__).parent / "config.yaml" - self._default_dict = self._load_yaml(default_path) + self._default_dict = {**self._CODE_DEFAULTS, **self._load_yaml(default_path)} @staticmethod def _load_yaml(path): From de5858190023047c85483bc707354bfe21059ef0 Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:17:58 +0800 Subject: [PATCH 3/9] Drop unnecessary docstring --- pageindex/retrieve.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index e4ce3397e..9a10681ef 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -33,11 +33,7 @@ def _count_pages(doc_info: dict) -> int: def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: - """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF. - - Honors the parser recorded on the document so cache-miss reads stay consistent - with the originally-indexed text. Defaults to PyPDF2 for legacy documents. - """ + """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF.""" cached_pages = doc_info.get('pages') if cached_pages: page_map = {p['page']: p['content'] for p in cached_pages} From 1629ef4318c551aaea8af069425efe537ce7cbb5 Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:20:45 +0800 Subject: [PATCH 4/9] Take pdf_parser out of ConfigLoader, use plain function arg --- pageindex/client.py | 6 ++---- pageindex/page_index.py | 14 +++++++------- pageindex/utils.py | 6 +----- run_pageindex.py | 3 +-- 4 files changed, 11 insertions(+), 18 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index e04574ce3..cdd3d7cc7 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -31,7 +31,7 @@ class PageIndexClient: For agent-based QA, see examples/agentic_vectorless_rag_demo.py. """ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, - workspace: str = None, pdf_parser: str = None): + workspace: str = None, pdf_parser: str = "PyPDF2"): if api_key: os.environ["OPENAI_API_KEY"] = api_key elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): @@ -42,12 +42,10 @@ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = overrides["model"] = model if retrieve_model: overrides["retrieve_model"] = retrieve_model - if pdf_parser: - overrides["pdf_parser"] = pdf_parser opt = ConfigLoader().load(overrides or None) self.model = opt.model self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model) - self.pdf_parser = opt.pdf_parser + self.pdf_parser = pdf_parser if self.workspace: self.workspace.mkdir(parents=True, exist_ok=True) self.documents = {} diff --git a/pageindex/page_index.py b/pageindex/page_index.py index d80896f75..ef9ac096f 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1063,18 +1063,18 @@ async def tree_parser(page_list, opt, doc=None, logger=None): return toc_tree -def page_index_main(doc, opt=None): +def page_index_main(doc, opt=None, pdf_parser="PyPDF2"): logger = JsonLogger(doc) - + is_valid_pdf = ( - (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or + (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or isinstance(doc, BytesIO) ) if not is_valid_pdf: raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') - page_list = get_page_tokens(doc, model=opt.model, pdf_parser=opt.pdf_parser) + page_list = get_page_tokens(doc, model=opt.model, pdf_parser=pdf_parser) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) @@ -1112,14 +1112,14 @@ async def page_index_builder(): def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None, - pdf_parser=None): + pdf_parser="PyPDF2"): user_opt = { arg: value for arg, value in locals().items() - if arg != "doc" and value is not None + if arg not in ("doc", "pdf_parser") and value is not None } opt = ConfigLoader().load(user_opt) - return page_index_main(doc, opt) + return page_index_main(doc, opt, pdf_parser=pdf_parser) def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None): diff --git a/pageindex/utils.py b/pageindex/utils.py index a5adc5461..5c73a6200 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -685,14 +685,10 @@ def format_structure(structure, order=None): class ConfigLoader: - # Code-side defaults for non-tuning settings (kept out of config.yaml). - # yaml entries override these if present. - _CODE_DEFAULTS = {"pdf_parser": "PyPDF2"} - def __init__(self, default_path: str = None): if default_path is None: default_path = Path(__file__).parent / "config.yaml" - self._default_dict = {**self._CODE_DEFAULTS, **self._load_yaml(default_path)} + self._default_dict = self._load_yaml(default_path) @staticmethod def _load_yaml(path): diff --git a/run_pageindex.py b/run_pageindex.py index 76661d814..d4eaa51ed 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -63,12 +63,11 @@ 'if_add_node_summary': args.if_add_node_summary, 'if_add_doc_description': args.if_add_doc_description, 'if_add_node_text': args.if_add_node_text, - 'pdf_parser': args.pdf_parser, } opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) # Process the PDF - toc_with_page_number = page_index_main(args.pdf_path, opt) + toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or "PyPDF2") print('Parsing done, saving to file...') # Save results From ec1aaca4c9be8772e294cbed059912b17bad0dc8 Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:24:01 +0800 Subject: [PATCH 5/9] Centralize default parser as DEFAULT_PDF_PARSER constant --- pageindex/client.py | 4 ++-- pageindex/page_index.py | 4 ++-- pageindex/retrieve.py | 6 +++--- pageindex/utils.py | 5 +++-- run_pageindex.py | 4 ++-- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index cdd3d7cc7..30a6bb079 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -8,7 +8,7 @@ from .page_index import page_index from .page_index_md import md_to_tree from .retrieve import get_document, get_document_structure, get_page_content -from .utils import ConfigLoader, read_pdf_pages, remove_fields +from .utils import ConfigLoader, DEFAULT_PDF_PARSER, read_pdf_pages, remove_fields META_INDEX = "_meta.json" @@ -31,7 +31,7 @@ class PageIndexClient: For agent-based QA, see examples/agentic_vectorless_rag_demo.py. """ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, - workspace: str = None, pdf_parser: str = "PyPDF2"): + workspace: str = None, pdf_parser: str = DEFAULT_PDF_PARSER): if api_key: os.environ["OPENAI_API_KEY"] = api_key elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): diff --git a/pageindex/page_index.py b/pageindex/page_index.py index ef9ac096f..201824ce6 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1063,7 +1063,7 @@ async def tree_parser(page_list, opt, doc=None, logger=None): return toc_tree -def page_index_main(doc, opt=None, pdf_parser="PyPDF2"): +def page_index_main(doc, opt=None, pdf_parser=DEFAULT_PDF_PARSER): logger = JsonLogger(doc) is_valid_pdf = ( @@ -1112,7 +1112,7 @@ async def page_index_builder(): def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None, - pdf_parser="PyPDF2"): + pdf_parser=DEFAULT_PDF_PARSER): user_opt = { arg: value for arg, value in locals().items() diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index 9a10681ef..52bc2ebcf 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -1,9 +1,9 @@ import json try: - from .utils import get_number_of_pages, read_pdf_pages, remove_fields + from .utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields except ImportError: - from utils import get_number_of_pages, read_pdf_pages, remove_fields + from utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields # ── Helpers ────────────────────────────────────────────────────────────────── @@ -41,7 +41,7 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: {'page': p, 'content': page_map[p]} for p in page_nums if p in page_map ] - parser = doc_info.get('pdf_parser') or 'PyPDF2' + parser = doc_info.get('pdf_parser') or DEFAULT_PDF_PARSER all_pages = read_pdf_pages(doc_info['path'], pdf_parser=parser) total = len(all_pages) valid_pages = [p for p in page_nums if 1 <= p <= total] diff --git a/pageindex/utils.py b/pageindex/utils.py index 5c73a6200..e0ebb54f1 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -385,9 +385,10 @@ def add_preface_if_needed(data): SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF") +DEFAULT_PDF_PARSER = SUPPORTED_PDF_PARSERS[0] -def read_pdf_pages(doc, pdf_parser="PyPDF2"): +def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER): """Return a list of per-page text strings using the selected parser. `doc` may be a file path (str/Path) or a BytesIO. `pdf_parser` is one of @@ -438,7 +439,7 @@ def read_pdf_pages(doc, pdf_parser="PyPDF2"): ) -def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): +def get_page_tokens(pdf_path, model=None, pdf_parser=DEFAULT_PDF_PARSER): pages = read_pdf_pages(pdf_path, pdf_parser=pdf_parser) return [(text, litellm.token_counter(model=model, text=text)) for text in pages] diff --git a/run_pageindex.py b/run_pageindex.py index d4eaa51ed..295f3ede8 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -3,7 +3,7 @@ import json from pageindex import * from pageindex.page_index_md import md_to_tree -from pageindex.utils import ConfigLoader +from pageindex.utils import ConfigLoader, DEFAULT_PDF_PARSER if __name__ == "__main__": # Set up argument parser @@ -67,7 +67,7 @@ opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) # Process the PDF - toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or "PyPDF2") + toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or DEFAULT_PDF_PARSER) print('Parsing done, saving to file...') # Save results From 108cb28518f144f747211c99e4c736ef8bdd3f47 Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:40:32 +0800 Subject: [PATCH 6/9] Move pdf_parser off doc dict, pass via call args --- pageindex/client.py | 5 +---- pageindex/retrieve.py | 9 ++++----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index 30a6bb079..1d364095f 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -88,7 +88,6 @@ def index(self, file_path: str, mode: str = "auto") -> str: 'doc_name': result.get('doc_name', ''), 'doc_description': result.get('doc_description', ''), 'page_count': len(pages), - 'pdf_parser': self.pdf_parser, 'structure': result['structure'], 'pages': pages, } @@ -139,8 +138,6 @@ def _make_meta_entry(doc: dict) -> dict: } if doc.get('type') == 'pdf': entry['page_count'] = doc.get('page_count') - if doc.get('pdf_parser'): - entry['pdf_parser'] = doc['pdf_parser'] elif doc.get('type') == 'md': entry['line_count'] = doc.get('line_count') return entry @@ -232,4 +229,4 @@ def get_page_content(self, doc_id: str, pages: str) -> str: """Return page content for the given pages string (e.g. '5-7', '3,8', '12').""" if self.workspace: self._ensure_doc_loaded(doc_id) - return get_page_content(self.documents, doc_id, pages) + return get_page_content(self.documents, doc_id, pages, pdf_parser=self.pdf_parser) diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index 52bc2ebcf..dabd2583e 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -32,7 +32,7 @@ def _count_pages(doc_info: dict) -> int: return get_number_of_pages(doc_info['path']) -def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: +def _get_pdf_page_content(doc_info: dict, page_nums: list[int], pdf_parser: str = DEFAULT_PDF_PARSER) -> list[dict]: """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF.""" cached_pages = doc_info.get('pages') if cached_pages: @@ -41,8 +41,7 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: {'page': p, 'content': page_map[p]} for p in page_nums if p in page_map ] - parser = doc_info.get('pdf_parser') or DEFAULT_PDF_PARSER - all_pages = read_pdf_pages(doc_info['path'], pdf_parser=parser) + all_pages = read_pdf_pages(doc_info['path'], pdf_parser=pdf_parser) total = len(all_pages) valid_pages = [p for p in page_nums if 1 <= p <= total] return [ @@ -105,7 +104,7 @@ def get_document_structure(documents: dict, doc_id: str) -> str: return json.dumps(structure_no_text, ensure_ascii=False) -def get_page_content(documents: dict, doc_id: str, pages: str) -> str: +def get_page_content(documents: dict, doc_id: str, pages: str, pdf_parser: str = DEFAULT_PDF_PARSER) -> str: """ Retrieve page content for a document. @@ -126,7 +125,7 @@ def get_page_content(documents: dict, doc_id: str, pages: str) -> str: try: if doc_info.get('type') == 'pdf': - content = _get_pdf_page_content(doc_info, page_nums) + content = _get_pdf_page_content(doc_info, page_nums, pdf_parser=pdf_parser) else: content = _get_md_page_content(doc_info, page_nums) except Exception as e: From 63e11ef152102101f22a6c9d8f6ff074004ea942 Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:46:13 +0800 Subject: [PATCH 7/9] Make PageIndexClient parser-agnostic, pdf_parser per index() call --- pageindex/client.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index 1d364095f..f74c825b5 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -31,7 +31,7 @@ class PageIndexClient: For agent-based QA, see examples/agentic_vectorless_rag_demo.py. """ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, - workspace: str = None, pdf_parser: str = DEFAULT_PDF_PARSER): + workspace: str = None): if api_key: os.environ["OPENAI_API_KEY"] = api_key elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): @@ -45,15 +45,14 @@ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = opt = ConfigLoader().load(overrides or None) self.model = opt.model self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model) - self.pdf_parser = pdf_parser if self.workspace: self.workspace.mkdir(parents=True, exist_ok=True) self.documents = {} if self.workspace: self._load_workspace() - def index(self, file_path: str, mode: str = "auto") -> str: - """Index a document. Returns a document_id.""" + def index(self, file_path: str, mode: str = "auto", pdf_parser: str = DEFAULT_PDF_PARSER) -> str: + """Index a document. Returns a document_id. pdf_parser only affects PDF mode.""" # Persist a canonical absolute path so workspace reloads do not # reinterpret caller-relative paths against the workspace directory. file_path = os.path.abspath(os.path.expanduser(file_path)) @@ -75,10 +74,10 @@ def index(self, file_path: str, mode: str = "auto") -> str: if_add_node_text='yes', if_add_node_id='yes', if_add_doc_description='yes', - pdf_parser=self.pdf_parser, + pdf_parser=pdf_parser, ) # Extract per-page text so queries don't need the original PDF - page_texts = read_pdf_pages(file_path, pdf_parser=self.pdf_parser) + page_texts = read_pdf_pages(file_path, pdf_parser=pdf_parser) pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)] self.documents[doc_id] = { @@ -226,7 +225,12 @@ def get_document_structure(self, doc_id: str) -> str: return get_document_structure(self.documents, doc_id) def get_page_content(self, doc_id: str, pages: str) -> str: - """Return page content for the given pages string (e.g. '5-7', '3,8', '12').""" + """Return page content for the given pages string (e.g. '5-7', '3,8', '12'). + + Cache hit returns originally-indexed text. The rare cache-miss path + re-reads with the default parser; callers needing parser-consistent + fallback can use the low-level retrieve.get_page_content directly. + """ if self.workspace: self._ensure_doc_loaded(doc_id) - return get_page_content(self.documents, doc_id, pages, pdf_parser=self.pdf_parser) + return get_page_content(self.documents, doc_id, pages) From 4dec4d66a98d249af7d324420ce568f8cecfb1be Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 18:43:51 +0800 Subject: [PATCH 8/9] Replace pdf_parser plumbing with mutable DEFAULT_PDF_PARSER global --- pageindex/client.py | 16 +++++----------- pageindex/page_index.py | 11 +++++------ pageindex/retrieve.py | 12 ++++++------ pageindex/utils.py | 28 ++++++++++++++-------------- run_pageindex.py | 9 +++++++-- 5 files changed, 37 insertions(+), 39 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index f74c825b5..1924ad1bf 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -8,7 +8,7 @@ from .page_index import page_index from .page_index_md import md_to_tree from .retrieve import get_document, get_document_structure, get_page_content -from .utils import ConfigLoader, DEFAULT_PDF_PARSER, read_pdf_pages, remove_fields +from .utils import ConfigLoader, read_pdf_pages, remove_fields META_INDEX = "_meta.json" @@ -51,8 +51,8 @@ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = if self.workspace: self._load_workspace() - def index(self, file_path: str, mode: str = "auto", pdf_parser: str = DEFAULT_PDF_PARSER) -> str: - """Index a document. Returns a document_id. pdf_parser only affects PDF mode.""" + def index(self, file_path: str, mode: str = "auto") -> str: + """Index a document. Returns a document_id.""" # Persist a canonical absolute path so workspace reloads do not # reinterpret caller-relative paths against the workspace directory. file_path = os.path.abspath(os.path.expanduser(file_path)) @@ -74,10 +74,9 @@ def index(self, file_path: str, mode: str = "auto", pdf_parser: str = DEFAULT_PD if_add_node_text='yes', if_add_node_id='yes', if_add_doc_description='yes', - pdf_parser=pdf_parser, ) # Extract per-page text so queries don't need the original PDF - page_texts = read_pdf_pages(file_path, pdf_parser=pdf_parser) + page_texts = read_pdf_pages(file_path) pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)] self.documents[doc_id] = { @@ -225,12 +224,7 @@ def get_document_structure(self, doc_id: str) -> str: return get_document_structure(self.documents, doc_id) def get_page_content(self, doc_id: str, pages: str) -> str: - """Return page content for the given pages string (e.g. '5-7', '3,8', '12'). - - Cache hit returns originally-indexed text. The rare cache-miss path - re-reads with the default parser; callers needing parser-consistent - fallback can use the low-level retrieve.get_page_content directly. - """ + """Return page content for the given pages string (e.g. '5-7', '3,8', '12').""" if self.workspace: self._ensure_doc_loaded(doc_id) return get_page_content(self.documents, doc_id, pages) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 201824ce6..735f1ed1e 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1063,7 +1063,7 @@ async def tree_parser(page_list, opt, doc=None, logger=None): return toc_tree -def page_index_main(doc, opt=None, pdf_parser=DEFAULT_PDF_PARSER): +def page_index_main(doc, opt=None): logger = JsonLogger(doc) is_valid_pdf = ( @@ -1074,7 +1074,7 @@ def page_index_main(doc, opt=None, pdf_parser=DEFAULT_PDF_PARSER): raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') - page_list = get_page_tokens(doc, model=opt.model, pdf_parser=pdf_parser) + page_list = get_page_tokens(doc, model=opt.model) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) @@ -1111,15 +1111,14 @@ async def page_index_builder(): def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, - if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None, - pdf_parser=DEFAULT_PDF_PARSER): + if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None): user_opt = { arg: value for arg, value in locals().items() - if arg not in ("doc", "pdf_parser") and value is not None + if arg != "doc" and value is not None } opt = ConfigLoader().load(user_opt) - return page_index_main(doc, opt, pdf_parser=pdf_parser) + return page_index_main(doc, opt) def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None): diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index dabd2583e..81c643eb8 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -1,9 +1,9 @@ import json try: - from .utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields + from .utils import get_number_of_pages, read_pdf_pages, remove_fields except ImportError: - from utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields + from utils import get_number_of_pages, read_pdf_pages, remove_fields # ── Helpers ────────────────────────────────────────────────────────────────── @@ -32,7 +32,7 @@ def _count_pages(doc_info: dict) -> int: return get_number_of_pages(doc_info['path']) -def _get_pdf_page_content(doc_info: dict, page_nums: list[int], pdf_parser: str = DEFAULT_PDF_PARSER) -> list[dict]: +def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF.""" cached_pages = doc_info.get('pages') if cached_pages: @@ -41,7 +41,7 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int], pdf_parser: str {'page': p, 'content': page_map[p]} for p in page_nums if p in page_map ] - all_pages = read_pdf_pages(doc_info['path'], pdf_parser=pdf_parser) + all_pages = read_pdf_pages(doc_info['path']) total = len(all_pages) valid_pages = [p for p in page_nums if 1 <= p <= total] return [ @@ -104,7 +104,7 @@ def get_document_structure(documents: dict, doc_id: str) -> str: return json.dumps(structure_no_text, ensure_ascii=False) -def get_page_content(documents: dict, doc_id: str, pages: str, pdf_parser: str = DEFAULT_PDF_PARSER) -> str: +def get_page_content(documents: dict, doc_id: str, pages: str) -> str: """ Retrieve page content for a document. @@ -125,7 +125,7 @@ def get_page_content(documents: dict, doc_id: str, pages: str, pdf_parser: str = try: if doc_info.get('type') == 'pdf': - content = _get_pdf_page_content(doc_info, page_nums, pdf_parser=pdf_parser) + content = _get_pdf_page_content(doc_info, page_nums) else: content = _get_md_page_content(doc_info, page_nums) except Exception as e: diff --git a/pageindex/utils.py b/pageindex/utils.py index e0ebb54f1..6563d268a 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -385,26 +385,26 @@ def add_preface_if_needed(data): SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF") -DEFAULT_PDF_PARSER = SUPPORTED_PDF_PARSERS[0] +# Module-level setting. Override by mutating this attribute or setting +# PAGEINDEX_PDF_PARSER in the environment before import. +DEFAULT_PDF_PARSER = os.getenv("PAGEINDEX_PDF_PARSER") or SUPPORTED_PDF_PARSERS[0] -def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER): - """Return a list of per-page text strings using the selected parser. - `doc` may be a file path (str/Path) or a BytesIO. `pdf_parser` is one of - SUPPORTED_PDF_PARSERS. PyPDF2 is the default and only required dependency; - pypdfium2 is lazy-imported so users opt in by installing it separately. - """ - if pdf_parser == "PyPDF2": +def read_pdf_pages(doc): + """Return a list of per-page text strings using the currently configured parser.""" + parser = DEFAULT_PDF_PARSER + + if parser == "PyPDF2": reader = PyPDF2.PdfReader(doc) return [(p.extract_text() or "") for p in reader.pages] - if pdf_parser == "pypdfium2": + if parser == "pypdfium2": try: import pypdfium2 as pdfium except ImportError as e: raise ImportError( - "pdf_parser='pypdfium2' requires the optional dependency. " + "DEFAULT_PDF_PARSER='pypdfium2' requires the optional dependency. " "Install it with: pip install pypdfium2" ) from e source = doc.getvalue() if isinstance(doc, BytesIO) else str(doc) @@ -424,7 +424,7 @@ def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER): finally: pdf.close() - if pdf_parser == "PyMuPDF": + if parser == "PyMuPDF": if isinstance(doc, BytesIO): d = pymupdf.open(stream=doc, filetype="pdf") else: @@ -435,12 +435,12 @@ def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER): d.close() raise ValueError( - f"Unsupported pdf_parser={pdf_parser!r}. Choose from {SUPPORTED_PDF_PARSERS}." + f"Unsupported DEFAULT_PDF_PARSER={parser!r}. Choose from {SUPPORTED_PDF_PARSERS}." ) -def get_page_tokens(pdf_path, model=None, pdf_parser=DEFAULT_PDF_PARSER): - pages = read_pdf_pages(pdf_path, pdf_parser=pdf_parser) +def get_page_tokens(pdf_path, model=None): + pages = read_pdf_pages(pdf_path) return [(text, litellm.token_counter(model=model, text=text)) for text in pages] diff --git a/run_pageindex.py b/run_pageindex.py index 295f3ede8..4103a49ff 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -2,8 +2,9 @@ import os import json from pageindex import * +import pageindex.utils as pageindex_utils from pageindex.page_index_md import md_to_tree -from pageindex.utils import ConfigLoader, DEFAULT_PDF_PARSER +from pageindex.utils import ConfigLoader if __name__ == "__main__": # Set up argument parser @@ -66,8 +67,12 @@ } opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) + # CLI flag overrides the module-level default (and env var PAGEINDEX_PDF_PARSER). + if args.pdf_parser: + pageindex_utils.DEFAULT_PDF_PARSER = args.pdf_parser + # Process the PDF - toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or DEFAULT_PDF_PARSER) + toc_with_page_number = page_index_main(args.pdf_path, opt) print('Parsing done, saving to file...') # Save results From 7b15dea4d50402c4e5c495d14d4dff0283268fad Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 18:55:27 +0800 Subject: [PATCH 9/9] Use single import style for pageindex.utils in run_pageindex --- run_pageindex.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run_pageindex.py b/run_pageindex.py index 4103a49ff..354874771 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -4,7 +4,6 @@ from pageindex import * import pageindex.utils as pageindex_utils from pageindex.page_index_md import md_to_tree -from pageindex.utils import ConfigLoader if __name__ == "__main__": # Set up argument parser @@ -65,7 +64,7 @@ 'if_add_doc_description': args.if_add_doc_description, 'if_add_node_text': args.if_add_node_text, } - opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) + opt = pageindex_utils.ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) # CLI flag overrides the module-level default (and env var PAGEINDEX_PDF_PARSER). if args.pdf_parser: @@ -100,8 +99,7 @@ import asyncio # Use ConfigLoader to get consistent defaults (matching PDF behavior) - from pageindex.utils import ConfigLoader - config_loader = ConfigLoader() + config_loader = pageindex_utils.ConfigLoader() # Create options dict with user args user_opt = {