From ecd2bc81eb004acbac5db37d8c4b1da92f6c45e3 Mon Sep 17 00:00:00 2001 From: ethan Date: Sat, 2 May 2026 11:03:47 +0000 Subject: [PATCH] feat(pageindex): add .txt parsing support Fixes #222 Add a new pageindex/page_index_txt.py module exposing async txt_to_tree(), which reads UTF-8 (with latin-1 fallback) and returns a single-root-node tree mirroring md_to_tree's output shape (doc_name, line_count, structure). Wired into pageindex package exports, PageIndexClient.index() (new mode='txt' branch and is_txt auto-detection; _make_meta_entry treats txt like md for line_count), and run_pageindex.py CLI (--txt_path with the same validation/save flow as --md_path). Co-Authored-By: Claude --- pageindex/__init__.py | 1 + pageindex/client.py | 31 +++++++++++++++- pageindex/page_index_txt.py | 61 ++++++++++++++++++++++++++++++++ run_pageindex.py | 55 ++++++++++++++++++++++++++--- tests/test_page_index_txt.py | 68 ++++++++++++++++++++++++++++++++++++ 5 files changed, 211 insertions(+), 5 deletions(-) create mode 100644 pageindex/page_index_txt.py create mode 100644 tests/test_page_index_txt.py diff --git a/pageindex/__init__.py b/pageindex/__init__.py index 658003bf5..0c4481194 100644 --- a/pageindex/__init__.py +++ b/pageindex/__init__.py @@ -1,4 +1,5 @@ from .page_index import * from .page_index_md import md_to_tree +from .page_index_txt import txt_to_tree from .retrieve import get_document, get_document_structure, get_page_content from .client import PageIndexClient diff --git a/pageindex/client.py b/pageindex/client.py index 894dab181..4c6fd251d 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -9,6 +9,7 @@ from .page_index import page_index from .page_index_md import md_to_tree +from .page_index_txt import txt_to_tree from .retrieve import get_document, get_document_structure, get_page_content from .utils import ConfigLoader, remove_fields @@ -65,6 +66,7 @@ def index(self, file_path: str, mode: str = "auto") -> str: is_pdf = ext == '.pdf' is_md = ext in ['.md', '.markdown'] + is_txt = ext == '.txt' if mode == "pdf" or (mode == "auto" and is_pdf): print(f"Indexing PDF: {file_path}") @@ -121,6 +123,33 @@ def index(self, file_path: str, mode: str = "auto") -> str: 'line_count': result.get('line_count', 0), 'structure': result['structure'], } + + elif mode == "txt" or (mode == "auto" and is_txt): + print(f"Indexing Text: {file_path}") + coro = txt_to_tree( + txt_path=file_path, + if_add_node_summary='yes', + summary_token_threshold=200, + model=self.model, + if_add_doc_description='yes', + if_add_node_text='yes', + if_add_node_id='yes' + ) + try: + asyncio.get_running_loop() + with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: + result = pool.submit(asyncio.run, coro).result() + except RuntimeError: + result = asyncio.run(coro) + self.documents[doc_id] = { + 'id': doc_id, + 'type': 'txt', + 'path': file_path, + 'doc_name': result.get('doc_name', ''), + 'doc_description': result.get('doc_description', ''), + 'line_count': result.get('line_count', 0), + 'structure': result['structure'], + } else: raise ValueError(f"Unsupported file format for: {file_path}") @@ -140,7 +169,7 @@ def _make_meta_entry(doc: dict) -> dict: } if doc.get('type') == 'pdf': entry['page_count'] = doc.get('page_count') - elif doc.get('type') == 'md': + elif doc.get('type') in ('md', 'txt'): entry['line_count'] = doc.get('line_count') return entry diff --git a/pageindex/page_index_txt.py b/pageindex/page_index_txt.py new file mode 100644 index 000000000..05a34b309 --- /dev/null +++ b/pageindex/page_index_txt.py @@ -0,0 +1,61 @@ +import os +try: + from .utils import * +except: + from utils import * + +from .page_index_md import generate_summaries_for_structure_md + + +def _read_text_file(txt_path): + try: + with open(txt_path, 'r', encoding='utf-8') as f: + return f.read() + except UnicodeDecodeError: + with open(txt_path, 'r', encoding='latin-1') as f: + return f.read() + + +async def txt_to_tree(txt_path, if_add_node_summary='no', summary_token_threshold=200, model=None, if_add_doc_description='no', if_add_node_text='yes', if_add_node_id='yes'): + text = _read_text_file(txt_path) + line_count = text.count('\n') + 1 + + doc_name = os.path.splitext(os.path.basename(txt_path))[0] + tree_structure = [{ + 'title': doc_name, + 'node_id': '0001', + 'text': text, + 'line_num': 1, + 'nodes': [], + }] + + if if_add_node_id == 'yes': + write_node_id(tree_structure) + + if if_add_node_summary == 'yes': + tree_structure = format_structure(tree_structure, order=['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes']) + tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model) + + if if_add_node_text == 'no': + tree_structure = format_structure(tree_structure, order=['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes']) + + if if_add_doc_description == 'yes': + clean_structure = create_clean_structure_for_description(tree_structure) + doc_description = generate_doc_description(clean_structure, model=model) + return { + 'doc_name': doc_name, + 'doc_description': doc_description, + 'line_count': line_count, + 'structure': tree_structure, + } + else: + if if_add_node_text == 'yes': + tree_structure = format_structure(tree_structure, order=['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes']) + else: + tree_structure = format_structure(tree_structure, order=['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes']) + + return { + 'doc_name': doc_name, + 'line_count': line_count, + 'structure': tree_structure, + } diff --git a/run_pageindex.py b/run_pageindex.py index 673439d89..0d5116ff8 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -3,6 +3,7 @@ import json from pageindex import * from pageindex.page_index_md import md_to_tree +from pageindex.page_index_txt import txt_to_tree from pageindex.utils import ConfigLoader if __name__ == "__main__": @@ -10,6 +11,7 @@ parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure') parser.add_argument('--pdf_path', type=str, help='Path to the PDF file') parser.add_argument('--md_path', type=str, help='Path to the Markdown file') + parser.add_argument('--txt_path', type=str, help='Path to the plain text file') parser.add_argument('--model', type=str, default=None, help='Model to use (overrides config.yaml)') @@ -39,10 +41,11 @@ args = parser.parse_args() # Validate that exactly one file type is specified - if not args.pdf_path and not args.md_path: - raise ValueError("Either --pdf_path or --md_path must be specified") - if args.pdf_path and args.md_path: - raise ValueError("Only one of --pdf_path or --md_path can be specified") + specified = [p for p in (args.pdf_path, args.md_path, args.txt_path) if p] + if not specified: + raise ValueError("Either --pdf_path, --md_path, or --txt_path must be specified") + if len(specified) > 1: + raise ValueError("Only one of --pdf_path, --md_path, or --txt_path can be specified") if args.pdf_path: # Validate PDF file @@ -131,4 +134,48 @@ with open(output_file, 'w', encoding='utf-8') as f: json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False) + print(f'Tree structure saved to: {output_file}') + + elif args.txt_path: + if not args.txt_path.lower().endswith('.txt'): + raise ValueError("Text file must have .txt extension") + if not os.path.isfile(args.txt_path): + raise ValueError(f"Text file not found: {args.txt_path}") + + print('Processing text file...') + + import asyncio + from pageindex.utils import ConfigLoader + config_loader = ConfigLoader() + + user_opt = { + 'model': args.model, + 'if_add_node_summary': args.if_add_node_summary, + 'if_add_doc_description': args.if_add_doc_description, + 'if_add_node_text': args.if_add_node_text, + 'if_add_node_id': args.if_add_node_id + } + + opt = config_loader.load(user_opt) + + toc_with_page_number = asyncio.run(txt_to_tree( + txt_path=args.txt_path, + if_add_node_summary=opt.if_add_node_summary, + summary_token_threshold=args.summary_token_threshold, + model=opt.model, + if_add_doc_description=opt.if_add_doc_description, + if_add_node_text=opt.if_add_node_text, + if_add_node_id=opt.if_add_node_id + )) + + print('Parsing done, saving to file...') + + txt_name = os.path.splitext(os.path.basename(args.txt_path))[0] + output_dir = './results' + output_file = f'{output_dir}/{txt_name}_structure.json' + os.makedirs(output_dir, exist_ok=True) + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False) + print(f'Tree structure saved to: {output_file}') \ No newline at end of file diff --git a/tests/test_page_index_txt.py b/tests/test_page_index_txt.py new file mode 100644 index 000000000..93e9b543a --- /dev/null +++ b/tests/test_page_index_txt.py @@ -0,0 +1,68 @@ +import asyncio +import os +import tempfile + +import pytest + +from pageindex.page_index_txt import txt_to_tree + + +def _run(coro): + return asyncio.run(coro) + + +def _write_tmp(content, suffix=".txt", encoding="utf-8"): + fd, path = tempfile.mkstemp(suffix=suffix) + os.close(fd) + with open(path, "w", encoding=encoding) as f: + f.write(content) + return path + + +def test_txt_to_tree_parses_plain_text_into_single_node(): + path = _write_tmp("Hello world.\nThis is a plain text document.\n") + try: + result = _run(txt_to_tree(txt_path=path, if_add_node_summary="no", if_add_doc_description="no")) + finally: + os.unlink(path) + + assert result["doc_name"] == os.path.splitext(os.path.basename(path))[0] + assert isinstance(result["structure"], list) + assert len(result["structure"]) == 1 + root = result["structure"][0] + assert root["text"].startswith("Hello world.") + assert "This is a plain text document." in root["text"] + + +def test_txt_to_tree_preserves_utf8_content(): + path = _write_tmp("héllo wörld — 你好\n", encoding="utf-8") + try: + result = _run(txt_to_tree(txt_path=path, if_add_node_summary="no", if_add_doc_description="no")) + finally: + os.unlink(path) + + assert "héllo wörld" in result["structure"][0]["text"] + assert "你好" in result["structure"][0]["text"] + + +def test_txt_to_tree_includes_line_count(): + path = _write_tmp("line1\nline2\nline3\n") + try: + result = _run(txt_to_tree(txt_path=path, if_add_node_summary="no", if_add_doc_description="no")) + finally: + os.unlink(path) + + assert result["line_count"] == 4 + + +def test_txt_to_tree_exposed_from_package(): + from pageindex import txt_to_tree as exported + assert exported is txt_to_tree + + +def test_client_dispatches_txt_extension(): + import inspect + from pageindex import client + src = inspect.getsource(client) + assert "txt_to_tree" in src + assert ".txt" in src