From ecd2bc81eb004acbac5db37d8c4b1da92f6c45e3 Mon Sep 17 00:00:00 2001
From: ethan <ethanguo.2003@gmail.com>
Date: Sat, 2 May 2026 11:03:47 +0000
Subject: [PATCH] feat(pageindex): add .txt parsing support

Fixes #222

Add a new pageindex/page_index_txt.py module exposing async txt_to_tree(),
which reads UTF-8 (with latin-1 fallback) and returns a single-root-node
tree mirroring md_to_tree's output shape (doc_name, line_count, structure).
Wired into pageindex package exports, PageIndexClient.index() (new
mode='txt' branch and is_txt auto-detection; _make_meta_entry treats txt
like md for line_count), and run_pageindex.py CLI (--txt_path with the
same validation/save flow as --md_path).

Co-Authored-By: Claude <noreply@anthropic.com>
---
 pageindex/__init__.py        |  1 +
 pageindex/client.py          | 31 +++++++++++++++-
 pageindex/page_index_txt.py  | 61 ++++++++++++++++++++++++++++++++
 run_pageindex.py             | 55 ++++++++++++++++++++++++++---
 tests/test_page_index_txt.py | 68 ++++++++++++++++++++++++++++++++++++
 5 files changed, 211 insertions(+), 5 deletions(-)
 create mode 100644 pageindex/page_index_txt.py
 create mode 100644 tests/test_page_index_txt.py

diff --git a/pageindex/__init__.py b/pageindex/__init__.py
index 658003bf5..0c4481194 100644
--- a/pageindex/__init__.py
+++ b/pageindex/__init__.py
@@ -1,4 +1,5 @@
 from .page_index import *
 from .page_index_md import md_to_tree
+from .page_index_txt import txt_to_tree
 from .retrieve import get_document, get_document_structure, get_page_content
 from .client import PageIndexClient
diff --git a/pageindex/client.py b/pageindex/client.py
index 894dab181..4c6fd251d 100644
--- a/pageindex/client.py
+++ b/pageindex/client.py
@@ -9,6 +9,7 @@
 
 from .page_index import page_index
 from .page_index_md import md_to_tree
+from .page_index_txt import txt_to_tree
 from .retrieve import get_document, get_document_structure, get_page_content
 from .utils import ConfigLoader, remove_fields
 
@@ -65,6 +66,7 @@ def index(self, file_path: str, mode: str = "auto") -> str:
 
         is_pdf = ext == '.pdf'
         is_md = ext in ['.md', '.markdown']
+        is_txt = ext == '.txt'
 
         if mode == "pdf" or (mode == "auto" and is_pdf):
             print(f"Indexing PDF: {file_path}")
@@ -121,6 +123,33 @@ def index(self, file_path: str, mode: str = "auto") -> str:
                 'line_count': result.get('line_count', 0),
                 'structure': result['structure'],
             }
+
+        elif mode == "txt" or (mode == "auto" and is_txt):
+            print(f"Indexing Text: {file_path}")
+            coro = txt_to_tree(
+                txt_path=file_path,
+                if_add_node_summary='yes',
+                summary_token_threshold=200,
+                model=self.model,
+                if_add_doc_description='yes',
+                if_add_node_text='yes',
+                if_add_node_id='yes'
+            )
+            try:
+                asyncio.get_running_loop()
+                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+                    result = pool.submit(asyncio.run, coro).result()
+            except RuntimeError:
+                result = asyncio.run(coro)
+            self.documents[doc_id] = {
+                'id': doc_id,
+                'type': 'txt',
+                'path': file_path,
+                'doc_name': result.get('doc_name', ''),
+                'doc_description': result.get('doc_description', ''),
+                'line_count': result.get('line_count', 0),
+                'structure': result['structure'],
+            }
         else:
             raise ValueError(f"Unsupported file format for: {file_path}")
 
@@ -140,7 +169,7 @@ def _make_meta_entry(doc: dict) -> dict:
         }
         if doc.get('type') == 'pdf':
             entry['page_count'] = doc.get('page_count')
-        elif doc.get('type') == 'md':
+        elif doc.get('type') in ('md', 'txt'):
             entry['line_count'] = doc.get('line_count')
         return entry
 
diff --git a/pageindex/page_index_txt.py b/pageindex/page_index_txt.py
new file mode 100644
index 000000000..05a34b309
--- /dev/null
+++ b/pageindex/page_index_txt.py
@@ -0,0 +1,107 @@
+"""Build a single-node PageIndex tree from a plain-text (.txt) file."""
+import os
+
+try:
+    from .utils import *
+    from .page_index_md import generate_summaries_for_structure_md
+except ImportError:
+    # Fallback for when the relative imports fail, e.g. the module is not
+    # loaded as part of the package.
+    from utils import *
+    from page_index_md import generate_summaries_for_structure_md
+
+
+def _read_text_file(txt_path):
+    """Return the decoded text content of ``txt_path``.
+
+    The file is decoded as UTF-8 first. The ``utf-8-sig`` codec is used so a
+    leading byte-order mark is stripped rather than ending up as the first
+    character of the text. If the bytes are not valid UTF-8, the file is
+    re-read as latin-1.
+    """
+    try:
+        with open(txt_path, 'r', encoding='utf-8-sig') as f:
+            return f.read()
+    except UnicodeDecodeError:
+        with open(txt_path, 'r', encoding='latin-1') as f:
+            return f.read()
+
+
+def _node_key_order(include_text):
+    """Return a fresh key order for format_structure, with or without 'text'."""
+    keys = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes']
+    if not include_text:
+        keys.remove('text')
+    return keys
+
+
+async def txt_to_tree(
+    txt_path,
+    if_add_node_summary='no',
+    summary_token_threshold=200,
+    model=None,
+    if_add_doc_description='no',
+    if_add_node_text='yes',
+    if_add_node_id='yes',
+):
+    """Wrap a plain-text file in a tree containing a single root node.
+
+    Args:
+        txt_path: Path of the text file to read.
+        if_add_node_summary: 'yes' to run summary generation on the node.
+        summary_token_threshold: Forwarded to
+            generate_summaries_for_structure_md.
+        model: Forwarded to the summary and document-description helpers.
+        if_add_doc_description: 'yes' to add 'doc_description' to the result;
+            only honoured when if_add_node_summary is also 'yes'.
+        if_add_node_text: Controls whether 'text' is kept in the key order
+            passed to format_structure.
+        if_add_node_id: 'yes' to call write_node_id on the tree.
+
+    Returns:
+        A dict with 'doc_name' (file name without extension), 'line_count'
+        and 'structure' (a one-element list), plus 'doc_description' when
+        requested as described above.
+    """
+    text = _read_text_file(txt_path)
+    # Newline count + 1: a file that ends with a newline counts one extra line.
+    line_count = text.count('\n') + 1
+
+    doc_name = os.path.splitext(os.path.basename(txt_path))[0]
+    # NOTE(review): the seed 'node_id' is set even when if_add_node_id is not
+    # 'yes'; confirm that is intended.
+    tree_structure = [{
+        'title': doc_name,
+        'node_id': '0001',
+        'text': text,
+        'line_num': 1,
+        'nodes': [],
+    }]
+
+    if if_add_node_id == 'yes':
+        write_node_id(tree_structure)
+
+    # 'doc_name' first so an optional 'doc_description' lands right after it.
+    result = {'doc_name': doc_name}
+
+    if if_add_node_summary == 'yes':
+        # Keep 'text' in the nodes while summaries are generated; it is only
+        # dropped afterwards.
+        tree_structure = format_structure(tree_structure, order=_node_key_order(True))
+        tree_structure = await generate_summaries_for_structure_md(
+            tree_structure, summary_token_threshold=summary_token_threshold, model=model
+        )
+
+        if if_add_node_text == 'no':
+            tree_structure = format_structure(tree_structure, order=_node_key_order(False))
+
+        if if_add_doc_description == 'yes':
+            clean_structure = create_clean_structure_for_description(tree_structure)
+            result['doc_description'] = generate_doc_description(clean_structure, model=model)
+    else:
+        include_text = if_add_node_text == 'yes'
+        tree_structure = format_structure(tree_structure, order=_node_key_order(include_text))
+
+    result['line_count'] = line_count
+    result['structure'] = tree_structure
+    return result
diff --git a/run_pageindex.py b/run_pageindex.py
index 673439d89..0d5116ff8 100644
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -3,6 +3,7 @@
 import json
 from pageindex import *
 from pageindex.page_index_md import md_to_tree
+from pageindex.page_index_txt import txt_to_tree
 from pageindex.utils import ConfigLoader
 
 if __name__ == "__main__":
@@ -10,6 +11,7 @@
     parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
     parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
     parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
+    parser.add_argument('--txt_path', type=str, help='Path to the plain text file')
 
     parser.add_argument('--model', type=str, default=None, help='Model to use (overrides config.yaml)')
 
@@ -39,10 +41,11 @@
     args = parser.parse_args()
     
     # Validate that exactly one file type is specified
-    if not args.pdf_path and not args.md_path:
-        raise ValueError("Either --pdf_path or --md_path must be specified")
-    if args.pdf_path and args.md_path:
-        raise ValueError("Only one of --pdf_path or --md_path can be specified")
+    specified = [p for p in (args.pdf_path, args.md_path, args.txt_path) if p]
+    if not specified:
+        raise ValueError("Either --pdf_path, --md_path, or --txt_path must be specified")
+    if len(specified) > 1:
+        raise ValueError("Only one of --pdf_path, --md_path, or --txt_path can be specified")
     
     if args.pdf_path:
         # Validate PDF file
@@ -131,4 +134,48 @@
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
         
+        print(f'Tree structure saved to: {output_file}')
+
+    elif args.txt_path:
+        if not args.txt_path.lower().endswith('.txt'):
+            raise ValueError("Text file must have .txt extension")
+        if not os.path.isfile(args.txt_path):
+            raise ValueError(f"Text file not found: {args.txt_path}")
+
+        print('Processing text file...')
+
+        import asyncio
+        from pageindex.utils import ConfigLoader
+        config_loader = ConfigLoader()
+
+        user_opt = {
+            'model': args.model,
+            'if_add_node_summary': args.if_add_node_summary,
+            'if_add_doc_description': args.if_add_doc_description,
+            'if_add_node_text': args.if_add_node_text,
+            'if_add_node_id': args.if_add_node_id
+        }
+
+        opt = config_loader.load(user_opt)
+
+        toc_with_page_number = asyncio.run(txt_to_tree(
+            txt_path=args.txt_path,
+            if_add_node_summary=opt.if_add_node_summary,
+            summary_token_threshold=args.summary_token_threshold,
+            model=opt.model,
+            if_add_doc_description=opt.if_add_doc_description,
+            if_add_node_text=opt.if_add_node_text,
+            if_add_node_id=opt.if_add_node_id
+        ))
+
+        print('Parsing done, saving to file...')
+
+        txt_name = os.path.splitext(os.path.basename(args.txt_path))[0]
+        output_dir = './results'
+        output_file = f'{output_dir}/{txt_name}_structure.json'
+        os.makedirs(output_dir, exist_ok=True)
+
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
+
         print(f'Tree structure saved to: {output_file}')
\ No newline at end of file
diff --git a/tests/test_page_index_txt.py b/tests/test_page_index_txt.py
new file mode 100644
index 000000000..93e9b543a
--- /dev/null
+++ b/tests/test_page_index_txt.py
@@ -0,0 +1,78 @@
+"""Tests for pageindex.page_index_txt.txt_to_tree."""
+import asyncio
+
+import pytest
+
+from pageindex.page_index_txt import txt_to_tree
+
+
+def _run(coro):
+    """Run ``coro`` with asyncio.run and return its result."""
+    return asyncio.run(coro)
+
+
+def _index(path):
+    """Run txt_to_tree on ``path`` with summaries and description disabled."""
+    return _run(txt_to_tree(txt_path=str(path), if_add_node_summary="no", if_add_doc_description="no"))
+
+
+def test_txt_to_tree_parses_plain_text_into_single_node(tmp_path):
+    path = tmp_path / "sample.txt"
+    path.write_text("Hello world.\nThis is a plain text document.\n", encoding="utf-8")
+
+    result = _index(path)
+
+    assert result["doc_name"] == "sample"
+    assert isinstance(result["structure"], list)
+    assert len(result["structure"]) == 1
+    root = result["structure"][0]
+    assert root["text"].startswith("Hello world.")
+    assert "This is a plain text document." in root["text"]
+
+
+def test_txt_to_tree_preserves_utf8_content(tmp_path):
+    path = tmp_path / "unicode.txt"
+    path.write_text("héllo wörld — 你好\n", encoding="utf-8")
+
+    text = _index(path)["structure"][0]["text"]
+
+    assert "héllo wörld" in text
+    assert "你好" in text
+
+
+def test_txt_to_tree_falls_back_to_latin1(tmp_path):
+    # 0xE9 followed by a newline is not valid UTF-8, so decoding must fall
+    # back to latin-1, where 0xE9 is "é".
+    path = tmp_path / "legacy.txt"
+    path.write_bytes(b"caf\xe9\n")
+
+    assert "caf\u00e9" in _index(path)["structure"][0]["text"]
+
+
+@pytest.mark.parametrize(
+    "content, expected",
+    [
+        ("line1\nline2\nline3\n", 4),
+        ("no trailing newline", 1),
+    ],
+)
+def test_txt_to_tree_includes_line_count(tmp_path, content, expected):
+    # line_count is newline count + 1, so a trailing newline adds a line.
+    path = tmp_path / "lines.txt"
+    path.write_text(content, encoding="utf-8")
+
+    assert _index(path)["line_count"] == expected
+
+
+def test_txt_to_tree_exposed_from_package():
+    from pageindex import txt_to_tree as exported
+    assert exported is txt_to_tree
+
+
+def test_client_dispatches_txt_extension():
+    # Source-level smoke check only: it does not execute the client.
+    import inspect
+    from pageindex import client
+    src = inspect.getsource(client)
+    assert "txt_to_tree" in src
+    assert ".txt" in src