From 61f1ace87d08d4a6d92648e950890cbe9273a5ba Mon Sep 17 00:00:00 2001 From: goktugibolar Date: Sun, 10 May 2026 00:54:58 +0300 Subject: [PATCH 1/2] Add --output-dir and --output-file support to CLI --- README.md | 2 ++ run_pageindex.py | 36 +++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a8bfadc34..3acfd611b 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,8 @@ You can customize the processing with additional optional arguments: --if-add-node-id Add node ID (yes/no, default: yes) --if-add-node-summary Add node summary (yes/no, default: yes) --if-add-doc-description Add doc description (yes/no, default: yes) +--output-dir Directory to write the output JSON (default: ./results) +--output-file Full output file path, e.g. /tmp/my_doc.json ``` diff --git a/run_pageindex.py b/run_pageindex.py index 673439d89..d941f062a 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -29,6 +29,13 @@ parser.add_argument('--if-add-node-text', type=str, default=None, help='Whether to add text to the node') + # Output path arguments (mutually exclusive) + output_group = parser.add_mutually_exclusive_group() + output_group.add_argument('--output-dir', type=str, default=None, + help='Directory to write the output JSON (default: ./results)') + output_group.add_argument('--output-file', type=str, default=None, + help='Full output file path, e.g. 
/tmp/my_doc.json') + # Markdown specific arguments parser.add_argument('--if-thinning', type=str, default='no', help='Whether to apply tree thinning for markdown (markdown only)') @@ -69,11 +76,16 @@ print('Parsing done, saving to file...') # Save results - pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0] - output_dir = './results' - output_file = f'{output_dir}/{pdf_name}_structure.json' - os.makedirs(output_dir, exist_ok=True) - + pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0] + if args.output_file: + output_file = args.output_file + parent = os.path.dirname(os.path.abspath(output_file)) + if parent: + os.makedirs(parent, exist_ok=True) + else: + output_dir = args.output_dir if args.output_dir else './results' + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.join(output_dir, f'{pdf_name}_structure.json') with open(output_file, 'w', encoding='utf-8') as f: json.dump(toc_with_page_number, f, indent=2) @@ -123,10 +135,16 @@ print('Parsing done, saving to file...') # Save results - md_name = os.path.splitext(os.path.basename(args.md_path))[0] - output_dir = './results' - output_file = f'{output_dir}/{md_name}_structure.json' - os.makedirs(output_dir, exist_ok=True) + md_name = os.path.splitext(os.path.basename(args.md_path))[0] + if args.output_file: + output_file = args.output_file + parent = os.path.dirname(os.path.abspath(output_file)) + if parent: + os.makedirs(parent, exist_ok=True) + else: + output_dir = args.output_dir if args.output_dir else './results' + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.join(output_dir, f'{md_name}_structure.json') with open(output_file, 'w', encoding='utf-8') as f: json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False) From a25662dffdafaa9ed094e4b2149d6207a5fd9288 Mon Sep 17 00:00:00 2001 From: goktugibolar Date: Sun, 10 May 2026 01:09:26 +0300 Subject: [PATCH 2/2] Address Copilot review: extract helper, normalize paths, update README --- README.md | 2 
def resolve_output_path(input_path, output_dir_arg, output_file_arg):
    """Return the path of the JSON structure file and create its directory.

    Args:
        input_path: Path of the source document. Its base name, without the
            extension, seeds the default ``<name>_structure.json`` file name.
        output_dir_arg: Value of ``--output-dir``; None or empty selects the
            ``./results`` fallback.
        output_file_arg: Value of ``--output-file``; None or empty when not
            given. Takes precedence over ``output_dir_arg``. If it names an
            existing directory or ends with a path separator, the default
            file name is placed inside that directory.

    Returns:
        The path to write. Paths derived from a user-supplied argument are
        absolute, with ``~`` and environment variables expanded; the
        ``./results`` fallback is left relative.

    Raises:
        OSError: If the target directory cannot be created.
    """
    doc_name = os.path.splitext(os.path.basename(input_path))[0]
    default_name = f'{doc_name}_structure.json'

    if output_file_arg:
        expanded = os.path.expandvars(os.path.expanduser(output_file_arg))
        output_file = os.path.abspath(expanded)
        # abspath() strips a trailing separator, so the "this is a directory"
        # hint has to be read from the expanded string, not the result.
        separators = tuple(sep for sep in (os.sep, os.altsep) if sep)
        if expanded.endswith(separators) or os.path.isdir(output_file):
            # Saving happens after the (expensive) parse; writing into the
            # directory keeps the result instead of failing in open().
            output_file = os.path.join(output_file, default_name)
    else:
        if output_dir_arg:
            out_dir = os.path.abspath(
                os.path.expandvars(os.path.expanduser(output_dir_arg))
            )
        else:
            out_dir = './results'
        output_file = os.path.join(out_dir, default_name)

    # Every branch above yields a path with a non-empty directory part.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    return output_file
args.output_dir else './results' - os.makedirs(output_dir, exist_ok=True) - output_file = os.path.join(output_dir, f'{pdf_name}_structure.json') + output_file = resolve_output_path(args.pdf_path, args.output_dir, args.output_file) with open(output_file, 'w', encoding='utf-8') as f: json.dump(toc_with_page_number, f, indent=2) @@ -135,17 +140,7 @@ print('Parsing done, saving to file...') # Save results - md_name = os.path.splitext(os.path.basename(args.md_path))[0] - if args.output_file: - output_file = args.output_file - parent = os.path.dirname(os.path.abspath(output_file)) - if parent: - os.makedirs(parent, exist_ok=True) - else: - output_dir = args.output_dir if args.output_dir else './results' - os.makedirs(output_dir, exist_ok=True) - output_file = os.path.join(output_dir, f'{md_name}_structure.json') - + output_file = resolve_output_path(args.md_path, args.output_dir, args.output_file) with open(output_file, 'w', encoding='utf-8') as f: json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)