Add --output-dir / --output-file options to run_pageindex.py

Until now the result was always written to
./results/<input name>_structure.json. This patch introduces
resolve_output_path(input_path, output_dir_arg, output_file_arg):

  * output_file_arg given: the path is expanded (~ and environment
    variables), made absolute, and its parent directory is created if
    missing. The path is returned as-is.
  * otherwise: the target directory is output_dir_arg (expanded and made
    absolute) or './results'; it is created if missing, and the returned
    file is <input basename without extension>_structure.json inside it.

The two options are registered in one argparse mutually exclusive group.
Both save sites (the one using args.pdf_path and the one using
args.md_path) now call the helper instead of building the path inline.
README.md lists the new options.

NOTE(review): this patch was recovered from a copy in which line breaks
and indentation had been collapsed. Indentation and whitespace-only
lines were reconstructed from Python syntax and from the hunk line
counts; verify them against the target tree (or apply with
`git apply --ignore-whitespace`) before relying on them.

diff --git a/README.md b/README.md
index a8bfadc34..b1a782842 100644
--- a/README.md
+++ b/README.md
@@ -180,6 +180,8 @@ You can customize the processing with additional optional arguments:
 --if-add-node-id Add node ID (yes/no, default: yes)
 --if-add-node-summary Add node summary (yes/no, default: yes)
 --if-add-doc-description Add doc description (yes/no, default: yes)
+--output-dir Directory to write the output JSON (default: ./results)
+--output-file Full output file path, e.g. ~/results/my_doc.json (mutually exclusive with --output-dir)
 ```
 
 
diff --git a/run_pageindex.py b/run_pageindex.py
index 673439d89..64d8a115c 100644
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -5,6 +5,20 @@
 from pageindex.page_index_md import md_to_tree
 from pageindex.utils import ConfigLoader
 
+def resolve_output_path(input_path, output_dir_arg, output_file_arg):
+    if output_file_arg:
+        output_file = os.path.abspath(os.path.expandvars(os.path.expanduser(output_file_arg)))
+        parent = os.path.dirname(output_file)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+    else:
+        out_dir = os.path.abspath(os.path.expandvars(os.path.expanduser(output_dir_arg))) if output_dir_arg else './results'
+        os.makedirs(out_dir, exist_ok=True)
+        doc_name = os.path.splitext(os.path.basename(input_path))[0]
+        output_file = os.path.join(out_dir, f'{doc_name}_structure.json')
+    return output_file
+
+
 if __name__ == "__main__":
     # Set up argument parser
     parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
@@ -29,6 +43,13 @@
     parser.add_argument('--if-add-node-text', type=str, default=None,
                         help='Whether to add text to the node')
 
+    # Output path arguments (mutually exclusive)
+    output_group = parser.add_mutually_exclusive_group()
+    output_group.add_argument('--output-dir', type=str, default=None,
+                              help='Directory to write the output JSON (default: ./results)')
+    output_group.add_argument('--output-file', type=str, default=None,
+                              help='Full output file path, e.g. /tmp/my_doc.json')
+
     # Markdown specific arguments
     parser.add_argument('--if-thinning', type=str, default='no',
                         help='Whether to apply tree thinning for markdown (markdown only)')
@@ -69,11 +90,7 @@
         print('Parsing done, saving to file...')
 
         # Save results
-        pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
-        output_dir = './results'
-        output_file = f'{output_dir}/{pdf_name}_structure.json'
-        os.makedirs(output_dir, exist_ok=True)
-
+        output_file = resolve_output_path(args.pdf_path, args.output_dir, args.output_file)
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(toc_with_page_number, f, indent=2)
 
@@ -123,11 +140,7 @@
         print('Parsing done, saving to file...')
 
         # Save results
-        md_name = os.path.splitext(os.path.basename(args.md_path))[0]
-        output_dir = './results'
-        output_file = f'{output_dir}/{md_name}_structure.json'
-        os.makedirs(output_dir, exist_ok=True)
-
+        output_file = resolve_output_path(args.md_path, args.output_dir, args.output_file)
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
 