Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ You can customize the processing with additional optional arguments:
--if-add-node-id Add node ID (yes/no, default: yes)
--if-add-node-summary Add node summary (yes/no, default: yes)
--if-add-doc-description Add doc description (yes/no, default: yes)
--output-dir Directory to write the output JSON (default: ./results)
--output-file Full output file path, e.g. ~/results/my_doc.json (mutually exclusive with --output-dir)
```
</details>

Expand Down
33 changes: 23 additions & 10 deletions run_pageindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,20 @@
from pageindex.page_index_md import md_to_tree
from pageindex.utils import ConfigLoader

def resolve_output_path(input_path, output_dir_arg, output_file_arg):
    """Return the path of the JSON output file, creating its directory.

    Args:
        input_path: Path of the input document. Its base name, minus the
            extension, names the output file when no explicit file is given.
        output_dir_arg: Directory to write into, or a falsy value to fall
            back to ``./results``. Ignored when ``output_file_arg`` is set.
        output_file_arg: Full output file path, or a falsy value to derive
            ``<doc_name>_structure.json`` inside the output directory.

    Returns:
        The output file path. Explicitly supplied locations are returned as
        absolute paths with ``~`` and environment variables expanded; the
        default directory is left relative (``./results``).

    Raises:
        IsADirectoryError: If ``output_file_arg`` names an existing directory.
        OSError: If the target directory cannot be created.
    """

    def expand(path):
        # Expand "~" first, then $VARS, then anchor to the current directory.
        return os.path.abspath(os.path.expandvars(os.path.expanduser(path)))

    if output_file_arg:
        output_file = expand(output_file_arg)
        # abspath() drops a trailing separator, so "out/" and "out" look the
        # same here. Reject an existing directory explicitly instead of
        # letting the later open() fail with a platform-dependent error.
        if os.path.isdir(output_file):
            raise IsADirectoryError(
                f'--output-file must be a file path, but is a directory: {output_file}'
            )
        # dirname() of an absolute path is never empty, so no guard is needed.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        return output_file

    out_dir = expand(output_dir_arg) if output_dir_arg else './results'
    os.makedirs(out_dir, exist_ok=True)
    doc_name = os.path.splitext(os.path.basename(input_path))[0]
    return os.path.join(out_dir, f'{doc_name}_structure.json')


if __name__ == "__main__":
# Set up argument parser
parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
Expand All @@ -29,6 +43,13 @@
parser.add_argument('--if-add-node-text', type=str, default=None,
help='Whether to add text to the node')

# Output path arguments (mutually exclusive)
output_group = parser.add_mutually_exclusive_group()
output_group.add_argument('--output-dir', type=str, default=None,
help='Directory to write the output JSON (default: ./results)')
output_group.add_argument('--output-file', type=str, default=None,
help='Full output file path, e.g. /tmp/my_doc.json')

# Markdown specific arguments
parser.add_argument('--if-thinning', type=str, default='no',
help='Whether to apply tree thinning for markdown (markdown only)')
Expand Down Expand Up @@ -69,11 +90,7 @@
print('Parsing done, saving to file...')

# Save results
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{pdf_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

output_file = resolve_output_path(args.pdf_path, args.output_dir, args.output_file)
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2)

Expand Down Expand Up @@ -123,11 +140,7 @@
print('Parsing done, saving to file...')

# Save results
md_name = os.path.splitext(os.path.basename(args.md_path))[0]
output_dir = './results'
output_file = f'{output_dir}/{md_name}_structure.json'
os.makedirs(output_dir, exist_ok=True)

output_file = resolve_output_path(args.md_path, args.output_dir, args.output_file)
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)

Expand Down
Loading