From 9db4c944cccedff14943eb5be2e06bda0faa18d1 Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Sat, 6 Jun 2026 18:56:51 +0200 Subject: [PATCH 1/4] Get XMP, Outline and tags --- docs/introduction.md | 4 + docs/reference/cli.md | 688 +++++++++++-------- docs/reference/configuration.md | 208 +++--- docs/tutorials/using_cli.md | 88 +++ src/parxy_cli/commands/pdf.py | 881 ++++++++++++++++++++++++- src/parxy_core/services/pdf_service.py | 856 ++++++++++++++++++++++++ tests/commands/test_pdf.py | 329 +++++++++ tests/services/test_pdf_service.py | 395 +++++++++++ 8 files changed, 3075 insertions(+), 374 deletions(-) diff --git a/docs/introduction.md b/docs/introduction.md index 0eb7520..0ade9d4 100644 --- a/docs/introduction.md +++ b/docs/introduction.md @@ -44,6 +44,10 @@ Once installed, `parxy` provides the following commands: | `parxy docker` | Generate a Docker Compose configuration for self-hosted services | | `parxy pdf:merge` | Merge multiple PDF files into one, with support for selecting specific page ranges | | `parxy pdf:split` | Split a PDF file into individual pages | +| `parxy pdf:outline` | Print or export a PDF's outline (bookmarks / table of contents) | +| `parxy pdf:tags` | Extract the tag (structure) tree of a tagged, accessible PDF | +| `parxy pdf:tags-check` | Check whether a PDF is a tagged (accessible) PDF | +| `parxy pdf:xmp` | Read and extract the XMP metadata of a PDF | ```bash # Parse a PDF to markdown diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 59922bc..1fac29e 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -1,269 +1,419 @@ ---- -title: CLI reference -description: Command line reference with all parxy commands, including arguments, options, types, and defaults. Prefer to run parxy --help and parxy --help if you have access to the terminal. ---- - - - - -# CLI reference - -## `parxy agents` - -Set up AI agent configuration files for Parxy projects. 
- -Creates or updates an AGENTS.md file with Parxy usage documentation. -If AGENTS.md exists, the Parxy section (marked with tags) is -added or updated while preserving other content. - -Optionally creates Claude Code skill files for common operations. - -``` -parxy agents [OPTIONS] -``` - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `path` | - | Output directory for agent files. Defaults to current directory. | -| `--overwrite` | `-f` | `flag` | `false` | Overwrite existing Parxy section without prompting. | - -## `parxy attach` - -Extract an attached file from a PDF - -``` -parxy attach [OPTIONS] INPUT_FILE NAME -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file containing the attachment | -| `NAME` | Yes | Name of attached file to extract | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path. If not specified, saves to current directory with original name. | -| `--stdout` | - | `flag` | `false` | Output content to stdout (text files only) | - -## `parxy attach:add` - -Add files as attachments to a PDF - -``` -parxy attach:add [OPTIONS] INPUT_FILE FILES... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to add attachments to | -| `FILES` | Yes | One or more files to attach | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_with_attachments.pdf | -| `--description` | `-d` | `text` | - | Description for attached file(s). Matched by position to files. | -| `--name` | `-n` | `text` | - | Custom name(s) for attached file(s). 
Matched by position to files. | -| `--overwrite` | - | `flag` | `false` | Overwrite existing attachments with same name | - -## `parxy attach:list` - -List attached files in a PDF - -``` -parxy attach:list [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to inspect | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--verbose` | `-v` | `flag` | `false` | Show detailed information | - -## `parxy attach:remove` - -Remove attached files from a PDF - -``` -parxy attach:remove [OPTIONS] INPUT_FILE NAMES... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to process | -| `NAMES` | No | Names of attachments to remove | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_no_attachments.pdf | -| `--all` | - | `flag` | `false` | Remove all attached files | - -## `parxy docker` - -Create a Docker Compose file to run self-hostable parsers (experimental). - -``` -parxy docker -``` - -## `parxy drivers` - -List supported drivers. - -``` -parxy drivers -``` - -## `parxy env` - -Create an environment file with Parxy configuration. - -``` -parxy env -``` - -## `parxy markdown` - -Parse documents to Markdown. - -Accepts PDF files (parsed on-the-fly) or pre-parsed JSON result files -(loaded directly from the Document model without re-parsing). - -``` -parxy markdown [OPTIONS] INPUTS... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. 
| - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. (default: pymupdf or PARXY_DEFAULT_DRIVER) | -| `--level` | `-l` | `page` | `block` | `line` | `span` | `character` | `block` | Extraction level | -| `--output` | `-o` | `text` | - | Directory to save markdown files. If not specified, files are saved next to the source files. | -| `--inline` | `-i` | `flag` | `false` | Output markdown to stdout with file name as YAML frontmatter. Only valid with a single file. | -| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | -| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | -| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | -| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | -| `--page-separators` | - | `flag` | `false` | Insert HTML comments before each page's content. | - -## `parxy parse` - -Parse documents using one or more drivers. - -This command processes PDF documents and extracts their content in various formats. -You can specify individual files or entire folders to process. - -``` -parxy parse [OPTIONS] INPUTS... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. 
(default: pymupdf or PARXY_DEFAULT_DRIVER) | -| `--level` | `-l` | `page` | `block` | `line` | `span` | `character` | `block` | Extraction level | -| `--mode` | `-m` | `json` | `plain` | `markdown` | `json` | Output mode: json (JSON serialization), plain (plain text), or markdown (markdown format) | -| `--output` | `-o` | `text` | - | Directory to save output files. If not specified, files will be saved in the same directory as the source files. | -| `--show` | `-s` | `flag` | `false` | Show document content in console in addition to saving to files | -| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | -| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | -| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | -| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | - -## `parxy pdf:merge` - -Merge multiple PDF files into a single PDF - -``` -parxy pdf:merge [OPTIONS] INPUTS... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUTS` | Yes | One or more PDF files or folders to merge. Files support page ranges in square brackets (e.g., file.pdf[1:3]). Folders are processed non-recursively. | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path for the merged PDF. If not specified, you will be prompted. 
| - -## `parxy pdf:split` - -Split a PDF file into individual pages - -``` -parxy pdf:split [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to split | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output path. Without --combine: output directory for split files (default: folder next to input). With --combine: output file path (default: {stem}_pages_{from}-{to}.pdf next to input). | -| `--prefix` | `-p` | `text` | - | Prefix for output filenames. If not specified, uses the input filename. | -| `--pages` | - | `text` | - | Page range to extract (1-based). Examples: "1" (single page), "1:3" (pages 1-3), ":3" (up to page 3), "3:" (from page 3). If not specified, all pages are extracted. | -| `--combine` | - | `flag` | `false` | Combine extracted pages into a single PDF instead of one file per page. | - -## `parxy tui` - -Launch the Parxy TUI for interactive parser comparison - -``` -parxy tui WORKSPACE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `WORKSPACE` | No | Path to the workspace folder (optional — can be selected inside the TUI) | - -## `parxy version` - -Print Parxy version information. - -``` -parxy version -``` +--- +title: CLI reference +description: Command line reference with all parxy commands, including arguments, options, types, and defaults. Prefer to run parxy --help and parxy COMMAND --help if you have access to the terminal. +--- + + + + +# CLI reference + +## `parxy agents` + +Set up AI agent configuration files for Parxy projects. + +Creates or updates an AGENTS.md file with Parxy usage documentation. +If AGENTS.md exists, the Parxy section (marked with tags) is +added or updated while preserving other content. + +Optionally creates Claude Code skill files for common operations. 
+ +``` +parxy agents [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `path` | - | Output directory for agent files. Defaults to current directory. | +| `--overwrite` | `-f` | `flag` | `false` | Overwrite existing Parxy section without prompting. | + +## `parxy attach` + +Extract an attached file from a PDF + +``` +parxy attach [OPTIONS] INPUT_FILE NAME +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file containing the attachment | +| `NAME` | Yes | Name of attached file to extract | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. If not specified, saves to current directory with original name. | +| `--stdout` | - | `flag` | `false` | Output content to stdout (text files only) | + +## `parxy attach:add` + +Add files as attachments to a PDF + +``` +parxy attach:add [OPTIONS] INPUT_FILE FILES... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to add attachments to | +| `FILES` | Yes | One or more files to attach | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_with_attachments.pdf | +| `--description` | `-d` | `text` | - | Description for attached file(s). Matched by position to files. | +| `--name` | `-n` | `text` | - | Custom name(s) for attached file(s). Matched by position to files. 
| +| `--overwrite` | - | `flag` | `false` | Overwrite existing attachments with same name | + +## `parxy attach:list` + +List attached files in a PDF + +``` +parxy attach:list [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--verbose` | `-v` | `flag` | `false` | Show detailed information | + +## `parxy attach:remove` + +Remove attached files from a PDF + +``` +parxy attach:remove [OPTIONS] INPUT_FILE NAMES... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to process | +| `NAMES` | No | Names of attachments to remove | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_no_attachments.pdf | +| `--all` | - | `flag` | `false` | Remove all attached files | + +## `parxy docker` + +Create a Docker Compose file to run self-hostable parsers (experimental). + +``` +parxy docker +``` + +## `parxy drivers` + +List supported drivers. + +``` +parxy drivers +``` + +## `parxy env` + +Create an environment file with Parxy configuration. + +``` +parxy env +``` + +## `parxy markdown` + +Parse documents to Markdown. + +Accepts PDF files (parsed on-the-fly) or pre-parsed JSON result files +(loaded directly from the Document model without re-parsing). + +``` +parxy markdown [OPTIONS] INPUTS... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. 
| + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. (default: pymupdf or PARXY_DEFAULT_DRIVER) | +| `--level` | `-l` | `page` \| `block` \| `line` \| `span` \| `character` | `block` | Extraction level | +| `--output` | `-o` | `text` | - | Directory to save markdown files. If not specified, files are saved next to the source files. | +| `--inline` | `-i` | `flag` | `false` | Output markdown to stdout with file name as YAML frontmatter. Only valid with a single file. | +| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | +| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | +| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | +| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | +| `--page-separators` | - | `flag` | `false` | Insert HTML comments before each page's content. | + +## `parxy parse` + +Parse documents using one or more drivers. + +This command processes PDF documents and extracts their content in various formats. +You can specify individual files or entire folders to process. + +``` +parxy parse [OPTIONS] INPUTS... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. 
(default: pymupdf or PARXY_DEFAULT_DRIVER) | +| `--level` | `-l` | `page` \| `block` \| `line` \| `span` \| `character` | `block` | Extraction level | +| `--mode` | `-m` | `json` \| `plain` \| `markdown` | `json` | Output mode: json (JSON serialization), plain (plain text), or markdown (markdown format) | +| `--output` | `-o` | `text` | - | Directory to save output files. If not specified, files will be saved in the same directory as the source files. | +| `--show` | `-s` | `flag` | `false` | Show document content in console in addition to saving to files | +| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | +| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | +| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | +| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | + +## `parxy pdf:merge` + +Merge multiple PDF files into a single PDF + +``` +parxy pdf:merge [OPTIONS] INPUTS... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more PDF files or folders to merge. Files support page ranges in square brackets (e.g., file.pdf[1:3]). Folders are processed non-recursively. | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path for the merged PDF. If not specified, you will be prompted. 
| + +## `parxy pdf:outline` + +Print or export the outline (bookmarks / table of contents) of a PDF + +``` +parxy pdf:outline [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Write the outline as JSON to this file instead of printing a tree. | +| `--json` | - | `flag` | `false` | Print the outline as JSON to stdout. | +| `--flat` | - | `flag` | `false` | Print a flat, indented list instead of a tree. | + +## `parxy pdf:split` + +Split a PDF file into individual pages + +``` +parxy pdf:split [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to split | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output path. Without --combine: output directory for split files (default: folder next to input). With --combine: output file path (default: {stem}_pages_{from}-{to}.pdf next to input). | +| `--prefix` | `-p` | `text` | - | Prefix for output filenames. If not specified, uses the input filename. | +| `--pages` | - | `text` | - | Page range to extract (1-based). Examples: "1" (single page), "1:3" (pages 1-3), ":3" (up to page 3), "3:" (from page 3). If not specified, all pages are extracted. | +| `--combine` | - | `flag` | `false` | Combine extracted pages into a single PDF instead of one file per page. | +| `--every` | `-e` | `integer` | - | Split into chunks of N pages each. Cannot be used with --combine. 
| + +## `parxy pdf:split-by-text` + +Split a PDF into chunks whenever a page matches a text condition + +``` +parxy pdf:split-by-text [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to split | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--text` | `-t` | `text` | - | Text to match. Can be repeated for multiple patterns (OR logic). | +| `--mode` | `-m` | `text` | `contains` | Matching mode: "contains" (default) or "starts-with". | +| `--ignore-case` | `-i` | `flag` | `false` | Case-insensitive matching. | +| `--regex` | - | `flag` | `false` | Treat --text values as regular expressions. | +| `--discard-preamble` | - | `flag` | `false` | Discard pages that appear before the first matching page. | +| `--output` | `-o` | `text` | - | Output directory for chunk files (default: {stem}_split next to input). | +| `--prefix` | `-p` | `text` | - | Prefix for output filenames. Defaults to the input filename stem. | + +## `parxy pdf:tag-skeleton` + +Copy a tagged PDF keeping its tags but removing visible content + +``` +parxy pdf:tag-skeleton [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | Tagged PDF file to strip | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output path for the tags-only PDF (default: {stem}_tags.pdf next to input). | + +## `parxy pdf:tag-template` + +Create an empty tagged PDF skeleton for accessibility work + +``` +parxy pdf:tag-template [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path for the template PDF. 
If not specified, you will be prompted. | +| `--pages` | - | `integer` | `1` | Number of blank pages to create (default: 1). | +| `--lang` | - | `text` | `en-US` | Document language tag set on the catalog (default: en-US). | +| `--title` | - | `text` | - | Optional document title stored in the PDF metadata. | + +## `parxy pdf:tags` + +Extract the tag (structure) tree of a tagged PDF + +``` +parxy pdf:tags [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Write the extracted tags as JSON to this file instead of printing a tree. | +| `--json` | - | `flag` | `false` | Print the extracted tags as JSON to stdout. | +| `--text` | - | `flag` | `false` | Include the text content of each element. Rebuilds the tree per page; accessibility attributes (alt text, page refs) are not shown in this mode. | + +## `parxy pdf:tags-check` + +Check whether a PDF is a tagged (accessible) PDF + +``` +parxy pdf:tags-check [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--json` | - | `flag` | `false` | Output the detection result as JSON. 
| + +## `parxy pdf:xmp` + +Read and extract the XMP metadata of a PDF + +``` +parxy pdf:xmp [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Write the metadata to this file. A .xml extension writes the raw XMP packet; any other extension writes parsed JSON. | +| `--json` | - | `flag` | `false` | Print the parsed metadata as JSON to stdout. | +| `--raw` | - | `flag` | `false` | Print the raw XMP XML packet to stdout. | + +## `parxy tui` + +Launch the Parxy TUI for interactive parser comparison + +``` +parxy tui WORKSPACE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `WORKSPACE` | No | Path to the workspace folder (optional — can be selected inside the TUI) | + +## `parxy version` + +Print Parxy version information. + +``` +parxy version +``` diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 8ea71cb..0c97211 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -1,104 +1,104 @@ ---- -title: Configuration reference -description: Configuration options for Parxy and the drivers. Settings are read from the environment or a .env file. Run parxy env to generate a starter .env with some default. ---- - - - - -# Configuration reference - -All settings are read from environment variables or a `.env` file in your project root. - -Run `parxy env` to generate a template `.env` with usual configuration options. - -## Core settings - -Prefix: `PARXY_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_DEFAULT_DRIVER` | `pymupdf` | The default driver to use in case nothing is specified. | -| `PARXY_LOGGING_LEVEL` | `INFO` | The logging level. 
| -| `PARXY_LOGGING_FILE` | - | The log file path. | -| `PARXY_THEME` | - | The console theme to use. One of: `light`, `dark`. | - -## Observability / tracing - -Prefix: `PARXY_TRACING_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_TRACING_ENABLE` | `false` | Enable sending traces to the observability service. | -| `PARXY_TRACING_API_KEY` | *(secret)* | The authentication key (used for both traces and metrics unless overridden). | -| `PARXY_TRACING_ENDPOINT` | `http://localhost:4318/` | The base url of the Open Telemetry collector endpoint. | -| `PARXY_TRACING_ENABLE_METRICS` | `false` | Enable sending metrics to the telemetry service. | -| `PARXY_TRACING_TRACES_ENDPOINT` | *(computed)* | The endpoint for the traces exporter. | -| `PARXY_TRACING_METRICS_ENDPOINT` | *(computed)* | The endpoint for the metrics exporter. | -| `PARXY_TRACING_TIMEOUT_SECONDS` | `10` | The client timeout when sending traces. | -| `PARXY_TRACING_USE_COMPRESSION` | `true` | The client should compress traces before send. | -| `PARXY_TRACING_VERBOSE` | `true` | Log when traces are sent. | -| `PARXY_TRACING_AUTHENTICATION_HEADER` | `Authorization` | The header in which the api key needs to be included for authentication purposes. | - -## PdfAct - -Prefix: `PARXY_PDFACT_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_PDFACT_BASE_URL` | `http://localhost:4567/` | The base URL of the PdfAct API. | -| `PARXY_PDFACT_API_KEY` | *(secret)* | The authentication key. | - -## LlamaParse - -Prefix: `PARXY_LLAMAPARSE_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_LLAMAPARSE_BASE_URL` | `https://api.cloud.eu.llamaindex.ai` | The base URL of the LlamaParse API. | -| `PARXY_LLAMAPARSE_API_KEY` | *(secret)* | The authentication key. | -| `PARXY_LLAMAPARSE_ORGANIZATION_ID` | - | The organization ID for the LlamaParse API. 
| -| `PARXY_LLAMAPARSE_PROJECT_ID` | - | The project ID for the LlamaParse API. | -| `PARXY_LLAMAPARSE_TIER` | - | Parsing tier to use. One of: `fast`, `cost_effective`, `agentic`, `agentic_plus`. | -| `PARXY_LLAMAPARSE_VERSION` | `latest` | API version string. | -| `PARXY_LLAMAPARSE_PARSE_MODE` | - | Legacy parsing mode. | -| `PARXY_LLAMAPARSE_PREMIUM_MODE` | `false` | If True, selects the 'agentic_plus' tier (legacy shorthand). | -| `PARXY_LLAMAPARSE_FAST_MODE` | `false` | If True, selects the 'fast' tier (legacy shorthand). | -| `PARXY_LLAMAPARSE_DISABLE_OCR` | `false` | Disable OCR on images embedded in the document. | -| `PARXY_LLAMAPARSE_SKIP_DIAGONAL_TEXT` | `false` | Skip text rotated at an angle (e.g. | -| `PARXY_LLAMAPARSE_LANGUAGE` | `en` | Primary language for OCR (e.g. | -| `PARXY_LLAMAPARSE_DO_NOT_UNROLL_COLUMNS` | `false` | Keep multi-column layout intact instead of linearising columns into sequential text. | -| `PARXY_LLAMAPARSE_DISABLE_IMAGE_EXTRACTION` | `false` | If True, skip image extraction. | -| `PARXY_LLAMAPARSE_CONTINUOUS_MODE` | `false` | Automatically merge tables that span multiple pages. | -| `PARXY_LLAMAPARSE_TARGET_PAGES` | - | Specific pages to extract. | -| `PARXY_LLAMAPARSE_MAX_PAGES` | - | Maximum number of pages to extract. | -| `PARXY_LLAMAPARSE_DO_NOT_CACHE` | `true` | If True, bypass result caching and force re-parsing. | -| `PARXY_LLAMAPARSE_VERBOSE` | `false` | Print progress indicators during parsing. | - -## LLMWhisperer - -Prefix: `PARXY_LLMWHISPERER_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_LLMWHISPERER_BASE_URL` | `https://llmwhisperer-api.eu-west.unstract.com/api/v2` | The base URL of the LlmWhisperer API v2. | -| `PARXY_LLMWHISPERER_API_KEY` | *(secret)* | The authentication key. | -| `PARXY_LLMWHISPERER_LOGGING_LEVEL` | `INFO` | The logging level for the client. | -| `PARXY_LLMWHISPERER_MODE` | `form` | Default parsing mode. 
| - -## Landing AI - -Prefix: `PARXY_LANDINGAI_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_LANDINGAI_API_KEY` | *(secret)* | The authentication key. | -| `PARXY_LANDINGAI_ENVIRONMENT` | `eu` | The environment to use. One of: `production`, `eu`. | -| `PARXY_LANDINGAI_BASE_URL` | - | The base URL of the Landing AI ADE API. | - -## Unstructured library - -Prefix: `PARXY_UNSTRUCTURED_LOCAL_` - -| Variable | Default | Description | -|----------|---------|-------------| +--- +title: Configuration reference +description: Configuration options for Parxy and the drivers. Settings are read from the environment or a .env file. Run parxy env to generate a starter .env with some defaults. +--- + + + + +# Configuration reference + +All settings are read from environment variables or a `.env` file in your project root. + +Run `parxy env` to generate a template `.env` with the usual configuration options. + +## Core settings + +Prefix: `PARXY_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_DEFAULT_DRIVER` | `pymupdf` | The default driver to use in case nothing is specified. | +| `PARXY_LOGGING_LEVEL` | `INFO` | The logging level. | +| `PARXY_LOGGING_FILE` | - | The log file path. | +| `PARXY_THEME` | - | The console theme to use. One of: `light`, `dark`. | + +## Observability / tracing + +Prefix: `PARXY_TRACING_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_TRACING_ENABLE` | `false` | Enable sending traces to the observability service. | +| `PARXY_TRACING_API_KEY` | *(secret)* | The authentication key (used for both traces and metrics unless overridden). | +| `PARXY_TRACING_ENDPOINT` | `http://localhost:4318/` | The base URL of the OpenTelemetry collector endpoint. | +| `PARXY_TRACING_ENABLE_METRICS` | `false` | Enable sending metrics to the telemetry service. | +| `PARXY_TRACING_TRACES_ENDPOINT` | *(computed)* | The endpoint for the traces exporter. 
| +| `PARXY_TRACING_METRICS_ENDPOINT` | *(computed)* | The endpoint for the metrics exporter. | +| `PARXY_TRACING_TIMEOUT_SECONDS` | `10` | The client timeout when sending traces. | +| `PARXY_TRACING_USE_COMPRESSION` | `true` | The client should compress traces before sending. | +| `PARXY_TRACING_VERBOSE` | `true` | Log when traces are sent. | +| `PARXY_TRACING_AUTHENTICATION_HEADER` | `Authorization` | The header in which the API key needs to be included for authentication purposes. | + +## PdfAct + +Prefix: `PARXY_PDFACT_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_PDFACT_BASE_URL` | `http://localhost:4567/` | The base URL of the PdfAct API. | +| `PARXY_PDFACT_API_KEY` | *(secret)* | The authentication key. | + +## LlamaParse + +Prefix: `PARXY_LLAMAPARSE_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LLAMAPARSE_BASE_URL` | `https://api.cloud.eu.llamaindex.ai` | The base URL of the LlamaParse API. | +| `PARXY_LLAMAPARSE_API_KEY` | *(secret)* | The authentication key. | +| `PARXY_LLAMAPARSE_ORGANIZATION_ID` | - | The organization ID for the LlamaParse API. | +| `PARXY_LLAMAPARSE_PROJECT_ID` | - | The project ID for the LlamaParse API. | +| `PARXY_LLAMAPARSE_TIER` | - | Parsing tier to use. One of: `fast`, `cost_effective`, `agentic`, `agentic_plus`. | +| `PARXY_LLAMAPARSE_VERSION` | `latest` | API version string. | +| `PARXY_LLAMAPARSE_PARSE_MODE` | - | Legacy parsing mode. | +| `PARXY_LLAMAPARSE_PREMIUM_MODE` | `false` | If True, selects the 'agentic_plus' tier (legacy shorthand). | +| `PARXY_LLAMAPARSE_FAST_MODE` | `false` | If True, selects the 'fast' tier (legacy shorthand). | +| `PARXY_LLAMAPARSE_DISABLE_OCR` | `false` | Disable OCR on images embedded in the document. | +| `PARXY_LLAMAPARSE_SKIP_DIAGONAL_TEXT` | `false` | Skip text rotated at an angle. | +| `PARXY_LLAMAPARSE_LANGUAGE` | `en` | Primary language for OCR. 
| +| `PARXY_LLAMAPARSE_DO_NOT_UNROLL_COLUMNS` | `false` | Keep multi-column layout intact instead of linearising columns into sequential text. | +| `PARXY_LLAMAPARSE_DISABLE_IMAGE_EXTRACTION` | `false` | If True, skip image extraction. | +| `PARXY_LLAMAPARSE_CONTINUOUS_MODE` | `false` | Automatically merge tables that span multiple pages. | +| `PARXY_LLAMAPARSE_TARGET_PAGES` | - | Specific pages to extract. | +| `PARXY_LLAMAPARSE_MAX_PAGES` | - | Maximum number of pages to extract. | +| `PARXY_LLAMAPARSE_DO_NOT_CACHE` | `true` | If True, bypass result caching and force re-parsing. | +| `PARXY_LLAMAPARSE_VERBOSE` | `false` | Print progress indicators during parsing. | + +## LLMWhisperer + +Prefix: `PARXY_LLMWHISPERER_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LLMWHISPERER_BASE_URL` | `https://llmwhisperer-api.eu-west.unstract.com/api/v2` | The base URL of the LlmWhisperer API v2. | +| `PARXY_LLMWHISPERER_API_KEY` | *(secret)* | The authentication key. | +| `PARXY_LLMWHISPERER_LOGGING_LEVEL` | `INFO` | The logging level for the client. | +| `PARXY_LLMWHISPERER_MODE` | `form` | Default parsing mode. | + +## Landing AI + +Prefix: `PARXY_LANDINGAI_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LANDINGAI_API_KEY` | *(secret)* | The authentication key. | +| `PARXY_LANDINGAI_ENVIRONMENT` | `eu` | The environment to use. One of: `production`, `eu`. | +| `PARXY_LANDINGAI_BASE_URL` | - | The base URL of the Landing AI ADE API. 
| + +## Unstructured library + +Prefix: `PARXY_UNSTRUCTURED_LOCAL_` + +| Variable | Default | Description | +|----------|---------|-------------| diff --git a/docs/tutorials/using_cli.md b/docs/tutorials/using_cli.md index aa14f57..7dc0393 100644 --- a/docs/tutorials/using_cli.md +++ b/docs/tutorials/using_cli.md @@ -14,6 +14,9 @@ The Parxy CLI lets you: | `parxy markdown` | Convert documents to Markdown files, with support for multiple drivers and folder processing | | `parxy pdf:merge`| Merge multiple PDF files into one, with support for page ranges | | `parxy pdf:split`| Split a PDF into individual pages, with optional page range and single-file extraction | +| `parxy pdf:outline`| Print or export a PDF's outline (bookmarks / table of contents) | +| `parxy pdf:tags` | Inspect and extract the tag (structure) tree of a tagged, accessible PDF | +| `parxy pdf:xmp` | Read and extract XMP metadata from a PDF | | `parxy drivers` | List available document processing drivers | | `parxy env` | Generate a default `.env` configuration file | | `parxy docker` | Create a Docker Compose configuration for running Parxy-related services | @@ -303,6 +306,88 @@ Page range formats (1-based): `3` · `2:5` · `:5` · `3:` For more detailed examples and use cases, see the [Merge and split PDFs](../howto/merge_and_split_pdfs.md) guide. +## Inspecting PDFs + +Beyond text extraction, Parxy can inspect a PDF's structure and metadata: its outline (bookmarks), its accessibility tag tree, and its XMP metadata. Each command prints a human-readable view by default and can emit JSON with `--json` (to stdout) or `--output` (to a file). 
+ +### Outline (bookmarks) + +The `pdf:outline` command prints the table of contents as a tree: + +```bash +parxy pdf:outline document.pdf +``` + +Use `--flat` for an indented list instead of a tree, or export the structure: + +```bash +# Flat listing +parxy pdf:outline document.pdf --flat + +# Export as JSON (flat entries + nested tree) +parxy pdf:outline document.pdf -o outline.json +``` + +The command exits with code `2` when the PDF has no bookmarks, which is handy in scripts. + +### Tags (accessibility structure) + +A *tagged* PDF carries a logical structure tree (`/StructTreeRoot`) that makes it accessible. Start by checking whether a PDF is tagged: + +```bash +parxy pdf:tags-check document.pdf +``` + +This reports whether the content is marked, whether a structure tree is present, the document language, and the number of structure elements. It exits with `0` for a tagged PDF and `2` otherwise. + +Extract the tag tree itself with `pdf:tags`: + +```bash +# Print the structure tree (with page references and alt text) +parxy pdf:tags document.pdf + +# Include the visible text of each element (rebuilt per page) +parxy pdf:tags document.pdf --text + +# Export the full nested structure as JSON +parxy pdf:tags document.pdf -o tags.json +``` + +The default view walks the document-wide structure tree and shows accessibility attributes (alt text, titles, page references) but not body text, which lives in the page content streams. The `--text` view reconstructs the structure per page including each element's visible text, but without the accessibility attributes. 
+ +Two companion commands help with accessibility work: + +```bash +# Copy a tagged PDF keeping its tags but removing visible content +parxy pdf:tag-skeleton document.pdf -o tags-only.pdf + +# Create an empty tagged PDF skeleton from scratch +parxy pdf:tag-template -o template.pdf --pages 3 --lang en-US +``` + +### XMP metadata + +The `pdf:xmp` command reads the XMP metadata packet (an RDF/XML block holding properties such as `dc:title`, `dc:creator`, and `pdf:Producer`) and prints the parsed properties alongside the classic `/Info` dictionary: + +```bash +parxy pdf:xmp document.pdf +``` + +You can view the original packet or export the metadata: + +```bash +# Print the raw XMP XML packet +parxy pdf:xmp document.pdf --raw + +# Export parsed metadata as JSON +parxy pdf:xmp document.pdf --json + +# Save the raw XMP packet (a .xml path writes the raw packet, +# any other extension writes parsed JSON) +parxy pdf:xmp document.pdf -o metadata.xml +``` + + ## Managing Drivers To view the list of supported document parsing drivers: @@ -368,6 +453,9 @@ With the CLI, you can use Parxy as a **standalone document parsing tool** — id | `parxy markdown` | Generate Markdown files; accepts JSON results and supports `--page-separators` | | `parxy pdf:merge`| Merge multiple PDF files with page range support | | `parxy pdf:split`| Split PDF into individual pages; supports `--pages` and `--combine` | +| `parxy pdf:outline`| Print or export a PDF's outline (bookmarks) | +| `parxy pdf:tags` | Inspect and extract a tagged PDF's structure tree; supports `--text` | +| `parxy pdf:xmp` | Read and extract XMP metadata; supports `--raw` and JSON export | | `parxy drivers` | List supported drivers | | `parxy env` | Create default configuration file | | `parxy docker` | Generate Docker Compose setup | diff --git a/src/parxy_cli/commands/pdf.py b/src/parxy_cli/commands/pdf.py index b569417..1a8f25e 100644 --- a/src/parxy_cli/commands/pdf.py +++ b/src/parxy_cli/commands/pdf.py @@ -1,5 +1,6 @@ 
"""PDF manipulation commands.""" +import json from pathlib import Path from typing import List, Annotated, Optional @@ -181,6 +182,14 @@ def split( help='Combine extracted pages into a single PDF instead of one file per page.', ), ] = False, + every: Annotated[ + Optional[int], + typer.Option( + '--every', + '-e', + help='Split into chunks of N pages each. Cannot be used with --combine.', + ), + ] = None, ): """ Split a PDF file into individual pages. @@ -220,9 +229,24 @@ def split( # Combine with custom output path parxy pdf:split document.pdf --pages 2:5 --combine -o extracted.pdf + + # Split into chunks of 10 pages each + parxy pdf:split document.pdf --every 10 + + # Split into chunks of 5 pages, only from pages 3-20 + parxy pdf:split document.pdf --every 5 --pages 3:20 """ console.action('Split PDF file', space_after=False) + # Validate mutually exclusive options + if every is not None and combine: + console.error('--every and --combine cannot be used together.', panel=True) + raise typer.Exit(1) + + if every is not None and every < 1: + console.error('--every must be a positive integer.', panel=True) + raise typer.Exit(1) + # Validate input file input_path = Path(input_file) if not input_path.is_file(): @@ -293,7 +317,33 @@ def split( f'Extracting pages {effective_from}-{effective_to} ({extract_count} page{"s" if extract_count > 1 else ""})' ) - if combine: + if every is not None: + # Determine output directory + if output_dir is None: + output_path = input_path.parent / f'{input_path.stem}_split' + else: + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + chunk_count = (extract_count + every - 1) // every + console.info( + f'Splitting into {chunk_count} chunk{"s" if chunk_count > 1 else ""} of up to {every} page{"s" if every > 1 else ""} each' + ) + + with console.shimmer('Splitting PDF into chunks...'): + output_files = PdfService.split_pdf_by_chunk( + input_path, output_path, prefix, every, from_page, to_page + ) + + for 
output_file in output_files: + console.print(f'[faint]⎿ [/faint] Created {output_file.name}') + + console.newline() + console.success( + f'Successfully split PDF into {len(output_files)} chunk{"s" if len(output_files) > 1 else ""} in {output_path}' + ) + + elif combine: # Determine output file path if output_dir is not None: combined_output = Path(output_dir) @@ -345,3 +395,832 @@ def split( except Exception as e: console.error(f'Error during split: {str(e)}') raise typer.Exit(1) + + +@app.command( + name='pdf:split-by-text', + help='Split a PDF into chunks whenever a page matches a text condition', +) +def split_by_text( + input_file: Annotated[ + str, + typer.Argument(help='PDF file to split'), + ], + text: Annotated[ + List[str], + typer.Option( + '--text', + '-t', + help='Text to match. Can be repeated for multiple patterns (OR logic).', + ), + ] = None, + mode: Annotated[ + str, + typer.Option( + '--mode', + '-m', + help='Matching mode: "contains" (default) or "starts-with".', + ), + ] = 'contains', + ignore_case: Annotated[ + bool, + typer.Option( + '--ignore-case', + '-i', + help='Case-insensitive matching.', + ), + ] = False, + regex: Annotated[ + bool, + typer.Option( + '--regex', + help='Treat --text values as regular expressions.', + ), + ] = False, + discard_preamble: Annotated[ + bool, + typer.Option( + '--discard-preamble', + help='Discard pages that appear before the first matching page.', + ), + ] = False, + output_dir: Annotated[ + Optional[str], + typer.Option( + '--output', + '-o', + help='Output directory for chunk files (default: {stem}_split next to input).', + ), + ] = None, + prefix: Annotated[ + Optional[str], + typer.Option( + '--prefix', + '-p', + help='Prefix for output filenames. Defaults to the input filename stem.', + ), + ] = None, +): + """ + Split a PDF into chunks whenever a page matches a text condition. + + A new chunk begins at each page that satisfies the match condition. 
Pages + before the first match are included as a leading chunk unless + --discard-preamble is given. + + Matching modes: + contains - page text contains the pattern anywhere (default) + starts-with - page text starts with the pattern (after leading whitespace) + + Multiple --text values are combined with OR logic: any match triggers a split. + Use --regex to treat --text values as regular expressions. + + Output files are named: {prefix}_part_{N}_{pages}.pdf + + Examples: + + # Split whenever a page contains "Chapter" + parxy pdf:split-by-text document.pdf --text "Chapter" + + # Split on "Invoice" or "Credit Note", case-insensitive + parxy pdf:split-by-text document.pdf -t "Invoice" -t "Credit Note" -i + + # Split when a page starts with "SECTION" + parxy pdf:split-by-text document.pdf --text "SECTION" --mode starts-with + + # Split using a regex pattern (e.g. "Chapter N" headings) + parxy pdf:split-by-text document.pdf --text "^Chapter \\d+" --regex + + # Discard pages before the first match and write to a custom directory + parxy pdf:split-by-text document.pdf -t "Invoice" --discard-preamble -o ./invoices + """ + console.action('Split PDF by text condition', space_after=False) + + if not text: + console.error('At least one --text pattern is required.', panel=True) + raise typer.Exit(1) + + if mode not in ('contains', 'starts-with'): + console.error( + f'Invalid --mode "{mode}". 
Choose "contains" or "starts-with".', panel=True + ) + raise typer.Exit(1) + + input_path = Path(input_file) + if not input_path.is_file(): + console.error(f'Input file not found: {input_file}', panel=True) + raise typer.Exit(1) + + if input_path.suffix.lower() != '.pdf': + console.error(f'Input file must be a PDF: {input_file}', panel=True) + raise typer.Exit(1) + + if output_dir is None: + out_path = input_path.parent / f'{input_path.stem}_split' + else: + out_path = Path(output_dir) + + if prefix is None: + prefix = input_path.stem + + # Display matching configuration + mode_label = 'starts with' if mode == 'starts-with' else 'contains' + pattern_list = ', '.join(f'"{p}"' for p in text) + console.info(f'Patterns ({mode_label}): {pattern_list}') + if ignore_case: + console.info('Case-insensitive matching enabled') + if regex: + console.info('Regex matching enabled') + + try: + with console.shimmer('Scanning pages and splitting PDF...'): + chunks = PdfService.split_pdf_by_text( + input_path, + out_path, + prefix, + patterns=text, + mode=mode, + ignore_case=ignore_case, + use_regex=regex, + discard_before_first_match=discard_preamble, + ) + + for output_file, first_page, last_page in chunks: + page_label = ( + f'page {first_page}' + if first_page == last_page + else f'pages {first_page}-{last_page}' + ) + console.print( + f'[faint]⎿ [/faint] Created {output_file.name} ({page_label})' + ) + + console.newline() + console.success( + f'Successfully split PDF into {len(chunks)} chunk{"s" if len(chunks) > 1 else ""} in {out_path}' + ) + + except (ValueError, FileNotFoundError) as e: + console.error(f'Error during split: {str(e)}') + raise typer.Exit(1) + except Exception as e: + console.error(f'Error during split: {str(e)}') + raise typer.Exit(1) + + +def _validate_pdf_input(input_file: str) -> Path: + """Validate that input_file is an existing .pdf file and return its Path.""" + input_path = Path(input_file) + if not input_path.is_file(): + console.error(f'Input file 
not found: {input_file}', panel=True) + raise typer.Exit(1) + if input_path.suffix.lower() != '.pdf': + console.error(f'Input file must be a PDF: {input_file}', panel=True) + raise typer.Exit(1) + return input_path + + +@app.command( + name='pdf:tags-check', + help='Check whether a PDF is a tagged (accessible) PDF', +) +def tags_check( + input_file: Annotated[ + str, + typer.Argument(help='PDF file to inspect'), + ], + as_json: Annotated[ + bool, + typer.Option( + '--json', + help='Output the detection result as JSON.', + ), + ] = False, +): + """ + Check whether a PDF is a tagged (accessible) PDF. + + A PDF is considered tagged when its catalog marks the content as tagged + (/MarkInfo /Marked true) and provides a logical structure tree + (/StructTreeRoot). The command also reports the declared document language + and how many structure elements the tree contains. + + The process exits with code 0 when the PDF is tagged and 2 when it is not, + so it can be used in scripts. + + Examples: + + # Human-readable report + parxy pdf:tags-check document.pdf + + # Machine-readable output + parxy pdf:tags-check document.pdf --json + """ + input_path = _validate_pdf_input(input_file) + + try: + info = PdfService.is_tagged(input_path) + except (ValueError, FileNotFoundError) as e: + console.error(f'Error inspecting PDF: {str(e)}') + raise typer.Exit(1) + + if as_json: + # Plain stdout (not rich) so the JSON is emitted verbatim, without + # markup interpretation of brackets or width-based line wrapping. 
+ typer.echo(json.dumps(info, indent=2)) + raise typer.Exit(0 if info['tagged'] else 2) + + console.action('Check tagged PDF', space_after=False) + console.info(f'File: {input_path.name}') + + if info['tagged']: + console.success('This is a tagged PDF') + else: + console.warning('This is NOT a tagged PDF') + + marked = '✓' if info['marked'] else '✗' + struct = '✓' if info['has_struct_tree'] else '✗' + console.print(f'[faint]⎿ [/faint] Marked content (/MarkInfo): {marked}') + console.print(f'[faint]⎿ [/faint] Structure tree (/StructTreeRoot): {struct}') + console.print( + f'[faint]⎿ [/faint] Structure elements: {info["struct_element_count"]}' + ) + console.print(f'[faint]⎿ [/faint] Language (/Lang): {info["lang"] or "not set"}') + console.print(f'[faint]⎿ [/faint] Pages: {info["page_count"]}') + + raise typer.Exit(0 if info['tagged'] else 2) + + +@app.command( + name='pdf:tags', + help='Extract the tag (structure) tree of a tagged PDF', +) +def tags( + input_file: Annotated[ + str, + typer.Argument(help='PDF file to inspect'), + ], + output: Annotated[ + Optional[str], + typer.Option( + '--output', + '-o', + help='Write the extracted tags as JSON to this file instead of printing a tree.', + ), + ] = None, + as_json: Annotated[ + bool, + typer.Option( + '--json', + help='Print the extracted tags as JSON to stdout.', + ), + ] = False, + with_text: Annotated[ + bool, + typer.Option( + '--text', + help='Include the text content of each element. Rebuilds the tree ' + 'per page; accessibility attributes (alt text, page refs) are not ' + 'shown in this mode.', + ), + ] = False, +): + """ + Extract the logical structure (tag) tree of a tagged PDF. + + By default this walks the /StructTreeRoot and prints each structure + element, the page it refers to, and any alternative text, title, or + language attached to it. This view spans the whole document but does not + include body text (which lives in the page content streams). 
+ + Use --text to instead reconstruct the structure per page including the + visible text of each element (P, Strong, Span, ...). That view shows text + but not the alt-text / page-reference accessibility attributes. + + Use --json (stdout) or --output (file) to obtain the full nested structure + for further processing. + + Examples: + + # Print the tag tree (accessibility view) + parxy pdf:tags document.pdf + + # Include the text content of each element + parxy pdf:tags document.pdf --text + + # Emit JSON to stdout + parxy pdf:tags document.pdf --json + + # Save the tag tree to a file + parxy pdf:tags document.pdf -o tags.json + """ + input_path = _validate_pdf_input(input_file) + + try: + if with_text: + result = PdfService.extract_tags_with_text(input_path) + else: + result = PdfService.extract_tags(input_path) + except (ValueError, FileNotFoundError) as e: + console.error(f'Error extracting tags: {str(e)}') + raise typer.Exit(1) + + if output is not None: + output_path = Path(output) + if output_path.suffix.lower() != '.json': + output_path = output_path.with_suffix('.json') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(result, indent=2), encoding='utf-8') + console.success(f'Saved tag tree to {output_path}') + return + + if as_json: + # Plain stdout (not rich) so the JSON is emitted verbatim, without + # markup interpretation of brackets or width-based line wrapping. 
+ typer.echo(json.dumps(result, indent=2)) + return + + console.action('Extract PDF tags', space_after=False) + + if not result['tagged']: + console.warning('No structure tree found — this PDF is not tagged.', panel=True) + raise typer.Exit(2) + + from rich.tree import Tree + + if with_text: + _render_text_tags(input_path, result) + return + + def _label(node: dict) -> str: + parts = [f'[highlight]{node["type"]}[/highlight]'] + if node['page'] is not None: + parts.append(f'[faint](page {node["page"]})[/faint]') + if node['title']: + parts.append(f'[muted]"{node["title"]}"[/muted]') + if node['alt']: + parts.append(f'[cyan]alt: {node["alt"]}[/cyan]') + if node['lang']: + parts.append(f'[faint]lang={node["lang"]}[/faint]') + return ' '.join(parts) + + def _add(parent_tree, node: dict): + branch = parent_tree.add(_label(node)) + for child in node['children']: + _add(branch, child) + + tree = Tree(f'[bold]{input_path.name}[/bold]') + for root in result['roots']: + _add(tree, root) + console.console.print(tree) + + console.newline() + summary = ', '.join( + f'{count}×{tag}' for tag, count in sorted(result['tag_counts'].items()) + ) + total = sum(result['tag_counts'].values()) + console.info(f'{total} structure elements: {summary}') + + +def _render_text_tags(input_path: Path, result: dict): + """Render the per-page, text-bearing tag tree produced with --text.""" + from rich.markup import escape + from rich.tree import Tree + + def _truncate(text: str, limit: int = 100) -> str: + text = ' '.join(text.split()) + return text if len(text) <= limit else text[: limit - 1] + '…' + + def _type_label(node: dict) -> str: + label = f'[highlight]{node["type"]}[/highlight]' + if node['standard_type'] and node['standard_type'] != node['type']: + label += f' [faint]({node["standard_type"]})[/faint]' + return label + + def _add(parent_tree, node: dict): + if node['type'] is None: + # Bare text run + parent_tree.add(f'[muted]"{escape(_truncate(node["text"]))}"[/muted]') + return + 
children = node['children'] + # Inline a single text-only child next to its element type + if len(children) == 1 and children[0]['type'] is None: + text = escape(_truncate(children[0]['text'])) + parent_tree.add(f'{_type_label(node)} [muted]"{text}"[/muted]') + return + branch = parent_tree.add(_type_label(node)) + for child in children: + _add(branch, child) + + multi_page = result['page_count'] > 1 + for page in result['pages']: + title = ( + f'[bold]{input_path.name}[/bold] [faint]— page {page["page"]}[/faint]' + if multi_page + else f'[bold]{input_path.name}[/bold]' + ) + tree = Tree(title) + for root in page['roots']: + _add(tree, root) + console.console.print(tree) + + console.newline() + console.info( + f'{result["page_count"]} page{"s" if result["page_count"] > 1 else ""} ' + '(per-page reconstruction with text)' + ) + + +@app.command( + name='pdf:tag-skeleton', + help='Copy a tagged PDF keeping its tags but removing visible content', +) +def tag_skeleton( + input_file: Annotated[ + str, + typer.Argument(help='Tagged PDF file to strip'), + ], + output: Annotated[ + Optional[str], + typer.Option( + '--output', + '-o', + help='Output path for the tags-only PDF (default: {stem}_tags.pdf next to input).', + ), + ] = None, +): + """ + Copy a tagged PDF, keeping the tag tree but removing visible content. + + Every page's content streams and resources (text, images, fonts) are + removed while the page objects and the /StructTreeRoot logical structure + are preserved. The result is a lightweight document that still carries the + accessibility structure, useful for inspecting or testing the tag tree in + isolation. 
+ + Examples: + + # Strip content, write {stem}_tags.pdf next to the input + parxy pdf:tag-skeleton document.pdf + + # Strip content to a specific path + parxy pdf:tag-skeleton document.pdf -o tags-only.pdf + """ + input_path = _validate_pdf_input(input_file) + + if output is None: + output_path = input_path.parent / f'{input_path.stem}_tags.pdf' + else: + output_path = Path(output) + if output_path.suffix.lower() != '.pdf': + output_path = output_path.with_suffix('.pdf') + + console.action('Strip PDF to tags', space_after=False) + + try: + with console.shimmer('Removing content while preserving tags...'): + result = PdfService.strip_to_tags(input_path, output_path) + except (ValueError, FileNotFoundError) as e: + console.error(f'Error during strip: {str(e)}') + raise typer.Exit(1) + + if not result['tagged']: + console.warning( + 'Source PDF has no structure tree — the output will have no tags.' + ) + + saved = result['original_size'] - result['stripped_size'] + console.newline() + console.success(f'Wrote tags-only PDF to {output_path}') + console.print( + f'[faint]⎿ [/faint] Structure elements preserved: {result["struct_element_count"]}' + ) + console.print( + f'[faint]⎿ [/faint] Size: {result["original_size"]:,} → {result["stripped_size"]:,} bytes ' + f'(−{saved:,})' + ) + + +@app.command( + name='pdf:tag-template', + help='Create an empty tagged PDF skeleton for accessibility work', +) +def tag_template( + output: Annotated[ + str, + typer.Option( + '--output', + '-o', + help='Output file path for the template PDF. 
If not specified, you will be prompted.', + ), + ] = None, + pages: Annotated[ + int, + typer.Option( + '--pages', + help='Number of blank pages to create (default: 1).', + ), + ] = 1, + lang: Annotated[ + str, + typer.Option( + '--lang', + help='Document language tag set on the catalog (default: en-US).', + ), + ] = 'en-US', + title: Annotated[ + Optional[str], + typer.Option( + '--title', + help='Optional document title stored in the PDF metadata.', + ), + ] = None, +): + """ + Create an empty tagged PDF skeleton for accessibility work. + + Generates a fresh PDF with the requested number of blank pages and a valid + logical structure tree: the content is marked as tagged (/MarkInfo /Marked + true) and a /StructTreeRoot groups one paragraph (/P) structure element per + page under a /Document element. A starting point for building or testing + accessible PDFs. + + Examples: + + # Single-page template + parxy pdf:tag-template -o template.pdf + + # Three-page German template with a title + parxy pdf:tag-template -o template.pdf --pages 3 --lang de-DE --title "Report" + """ + console.action('Create tagged PDF template', space_after=False) + + if pages < 1: + console.error('--pages must be a positive integer.', panel=True) + raise typer.Exit(1) + + if output is None: + output = typer.prompt('Enter output filename or path') + + output_path = Path(output) + if output_path.suffix.lower() != '.pdf': + output_path = output_path.with_suffix('.pdf') + + try: + with console.shimmer('Building tagged PDF skeleton...'): + PdfService.create_tag_template( + output_path, pages=pages, lang=lang, title=title + ) + except (ValueError, FileNotFoundError) as e: + console.error(f'Error creating template: {str(e)}') + raise typer.Exit(1) + + console.newline() + console.success(f'Created tagged PDF template at {output_path}') + console.print( + f'[faint]⎿ [/faint] {pages} page{"s" if pages > 1 else ""}, language {lang}' + ) + + +@app.command( + name='pdf:outline', + help='Print or export 
the outline (bookmarks / table of contents) of a PDF', +) +def outline( + input_file: Annotated[ + str, + typer.Argument(help='PDF file to inspect'), + ], + output: Annotated[ + Optional[str], + typer.Option( + '--output', + '-o', + help='Write the outline as JSON to this file instead of printing a tree.', + ), + ] = None, + as_json: Annotated[ + bool, + typer.Option( + '--json', + help='Print the outline as JSON to stdout.', + ), + ] = False, + flat: Annotated[ + bool, + typer.Option( + '--flat', + help='Print a flat, indented list instead of a tree.', + ), + ] = False, +): + """ + Print or export the outline (bookmarks / table of contents) of a PDF. + + Reads the PDF's bookmark hierarchy and shows each entry with the page it + points to. By default a nested tree is rendered; use --flat for a plain + indented list, or --json / --output to obtain the structured data (both a + flat list of entries and the nested tree) for further processing. + + Examples: + + # Render the outline as a tree + parxy pdf:outline document.pdf + + # Flat, indented listing + parxy pdf:outline document.pdf --flat + + # Emit JSON to stdout + parxy pdf:outline document.pdf --json + + # Save the outline to a file + parxy pdf:outline document.pdf -o outline.json + """ + input_path = _validate_pdf_input(input_file) + + try: + result = PdfService.extract_outline(input_path) + except (ValueError, FileNotFoundError) as e: + console.error(f'Error extracting outline: {str(e)}') + raise typer.Exit(1) + + if output is not None: + output_path = Path(output) + if output_path.suffix.lower() != '.json': + output_path = output_path.with_suffix('.json') + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(result, indent=2), encoding='utf-8') + console.success(f'Saved outline to {output_path}') + return + + if as_json: + # Plain stdout (not rich) so the JSON is emitted verbatim, without + # markup interpretation of brackets or width-based line wrapping. 
+ typer.echo(json.dumps(result, indent=2)) + return + + console.action('Extract PDF outline', space_after=False) + + if not result['has_outline']: + console.warning('This PDF has no outline (no bookmarks).', panel=True) + raise typer.Exit(2) + + from rich.markup import escape + + def _page_suffix(page: Optional[int]) -> str: + return f' [faint](page {page})[/faint]' if page is not None else '' + + if flat: + for entry in result['entries']: + indent = ' ' * (entry['level'] - 1) + console.print( + f'{indent}[highlight]{escape(entry["title"])}[/highlight]' + f'{_page_suffix(entry["page"])}' + ) + else: + from rich.tree import Tree + + def _add(parent_tree, node: dict): + branch = parent_tree.add( + f'[highlight]{escape(node["title"])}[/highlight]' + f'{_page_suffix(node["page"])}' + ) + for child in node['children']: + _add(branch, child) + + tree = Tree(f'[bold]{input_path.name}[/bold]') + for node in result['tree']: + _add(tree, node) + console.console.print(tree) + + console.newline() + console.info( + f'{result["entry_count"]} bookmark' + f'{"s" if result["entry_count"] != 1 else ""} ' + f'across {result["page_count"]} page' + f'{"s" if result["page_count"] != 1 else ""}' + ) + + +@app.command( + name='pdf:xmp', + help='Read and extract the XMP metadata of a PDF', +) +def xmp( + input_file: Annotated[ + str, + typer.Argument(help='PDF file to inspect'), + ], + output: Annotated[ + Optional[str], + typer.Option( + '--output', + '-o', + help='Write the metadata to this file. A .xml extension writes the ' + 'raw XMP packet; any other extension writes parsed JSON.', + ), + ] = None, + as_json: Annotated[ + bool, + typer.Option( + '--json', + help='Print the parsed metadata as JSON to stdout.', + ), + ] = False, + raw: Annotated[ + bool, + typer.Option( + '--raw', + help='Print the raw XMP XML packet to stdout.', + ), + ] = False, +): + """ + Read and extract the XMP metadata packet of a PDF. + + XMP is an RDF/XML metadata packet embedded in the document. 
By default the + parsed properties (such as dc:title, dc:creator, pdf:Producer, + xmp:CreateDate) are printed alongside the classic /Info dictionary. Use + --raw to print the original XML packet, or --json / --output for structured + data. Writing to a path ending in .xml exports the raw packet. + + Examples: + + # Print parsed XMP properties + parxy pdf:xmp document.pdf + + # Print the raw XMP XML packet + parxy pdf:xmp document.pdf --raw + + # Emit parsed metadata as JSON + parxy pdf:xmp document.pdf --json + + # Save the raw XMP packet to a file + parxy pdf:xmp document.pdf -o metadata.xml + + # Save parsed metadata as JSON + parxy pdf:xmp document.pdf -o metadata.json + """ + input_path = _validate_pdf_input(input_file) + + try: + result = PdfService.extract_xmp_metadata(input_path) + except (ValueError, FileNotFoundError) as e: + console.error(f'Error extracting XMP metadata: {str(e)}') + raise typer.Exit(1) + + if output is not None: + output_path = Path(output) + output_path.parent.mkdir(parents=True, exist_ok=True) + if output_path.suffix.lower() == '.xml': + if not result['has_xmp']: + console.warning('This PDF has no XMP metadata packet to export.') + raise typer.Exit(2) + output_path.write_text(result['raw'], encoding='utf-8') + console.success(f'Saved raw XMP packet to {output_path}') + else: + if output_path.suffix.lower() != '.json': + output_path = output_path.with_suffix('.json') + output_path.write_text(json.dumps(result, indent=2), encoding='utf-8') + console.success(f'Saved XMP metadata to {output_path}') + return + + if as_json: + # Plain stdout (not rich) so the JSON is emitted verbatim, without + # markup interpretation of brackets or width-based line wrapping. 
+ typer.echo(json.dumps(result, indent=2)) + return + + if raw: + if not result['has_xmp']: + console.warning('This PDF has no XMP metadata packet.', panel=True) + raise typer.Exit(2) + typer.echo(result['raw']) + return + + from rich.markup import escape + + console.action('Extract PDF XMP metadata', space_after=False) + + if not result['has_xmp']: + console.warning('This PDF has no XMP metadata packet.', panel=True) + else: + if result['properties']: + for key, value in result['properties'].items(): + shown = ', '.join(value) if isinstance(value, list) else value + console.print( + f'[highlight]{escape(key)}[/highlight]: ' + f'[muted]{escape(str(shown))}[/muted]' + ) + else: + console.info('XMP packet present but no recognised properties.') + + if result['doc_info']: + console.newline() + console.info('Document info (/Info):') + for key, value in result['doc_info'].items(): + if value: + console.print( + f'[faint]⎿ [/faint] {escape(str(key))}: ' + f'[muted]{escape(str(value))}[/muted]' + ) diff --git a/src/parxy_core/services/pdf_service.py b/src/parxy_core/services/pdf_service.py index 256c31a..056c6fc 100644 --- a/src/parxy_core/services/pdf_service.py +++ b/src/parxy_core/services/pdf_service.py @@ -1,5 +1,6 @@ """PDF manipulation service using PyMuPDF.""" +import re as _re from pathlib import Path from typing import List, Optional, Dict, Any, Tuple @@ -339,6 +340,209 @@ def split_pdf( return output_files + @staticmethod + def split_pdf_by_text( + input_path: Path, + output_dir: Path, + prefix: str, + patterns: List[str], + mode: str = 'contains', + ignore_case: bool = False, + use_regex: bool = False, + discard_before_first_match: bool = False, + ) -> List[Tuple[Path, int, int]]: + """ + Split a PDF into chunks whenever a page matches one or more text patterns. + + Each matching page becomes the first page of a new chunk. Pages before + the first match form a preamble chunk unless discard_before_first_match + is True. 
+ + Args: + input_path: Path to the PDF file to split + output_dir: Directory where chunk PDFs should be saved + prefix: Prefix for output filenames + patterns: One or more text strings or regex patterns (OR logic) + mode: Matching mode — 'contains' or 'starts-with' + ignore_case: Case-insensitive matching + use_regex: Treat patterns as regular expressions + discard_before_first_match: Skip pages that appear before the first match + + Returns: + List of (output_path, first_page_1based, last_page_1based) tuples + + Raises: + FileNotFoundError: If input PDF doesn't exist + ValueError: If patterns is empty, mode is invalid, PDF is empty, + or no matching pages are found + """ + if not input_path.is_file(): + raise FileNotFoundError(f'PDF file not found: {input_path}') + + if not patterns: + raise ValueError('At least one pattern must be provided') + + if mode not in ('contains', 'starts-with'): + raise ValueError('mode must be "contains" or "starts-with"') + + flags = _re.IGNORECASE if ignore_case else 0 + compiled = [_re.compile(p, flags) for p in patterns] if use_regex else None + + def _matches(text: str) -> bool: + check = text.lower() if ignore_case and not use_regex else text + if use_regex: + for pat in compiled: + if mode == 'starts-with': + if pat.match(text.lstrip()): + return True + else: + if pat.search(text): + return True + else: + for p in patterns: + check_p = p.lower() if ignore_case else p + if mode == 'starts-with': + if check.lstrip().startswith(check_p): + return True + else: + if check_p in check: + return True + return False + + pdf = pymupdf.open(input_path) + total_pages = pdf.page_count + + if total_pages == 0: + pdf.close() + raise ValueError('PDF file is empty (no pages)') + + # Find 0-based indices of pages that trigger a new chunk + split_starts: List[int] = [] + for i in range(total_pages): + page_text = pdf[i].get_text('text') + if _matches(page_text): + split_starts.append(i) + + if not split_starts: + pdf.close() + raise 
ValueError('No pages matched the given pattern(s)') + + # Build (start, end) 0-based inclusive ranges + ranges: List[Tuple[int, int]] = [] + if not discard_before_first_match and split_starts[0] > 0: + ranges.append((0, split_starts[0] - 1)) + for idx, start in enumerate(split_starts): + end = ( + split_starts[idx + 1] - 1 + if idx + 1 < len(split_starts) + else total_pages - 1 + ) + ranges.append((start, end)) + + output_dir.mkdir(parents=True, exist_ok=True) + output_files: List[Tuple[Path, int, int]] = [] + + try: + for chunk_idx, (start, end) in enumerate(ranges): + page_label = ( + f'p{start + 1}' if start == end else f'p{start + 1}-{end + 1}' + ) + filename = f'{prefix}_part_{chunk_idx + 1:03d}_{page_label}.pdf' + output_file = output_dir / filename + output_pdf = pymupdf.open() + output_pdf.insert_pdf(pdf, from_page=start, to_page=end) + output_pdf.save(str(output_file)) + output_pdf.close() + output_files.append((output_file, start + 1, end + 1)) + finally: + pdf.close() + + return output_files + + @staticmethod + def split_pdf_by_chunk( + input_path: Path, + output_dir: Path, + prefix: str, + chunk_size: int, + from_page: Optional[int] = None, + to_page: Optional[int] = None, + ) -> List[Path]: + """ + Split a PDF into chunks of N pages each. + + Args: + input_path: Path to the PDF file to split + output_dir: Directory where chunk PDFs should be saved + prefix: Prefix for output filenames + chunk_size: Number of pages per chunk + from_page: First page to process (0-based, inclusive). None means first page. + to_page: Last page to process (0-based, inclusive). None means last page. 
+ + Returns: + List of paths to the created PDF files + + Raises: + FileNotFoundError: If input PDF doesn't exist + ValueError: If PDF is empty, chunk_size < 1, or page range is invalid + """ + if not input_path.is_file(): + raise FileNotFoundError(f'PDF file not found: {input_path}') + + if chunk_size < 1: + raise ValueError('Chunk size must be at least 1') + + pdf = pymupdf.open(input_path) + total_pages = pdf.page_count + + if total_pages == 0: + pdf.close() + raise ValueError('PDF file is empty (no pages)') + + start = from_page if from_page is not None else 0 + end = to_page if to_page is not None else total_pages - 1 + + if start < 0 or start >= total_pages: + pdf.close() + raise ValueError( + f'Invalid page range: page {start + 1} does not exist (PDF has {total_pages} pages)' + ) + + if end < 0 or end >= total_pages: + pdf.close() + raise ValueError( + f'Invalid page range: page {end + 1} does not exist (PDF has {total_pages} pages)' + ) + + if start > end: + pdf.close() + raise ValueError( + f'Invalid page range: start page {start + 1} > end page {end + 1}' + ) + + output_dir.mkdir(parents=True, exist_ok=True) + output_files = [] + + try: + chunk_start = start + while chunk_start <= end: + chunk_end = min(chunk_start + chunk_size - 1, end) + if chunk_start == chunk_end: + filename = f'{prefix}_page_{chunk_start + 1}.pdf' + else: + filename = f'{prefix}_pages_{chunk_start + 1}-{chunk_end + 1}.pdf' + output_file = output_dir / filename + output_pdf = pymupdf.open() + output_pdf.insert_pdf(pdf, from_page=chunk_start, to_page=chunk_end) + output_pdf.save(str(output_file)) + output_pdf.close() + output_files.append(output_file) + chunk_start = chunk_end + 1 + finally: + pdf.close() + + return output_files + @staticmethod def extract_pages( input_path: Path, @@ -400,6 +604,658 @@ def extract_pages( finally: pdf.close() + # ======================================================================== + # Tagged PDF / Accessibility Operations + # 
======================================================================== + + @staticmethod + def _refs_in_value(value_type: str, value: str) -> List[int]: + """ + Extract the indirect object numbers from a low-level key value. + + Handles both single references ('5 0 R') and arrays of references + ('[5 0 R 6 0 R]'). Non-reference values (ints, names, MCIDs) yield + an empty list. + + Args: + value_type: The type reported by ``xref_get_key`` (e.g. 'xref', 'array') + value: The raw value string reported by ``xref_get_key`` + + Returns: + List of object numbers referenced by the value, in order. + """ + if value_type in ('xref', 'array'): + return [int(m) for m in _re.findall(r'(\d+) 0 R', value)] + return [] + + @staticmethod + def _string_key(doc, xref: int, key: str) -> Optional[str]: + """Return a string-typed key value, or None if absent/not a string.""" + value_type, value = doc.xref_get_key(xref, key) + return value if value_type == 'string' else None + + @staticmethod + def is_tagged(input_path: Path) -> Dict[str, Any]: + """ + Inspect a PDF to determine whether it is a tagged (accessible) PDF. + + A PDF is considered tagged when its catalog declares ``/MarkInfo`` + with ``/Marked true`` and references a ``/StructTreeRoot`` describing + the logical structure tree. 
+ + Args: + input_path: Path to the PDF file + + Returns: + Dictionary with detection details: + - tagged: True when both marked and a structure tree are present + - marked: Value of /MarkInfo /Marked + - has_struct_tree: Whether /StructTreeRoot is present + - lang: Document language (/Lang) if declared, else None + - struct_element_count: Number of structure elements in the tree + - page_count: Number of pages in the document + + Raises: + FileNotFoundError: If input PDF doesn't exist + """ + if not input_path.is_file(): + raise FileNotFoundError(f'PDF file not found: {input_path}') + + doc = pymupdf.open(input_path) + try: + cat = doc.pdf_catalog() + + _, marked_val = doc.xref_get_key(cat, 'MarkInfo/Marked') + marked = marked_val == 'true' + + str_type, str_val = doc.xref_get_key(cat, 'StructTreeRoot') + has_struct_tree = str_type == 'xref' + + lang = PdfService._string_key(doc, cat, 'Lang') + + struct_element_count = 0 + if has_struct_tree: + root_xref = int(str_val.split()[0]) + k_type, k_val = doc.xref_get_key(root_xref, 'K') + visited: set = set() + stack = PdfService._refs_in_value(k_type, k_val) + while stack: + xref = stack.pop() + if xref in visited: + continue + visited.add(xref) + s_type, _ = doc.xref_get_key(xref, 'S') + if s_type != 'name': + # Not a structure element (e.g. OBJR/MCR content ref) + continue + struct_element_count += 1 + ck_type, ck_val = doc.xref_get_key(xref, 'K') + stack.extend(PdfService._refs_in_value(ck_type, ck_val)) + + return { + 'tagged': marked and has_struct_tree, + 'marked': marked, + 'has_struct_tree': has_struct_tree, + 'lang': lang, + 'struct_element_count': struct_element_count, + 'page_count': doc.page_count, + } + finally: + doc.close() + + @staticmethod + def extract_tags(input_path: Path) -> Dict[str, Any]: + """ + Extract the logical structure (tag) tree of a tagged PDF. 
+ + Walks the ``/StructTreeRoot`` and returns the hierarchy of structure + elements together with the page each element refers to (resolved from + ``/Pg``, inherited from the nearest ancestor when absent). + + Each node is a dict with keys: ``type`` (structure type without the + leading slash), ``page`` (1-based page number or None), ``alt``, + ``actual_text``, ``title``, ``lang``, and ``children`` (list of nodes). + + Args: + input_path: Path to the PDF file + + Returns: + Dictionary with: + - tagged: Whether a structure tree was found + - roots: Top-level structure nodes + - tag_counts: Mapping of structure type -> occurrence count + - page_count: Number of pages in the document + + Raises: + FileNotFoundError: If input PDF doesn't exist + """ + if not input_path.is_file(): + raise FileNotFoundError(f'PDF file not found: {input_path}') + + doc = pymupdf.open(input_path) + try: + cat = doc.pdf_catalog() + str_type, str_val = doc.xref_get_key(cat, 'StructTreeRoot') + + result: Dict[str, Any] = { + 'tagged': str_type == 'xref', + 'roots': [], + 'tag_counts': {}, + 'page_count': doc.page_count, + } + + if str_type != 'xref': + return result + + # Map page object numbers to their 1-based page index + page_map = {doc[i].xref: i + 1 for i in range(doc.page_count)} + tag_counts: Dict[str, int] = {} + visited: set = set() + + def _walk(xref: int, inherited_page: Optional[int]): + if xref in visited: + return None + visited.add(xref) + + s_type, s_val = doc.xref_get_key(xref, 'S') + if s_type != 'name': + return None # content reference, not a structure element + structure_type = s_val.lstrip('/') + + pg_type, pg_val = doc.xref_get_key(xref, 'Pg') + page = inherited_page + if pg_type == 'xref': + page = page_map.get(int(pg_val.split()[0]), inherited_page) + + tag_counts[structure_type] = tag_counts.get(structure_type, 0) + 1 + + children = [] + k_type, k_val = doc.xref_get_key(xref, 'K') + for child_xref in PdfService._refs_in_value(k_type, k_val): + child = 
_walk(child_xref, page) + if child is not None: + children.append(child) + + return { + 'type': structure_type, + 'page': page, + 'alt': PdfService._string_key(doc, xref, 'Alt'), + 'actual_text': PdfService._string_key(doc, xref, 'ActualText'), + 'title': PdfService._string_key(doc, xref, 'T'), + 'lang': PdfService._string_key(doc, xref, 'Lang'), + 'children': children, + } + + root_xref = int(str_val.split()[0]) + rk_type, rk_val = doc.xref_get_key(root_xref, 'K') + for child_xref in PdfService._refs_in_value(rk_type, rk_val): + node = _walk(child_xref, None) + if node is not None: + result['roots'].append(node) + + result['tag_counts'] = tag_counts + return result + finally: + doc.close() + + @staticmethod + def _struct_block_to_node(block: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """ + Convert a TEXT_COLLECT_STRUCTURE block into a structure node with text. + + Block types reported by PyMuPDF: 2 = structure element (with ``std`` + / ``raw`` type names and nested ``blocks``), 0 = text leaf (with + ``lines``/``spans``), 1 = image. Text leaves become nodes with + ``type`` None and their concatenated text; structure elements carry + their type names and child nodes. 
+ """ + block_type = block.get('type') + + if block_type == 2: # structure element + node = { + 'type': block.get('raw'), + 'standard_type': block.get('std'), + 'text': None, + 'children': [], + } + for child in block.get('blocks', []): + child_node = PdfService._struct_block_to_node(child) + if child_node is not None: + node['children'].append(child_node) + return node + + if block_type == 0: # text leaf + lines = [] + for line in block.get('lines', []): + line_text = ''.join( + span.get('text', '') for span in line.get('spans', []) + ) + if line_text: + lines.append(line_text) + return { + 'type': None, + 'standard_type': None, + 'text': ' '.join(lines), + 'children': [], + } + + if block_type == 1: # image + return { + 'type': 'Image', + 'standard_type': None, + 'text': None, + 'children': [], + } + + return None + + @staticmethod + def extract_tags_with_text(input_path: Path) -> Dict[str, Any]: + """ + Extract the structure tree together with the text content of each tag. + + Unlike :meth:`extract_tags` (which walks the document-wide + ``/StructTreeRoot`` and exposes accessibility attributes such as + ``/Alt`` but no body text), this method reconstructs the structure + **per page** using PyMuPDF's ``TEXT_COLLECT_STRUCTURE`` text + extraction, so the visible text of ``P``, ``Strong``, ``Span`` and + other elements is included. + + Each node is a dict with: ``type`` (raw structure type, or None for a + bare text run), ``standard_type`` (the standardised type, e.g. a + ``Strong`` maps to ``Span``), ``text`` (text content for leaves), and + ``children``. 
+ + Args: + input_path: Path to the PDF file + + Returns: + Dictionary with: + - tagged: Whether the document declares a structure tree + - page_count: Number of pages + - pages: List of {'page': 1-based index, 'roots': [nodes]} + + Raises: + FileNotFoundError: If input PDF doesn't exist + """ + if not input_path.is_file(): + raise FileNotFoundError(f'PDF file not found: {input_path}') + + info = PdfService.is_tagged(input_path) + + doc = pymupdf.open(input_path) + try: + pages = [] + for i in range(doc.page_count): + data = doc[i].get_text('dict', flags=pymupdf.TEXT_COLLECT_STRUCTURE) + roots = [] + for block in data.get('blocks', []): + node = PdfService._struct_block_to_node(block) + if node is not None: + roots.append(node) + pages.append({'page': i + 1, 'roots': roots}) + + return { + 'tagged': info['tagged'], + 'page_count': doc.page_count, + 'pages': pages, + } + finally: + doc.close() + + @staticmethod + def create_tag_template( + output_path: Path, + pages: int = 1, + lang: str = 'en-US', + title: Optional[str] = None, + ) -> None: + """ + Create a minimal, empty tagged PDF skeleton for accessibility work. + + The generated document has the requested number of blank pages and a + valid logical structure tree: ``/MarkInfo /Marked true``, a + ``/StructTreeRoot`` containing a ``/Document`` element that groups one + ``/P`` (paragraph) structure element per page. 
+ + Args: + output_path: Path where the template PDF should be saved + pages: Number of blank pages to create (default: 1) + lang: Document language tag set on the catalog (default: 'en-US') + title: Optional document title stored in the PDF metadata + + Raises: + ValueError: If pages < 1 + """ + if pages < 1: + raise ValueError('A tagged template must have at least one page') + + doc = pymupdf.open() + try: + for _ in range(pages): + doc.new_page() + + cat = doc.pdf_catalog() + doc.xref_set_key(cat, 'MarkInfo/Marked', 'true') + if lang: + doc.xref_set_key(cat, 'Lang', f'({lang})') + + struct_root = doc.get_new_xref() + document_elem = doc.get_new_xref() + + paragraph_refs = [] + for i in range(pages): + para = doc.get_new_xref() + page_xref = doc[i].xref + doc.update_object( + para, + f'<< /Type /StructElem /S /P /P {document_elem} 0 R ' + f'/Pg {page_xref} 0 R >>', + ) + paragraph_refs.append(f'{para} 0 R') + + kids = ' '.join(paragraph_refs) + doc.update_object( + document_elem, + f'<< /Type /StructElem /S /Document /P {struct_root} 0 R ' + f'/K [{kids}] >>', + ) + doc.update_object( + struct_root, + f'<< /Type /StructTreeRoot /K [{document_elem} 0 R] >>', + ) + doc.xref_set_key(cat, 'StructTreeRoot', f'{struct_root} 0 R') + + if title: + doc.set_metadata({'title': title}) + + output_path.parent.mkdir(parents=True, exist_ok=True) + doc.save(str(output_path)) + finally: + doc.close() + + @staticmethod + def strip_to_tags(input_path: Path, output_path: Path) -> Dict[str, Any]: + """ + Produce a copy of a tagged PDF that keeps the tag tree but no content. + + Every page's content stream is emptied so the visible text and images + are removed, while the page objects and the ``/StructTreeRoot`` logical + structure are preserved. Unused images and fonts are garbage-collected + on save, yielding a lightweight artifact that still carries the + accessibility structure for inspection and testing. 
+ + Args: + input_path: Path to the source tagged PDF + output_path: Path where the stripped PDF should be saved + + Returns: + Dictionary with: + - tagged: Whether the source had a structure tree + - struct_element_count: Structure elements preserved + - page_count: Number of pages + - original_size: Source file size in bytes + - stripped_size: Output file size in bytes + + Raises: + FileNotFoundError: If input PDF doesn't exist + """ + if not input_path.is_file(): + raise FileNotFoundError(f'PDF file not found: {input_path}') + + info = PdfService.is_tagged(input_path) + original_size = input_path.stat().st_size + + doc = pymupdf.open(input_path) + try: + for page in doc: + for content_xref in page.get_contents(): + doc.update_stream(content_xref, b' ') + # Drop the page's resources (images, fonts, xobjects) so the + # now-unused objects can be garbage-collected on save. + doc.xref_set_key(page.xref, 'Resources', '<<>>') + + output_path.parent.mkdir(parents=True, exist_ok=True) + doc.save(str(output_path), garbage=4, deflate=True) + finally: + doc.close() + + return { + 'tagged': info['has_struct_tree'], + 'struct_element_count': info['struct_element_count'], + 'page_count': info['page_count'], + 'original_size': original_size, + 'stripped_size': output_path.stat().st_size, + } + + # ======================================================================== + # Outline / Bookmarks Operations + # ======================================================================== + + @staticmethod + def extract_outline(input_path: Path) -> Dict[str, Any]: + """ + Extract the document outline (bookmarks / table of contents) of a PDF. + + Reads the PDF's ``/Outlines`` hierarchy via PyMuPDF's ``get_toc`` and + returns both a flat list of entries (one per bookmark, with its nesting + level) and a nested tree mirroring the bookmark hierarchy. 
+ + Each tree node is a dict with: ``title`` (bookmark label), ``page`` + (1-based target page, or None when the destination has no page), + ``level`` (1-based nesting depth), and ``children`` (list of nodes). + Flat entries carry ``title``, ``page`` and ``level``. + + Args: + input_path: Path to the PDF file + + Returns: + Dictionary with: + - has_outline: Whether the PDF defines any bookmarks + - entries: Flat list of {'title', 'page', 'level'} + - tree: Nested list of top-level outline nodes + - entry_count: Total number of bookmarks + - page_count: Number of pages in the document + + Raises: + FileNotFoundError: If input PDF doesn't exist + """ + if not input_path.is_file(): + raise FileNotFoundError(f'PDF file not found: {input_path}') + + doc = pymupdf.open(input_path) + try: + # toc rows are [level (1-based), title, page (1-based, -1 if none)] + toc = doc.get_toc(simple=True) + + entries: List[Dict[str, Any]] = [] + tree: List[Dict[str, Any]] = [] + # Stack of (level, node) used to attach each entry to its parent + stack: List[Tuple[int, Dict[str, Any]]] = [] + + for level, title, page in toc: + page_number = page if page and page > 0 else None + entries.append( + {'title': title, 'page': page_number, 'level': level} + ) + + node = { + 'title': title, + 'page': page_number, + 'level': level, + 'children': [], + } + + # Pop deeper-or-equal levels so the top of the stack is the parent + while stack and stack[-1][0] >= level: + stack.pop() + + if stack: + stack[-1][1]['children'].append(node) + else: + tree.append(node) + + stack.append((level, node)) + + return { + 'has_outline': len(entries) > 0, + 'entries': entries, + 'tree': tree, + 'entry_count': len(entries), + 'page_count': doc.page_count, + } + finally: + doc.close() + + # ======================================================================== + # XMP Metadata Operations + # ======================================================================== + + _RDF_NS = 
'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + + # Fallback prefixes for well-known XMP namespaces, used when the packet + # does not declare a prefix for a namespace URI. + _XMP_KNOWN_NS = { + 'http://purl.org/dc/elements/1.1/': 'dc', + 'http://ns.adobe.com/pdf/1.3/': 'pdf', + 'http://ns.adobe.com/xap/1.0/': 'xmp', + 'http://ns.adobe.com/xap/1.0/mm/': 'xmpMM', + 'http://ns.adobe.com/xap/1.0/g/': 'xmpG', + 'http://ns.adobe.com/xap/1.0/sType/ResourceRef#': 'stRef', + 'http://ns.adobe.com/xap/1.0/rights/': 'xmpRights', + 'http://ns.adobe.com/pdfx/1.3/': 'pdfx', + 'http://www.aiim.org/pdfa/ns/id/': 'pdfaid', + 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', + } + + @classmethod + def _qname_to_prefixed(cls, tag: str, ns_map: Dict[str, str]) -> str: + """Convert an ElementTree ``{uri}local`` tag to ``prefix:local``.""" + if not tag.startswith('{'): + return tag + uri, _, local = tag[1:].partition('}') + prefix = ns_map.get(uri) or cls._XMP_KNOWN_NS.get(uri) + return f'{prefix}:{local}' if prefix else local + + @classmethod + def _xmp_property_value(cls, element): + """ + Extract the value of an XMP property element. + + Handles RDF container forms (``rdf:Alt`` / ``rdf:Seq`` / ``rdf:Bag``) + by collecting their ``rdf:li`` items, and simple text values. An ``Alt`` + with a single language alternative collapses to a plain string. + """ + rdf = '{' + cls._RDF_NS + '}' + + # Look for an RDF container child + for container in ('Alt', 'Seq', 'Bag'): + holder = element.find(rdf + container) + if holder is not None: + items = [ + (li.text or '').strip() + for li in holder.findall(rdf + 'li') + ] + items = [i for i in items if i] + if container == 'Alt' and len(items) == 1: + return items[0] + return items + + text = (element.text or '').strip() + return text or None + + @classmethod + def extract_xmp_metadata(cls, input_path: Path) -> Dict[str, Any]: + """ + Read and extract the XMP metadata packet of a PDF. 
+ + XMP (Extensible Metadata Platform) is an RDF/XML packet stored in the + document catalog (``/Metadata``). This method returns the raw packet + as well as a parsed mapping of its properties (e.g. ``dc:title``, + ``dc:creator``, ``pdf:Producer``, ``xmp:CreateDate``). Properties using + RDF containers are returned as lists; single-valued properties as + strings. The classic ``/Info`` dictionary is also returned for + comparison. + + Args: + input_path: Path to the PDF file + + Returns: + Dictionary with: + - has_xmp: Whether an XMP packet is present + - properties: Mapping of ``prefix:name`` -> value (str or list) + - raw: The raw XMP XML packet (or None) + - doc_info: The classic /Info metadata dictionary + - page_count: Number of pages in the document + + Raises: + FileNotFoundError: If input PDF doesn't exist + """ + if not input_path.is_file(): + raise FileNotFoundError(f'PDF file not found: {input_path}') + + import io + import xml.etree.ElementTree as ET + + doc = pymupdf.open(input_path) + try: + raw = doc.get_xml_metadata() or '' + doc_info = dict(doc.metadata or {}) + page_count = doc.page_count + finally: + doc.close() + + raw = raw.strip() + result: Dict[str, Any] = { + 'has_xmp': bool(raw), + 'properties': {}, + 'raw': raw or None, + 'doc_info': doc_info, + 'page_count': page_count, + } + + if not raw: + return result + + # Capture the prefix/URI declarations so original prefixes are kept. + ns_map: Dict[str, str] = {} + try: + for _event, (prefix, uri) in ET.iterparse( + io.StringIO(raw), events=('start-ns',) + ): + ns_map.setdefault(uri, prefix) + except ET.ParseError: + ns_map = {} + + try: + root = ET.fromstring(raw) + except ET.ParseError: + # Packet present but unparseable — still expose the raw bytes. 
+ return result + + rdf = '{' + cls._RDF_NS + '}' + properties: Dict[str, Any] = {} + + for desc in root.iter(rdf + 'Description'): + # Compact form: properties carried as attributes + for attr_name, attr_val in desc.attrib.items(): + if attr_name.startswith(rdf) or attr_name == 'about': + continue + key = cls._qname_to_prefixed(attr_name, ns_map) + value = attr_val.strip() + if value: + properties.setdefault(key, value) + + # Element form: each child is a property + for prop in desc: + key = cls._qname_to_prefixed(prop.tag, ns_map) + value = cls._xmp_property_value(prop) + if value not in (None, '', []): + properties.setdefault(key, value) + + result['properties'] = properties + return result + @staticmethod def optimize_pdf( input_path: Path, diff --git a/tests/commands/test_pdf.py b/tests/commands/test_pdf.py index 3e3a3f7..bf36a3e 100644 --- a/tests/commands/test_pdf.py +++ b/tests/commands/test_pdf.py @@ -763,3 +763,332 @@ def test_split_combine_does_not_create_split_directory(self, runner, sample_pdfs assert result.exit_code == 0 unwanted_dir = sample_pdfs['pdf1'].parent / 'doc1_split' assert not unwanted_dir.exists() + + +@pytest.fixture +def untagged_pdf(tmp_path): + """Create a plain (untagged) PDF with content.""" + pdf_path = tmp_path / 'untagged.pdf' + doc = pymupdf.open() + page = doc.new_page(width=612, height=792) + page.insert_text((72, 72), 'Untagged content') + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +@pytest.fixture +def tagged_pdf(tmp_path): + """Create a tagged PDF skeleton via the tag-template command.""" + from parxy_core.services.pdf_service import PdfService + + pdf_path = tmp_path / 'tagged.pdf' + PdfService.create_tag_template(pdf_path, pages=2, lang='en-US') + return pdf_path + + +@pytest.fixture +def tagged_pdf_with_text(tmp_path): + """Create a tagged PDF with real, structure-bearing text content.""" + pdf_path = tmp_path / 'tagged_text.pdf' + doc = pymupdf.open() + page = doc.new_page(width=612, height=792) + 
page.insert_htmlbox( + pymupdf.Rect(36, 36, 576, 756), + '

Hello Heading

A paragraph of body text.

', + ) + cat = doc.pdf_catalog() + doc.xref_set_key(cat, 'MarkInfo/Marked', 'true') + st = doc.get_new_xref() + doc.update_object(st, '<< /Type /StructTreeRoot /K [] >>') + doc.xref_set_key(cat, 'StructTreeRoot', f'{st} 0 R') + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +class TestTagTemplateCommand: + """Tests for the pdf:tag-template command.""" + + def test_creates_template(self, runner, tmp_path): + from parxy_core.services.pdf_service import PdfService + + out = tmp_path / 'template.pdf' + result = runner.invoke( + app, + ['pdf:tag-template', '-o', str(out), '--pages', '3', '--lang', 'de-DE'], + ) + + assert result.exit_code == 0 + assert out.exists() + info = PdfService.is_tagged(out) + assert info['tagged'] is True + assert info['page_count'] == 3 + assert info['lang'] == 'de-DE' + + def test_adds_pdf_extension(self, runner, tmp_path): + out = tmp_path / 'template' + result = runner.invoke(app, ['pdf:tag-template', '-o', str(out)]) + assert result.exit_code == 0 + assert (tmp_path / 'template.pdf').exists() + + def test_rejects_zero_pages(self, runner, tmp_path): + result = runner.invoke( + app, ['pdf:tag-template', '-o', str(tmp_path / 'x.pdf'), '--pages', '0'] + ) + assert result.exit_code == 1 + + +class TestTagsCheckCommand: + """Tests for the pdf:tags-check command.""" + + def test_tagged_exits_zero(self, runner, tagged_pdf): + result = runner.invoke(app, ['pdf:tags-check', str(tagged_pdf)]) + assert result.exit_code == 0 + + def test_untagged_exits_two(self, runner, untagged_pdf): + result = runner.invoke(app, ['pdf:tags-check', str(untagged_pdf)]) + assert result.exit_code == 2 + + def test_json_output(self, runner, tagged_pdf): + import json + + result = runner.invoke(app, ['pdf:tags-check', str(tagged_pdf), '--json']) + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data['tagged'] is True + assert data['page_count'] == 2 + + def test_missing_file(self, runner, tmp_path): + result = runner.invoke(app, 
['pdf:tags-check', str(tmp_path / 'nope.pdf')]) + assert result.exit_code == 1 + + +class TestTagsCommand: + """Tests for the pdf:tags command.""" + + def test_prints_tree(self, runner, tagged_pdf): + result = runner.invoke(app, ['pdf:tags', str(tagged_pdf)]) + assert result.exit_code == 0 + assert 'Document' in result.stdout + + def test_untagged_exits_two(self, runner, untagged_pdf): + result = runner.invoke(app, ['pdf:tags', str(untagged_pdf)]) + assert result.exit_code == 2 + + def test_json_to_file(self, runner, tagged_pdf, tmp_path): + import json + + out = tmp_path / 'tags.json' + result = runner.invoke(app, ['pdf:tags', str(tagged_pdf), '-o', str(out)]) + assert result.exit_code == 0 + assert out.exists() + data = json.loads(out.read_text(encoding='utf-8')) + assert data['tagged'] is True + assert data['tag_counts'] == {'Document': 1, 'P': 2} + + def test_text_flag_includes_content(self, runner, tagged_pdf_with_text): + import json + + result = runner.invoke( + app, ['pdf:tags', str(tagged_pdf_with_text), '--text', '--json'] + ) + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert 'pages' in data + assert 'tag_counts' not in data # per-page text view, not the xref walk + assert 'Hello Heading' in result.stdout + + +class TestTagSkeletonCommand: + """Tests for the pdf:tag-skeleton command.""" + + def test_strips_content_keeps_tags(self, runner, tagged_pdf, tmp_path): + from parxy_core.services.pdf_service import PdfService + + out = tmp_path / 'skeleton.pdf' + result = runner.invoke( + app, ['pdf:tag-skeleton', str(tagged_pdf), '-o', str(out)] + ) + assert result.exit_code == 0 + assert out.exists() + + info = PdfService.is_tagged(out) + assert info['tagged'] is True + assert info['struct_element_count'] == 3 # 1 Document + 2 P + + def test_default_output_path(self, runner, tagged_pdf): + result = runner.invoke(app, ['pdf:tag-skeleton', str(tagged_pdf)]) + assert result.exit_code == 0 + expected = tagged_pdf.parent / 
f'{tagged_pdf.stem}_tags.pdf' + assert expected.exists() + + def test_missing_file(self, runner, tmp_path): + result = runner.invoke(app, ['pdf:tag-skeleton', str(tmp_path / 'nope.pdf')]) + assert result.exit_code == 1 + + +@pytest.fixture +def pdf_with_outline(tmp_path): + """Create a PDF with a nested bookmark hierarchy.""" + pdf_path = tmp_path / 'outline.pdf' + doc = pymupdf.open() + for _ in range(5): + doc.new_page() + doc.set_toc( + [ + [1, 'Chapter 1', 1], + [2, 'Section 1.1', 2], + [1, 'Chapter 2', 4], + ] + ) + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +@pytest.fixture +def pdf_without_outline(tmp_path): + """Create a PDF with no bookmarks.""" + pdf_path = tmp_path / 'no-outline.pdf' + doc = pymupdf.open() + doc.new_page() + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +class TestOutlineCommand: + """Tests for the pdf:outline command.""" + + def test_prints_tree(self, runner, pdf_with_outline): + result = runner.invoke(app, ['pdf:outline', str(pdf_with_outline)]) + assert result.exit_code == 0 + assert 'Chapter 1' in result.stdout + assert 'Section 1.1' in result.stdout + assert 'Chapter 2' in result.stdout + + def test_flat_listing(self, runner, pdf_with_outline): + result = runner.invoke(app, ['pdf:outline', str(pdf_with_outline), '--flat']) + assert result.exit_code == 0 + assert 'Chapter 1' in result.stdout + + def test_no_outline_exits_two(self, runner, pdf_without_outline): + result = runner.invoke(app, ['pdf:outline', str(pdf_without_outline)]) + assert result.exit_code == 2 + + def test_json_to_stdout(self, runner, pdf_with_outline): + import json + + result = runner.invoke(app, ['pdf:outline', str(pdf_with_outline), '--json']) + assert result.exit_code == 0 + data = json.loads(result.stdout) + assert data['has_outline'] is True + assert data['entry_count'] == 3 + assert len(data['tree']) == 2 + assert data['tree'][0]['children'][0]['title'] == 'Section 1.1' + + def test_json_to_file(self, runner, pdf_with_outline, 
tmp_path): + import json + + out = tmp_path / 'outline.json' + result = runner.invoke( + app, ['pdf:outline', str(pdf_with_outline), '-o', str(out)] + ) + assert result.exit_code == 0 + assert out.exists() + data = json.loads(out.read_text(encoding='utf-8')) + assert data['entry_count'] == 3 + + def test_missing_file(self, runner, tmp_path): + result = runner.invoke(app, ['pdf:outline', str(tmp_path / 'nope.pdf')]) + assert result.exit_code == 1 + + +_SAMPLE_XMP = """ + + + + + + A Sample Title + + + + + +""" + + +@pytest.fixture +def pdf_with_xmp(tmp_path): + """Create a PDF carrying an XMP metadata packet.""" + pdf_path = tmp_path / 'with-xmp.pdf' + doc = pymupdf.open() + doc.new_page() + doc.set_xml_metadata(_SAMPLE_XMP) + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +@pytest.fixture +def pdf_without_xmp(tmp_path): + """Create a PDF with no XMP packet.""" + pdf_path = tmp_path / 'no-xmp.pdf' + doc = pymupdf.open() + doc.new_page() + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +class TestXmpCommand: + """Tests for the pdf:xmp command.""" + + def test_prints_properties(self, runner, pdf_with_xmp): + result = runner.invoke(app, ['pdf:xmp', str(pdf_with_xmp)]) + assert result.exit_code == 0 + assert 'dc:title' in result.stdout + assert 'A Sample Title' in result.stdout + + def test_raw_output(self, runner, pdf_with_xmp): + result = runner.invoke(app, ['pdf:xmp', str(pdf_with_xmp), '--raw']) + assert result.exit_code == 0 + assert 'Hello Heading

A paragraph of body text.

', + ) + # Mark the document as tagged so is_tagged() reports tagged=True. + cat = doc.pdf_catalog() + doc.xref_set_key(cat, 'MarkInfo/Marked', 'true') + st = doc.get_new_xref() + doc.update_object(st, '<< /Type /StructTreeRoot /K [] >>') + doc.xref_set_key(cat, 'StructTreeRoot', f'{st} 0 R') + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +class TestExtractTagsWithText: + """Tests for PdfService.extract_tags_with_text.""" + + def _all_text(self, pages): + """Flatten all node text across all pages.""" + collected = [] + + def walk(node): + if node.get('text'): + collected.append(node['text']) + for child in node['children']: + walk(child) + + for page in pages: + for root in page['roots']: + walk(root) + return ' '.join(collected) + + def test_returns_per_page_structure(self, tagged_pdf_with_text): + result = PdfService.extract_tags_with_text(tagged_pdf_with_text) + assert result['page_count'] == 1 + assert len(result['pages']) == 1 + assert result['pages'][0]['page'] == 1 + assert result['pages'][0]['roots'] # non-empty + + def test_includes_text_content(self, tagged_pdf_with_text): + result = PdfService.extract_tags_with_text(tagged_pdf_with_text) + text = self._all_text(result['pages']) + assert 'Hello Heading' in text + assert 'A paragraph of body text.' 
in text + + def test_node_shape(self, tagged_pdf_with_text): + result = PdfService.extract_tags_with_text(tagged_pdf_with_text) + root = result['pages'][0]['roots'][0] + assert set(root.keys()) == {'type', 'standard_type', 'text', 'children'} + + def test_file_not_found(self, tmp_path): + with pytest.raises(FileNotFoundError): + PdfService.extract_tags_with_text(tmp_path / 'missing.pdf') + + +class TestStripToTags: + """Tests for PdfService.strip_to_tags.""" + + def test_removes_visible_content(self, untagged_pdf, tmp_path): + out = tmp_path / 'stripped.pdf' + PdfService.strip_to_tags(untagged_pdf, out) + + doc = pymupdf.open(out) + try: + assert doc.page_count == 2 + for page in doc: + assert page.get_text().strip() == '' + finally: + doc.close() + + def test_preserves_tags(self, tagged_pdf, tmp_path): + out = tmp_path / 'stripped.pdf' + result = PdfService.strip_to_tags(tagged_pdf, out) + + assert result['tagged'] is True + assert result['struct_element_count'] == 4 + + # Structure tree survives the round-trip + info = PdfService.is_tagged(out) + assert info['tagged'] is True + assert info['struct_element_count'] == 4 + + def test_reports_sizes(self, tagged_pdf, tmp_path): + out = tmp_path / 'stripped.pdf' + result = PdfService.strip_to_tags(tagged_pdf, out) + assert result['original_size'] == tagged_pdf.stat().st_size + assert result['stripped_size'] == out.stat().st_size + + def test_creates_parent_directory(self, untagged_pdf, tmp_path): + out = tmp_path / 'nested' / 'stripped.pdf' + PdfService.strip_to_tags(untagged_pdf, out) + assert out.exists() + + def test_file_not_found(self, tmp_path): + with pytest.raises(FileNotFoundError): + PdfService.strip_to_tags(tmp_path / 'missing.pdf', tmp_path / 'o.pdf') + + +@pytest.fixture +def pdf_without_outline(tmp_path): + """Create a PDF with pages but no bookmarks.""" + pdf_path = tmp_path / 'no-outline.pdf' + doc = pymupdf.open() + for _ in range(3): + doc.new_page() + doc.save(str(pdf_path)) + doc.close() + return 
pdf_path + + +@pytest.fixture +def pdf_with_outline(tmp_path): + """Create a PDF with a nested bookmark hierarchy.""" + pdf_path = tmp_path / 'outline.pdf' + doc = pymupdf.open() + for _ in range(5): + doc.new_page() + # [level (1-based), title, page (1-based)] + doc.set_toc( + [ + [1, 'Chapter 1', 1], + [2, 'Section 1.1', 2], + [2, 'Section 1.2', 3], + [1, 'Chapter 2', 4], + [2, 'Section 2.1', 5], + ] + ) + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +class TestExtractOutline: + """Tests for PdfService.extract_outline.""" + + def test_no_outline(self, pdf_without_outline): + result = PdfService.extract_outline(pdf_without_outline) + assert result['has_outline'] is False + assert result['entries'] == [] + assert result['tree'] == [] + assert result['entry_count'] == 0 + assert result['page_count'] == 3 + + def test_flat_entries(self, pdf_with_outline): + result = PdfService.extract_outline(pdf_with_outline) + assert result['has_outline'] is True + assert result['entry_count'] == 5 + titles = [e['title'] for e in result['entries']] + assert titles == [ + 'Chapter 1', + 'Section 1.1', + 'Section 1.2', + 'Chapter 2', + 'Section 2.1', + ] + assert result['entries'][0] == { + 'title': 'Chapter 1', + 'page': 1, + 'level': 1, + } + + def test_nested_tree(self, pdf_with_outline): + result = PdfService.extract_outline(pdf_with_outline) + tree = result['tree'] + assert len(tree) == 2 + + chapter1 = tree[0] + assert chapter1['title'] == 'Chapter 1' + assert chapter1['page'] == 1 + assert [c['title'] for c in chapter1['children']] == [ + 'Section 1.1', + 'Section 1.2', + ] + + chapter2 = tree[1] + assert chapter2['title'] == 'Chapter 2' + assert [c['title'] for c in chapter2['children']] == ['Section 2.1'] + + def test_file_not_found(self, tmp_path): + with pytest.raises(FileNotFoundError): + PdfService.extract_outline(tmp_path / 'missing.pdf') + + +# Minimal XMP packet exercising simple text, an Alt (title) and a Seq (creator). 
+_SAMPLE_XMP = """ + + + + + + A Sample Title + + + + + Alice + Bob + + + 2024-01-01T00:00:00Z + + + +""" + + +@pytest.fixture +def pdf_with_xmp(tmp_path): + """Create a PDF carrying an XMP metadata packet.""" + pdf_path = tmp_path / 'with-xmp.pdf' + doc = pymupdf.open() + doc.new_page() + doc.set_xml_metadata(_SAMPLE_XMP) + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +@pytest.fixture +def pdf_without_xmp(tmp_path): + """Create a PDF with no XMP packet.""" + pdf_path = tmp_path / 'no-xmp.pdf' + doc = pymupdf.open() + doc.new_page() + doc.save(str(pdf_path)) + doc.close() + return pdf_path + + +class TestExtractXmpMetadata: + """Tests for PdfService.extract_xmp_metadata.""" + + def test_no_xmp(self, pdf_without_xmp): + result = PdfService.extract_xmp_metadata(pdf_without_xmp) + assert result['has_xmp'] is False + assert result['raw'] is None + assert result['properties'] == {} + assert result['page_count'] == 1 + + def test_has_xmp_and_raw(self, pdf_with_xmp): + result = PdfService.extract_xmp_metadata(pdf_with_xmp) + assert result['has_xmp'] is True + assert result['raw'] is not None + assert 'A Sample Title' in result['raw'] + + def test_simple_alt_collapses_to_string(self, pdf_with_xmp): + props = PdfService.extract_xmp_metadata(pdf_with_xmp)['properties'] + assert props['dc:title'] == 'A Sample Title' + + def test_seq_returns_list(self, pdf_with_xmp): + props = PdfService.extract_xmp_metadata(pdf_with_xmp)['properties'] + assert props['dc:creator'] == ['Alice', 'Bob'] + + def test_attribute_form_property(self, pdf_with_xmp): + props = PdfService.extract_xmp_metadata(pdf_with_xmp)['properties'] + assert props['pdf:Producer'] == 'Test Producer' + + def test_simple_text_property(self, pdf_with_xmp): + props = PdfService.extract_xmp_metadata(pdf_with_xmp)['properties'] + assert props['xmp:CreateDate'] == '2024-01-01T00:00:00Z' + + def test_file_not_found(self, tmp_path): + with pytest.raises(FileNotFoundError): + 
PdfService.extract_xmp_metadata(tmp_path / 'missing.pdf') From 0c53a507f7ee0a1bb8238ad0d1613a4a6f367667 Mon Sep 17 00:00:00 2001 From: avvertix <5672748+avvertix@users.noreply.github.com> Date: Sat, 6 Jun 2026 16:57:20 +0000 Subject: [PATCH 2/4] Fix styling --- src/parxy_core/services/pdf_service.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/parxy_core/services/pdf_service.py b/src/parxy_core/services/pdf_service.py index 056c6fc..6c649f8 100644 --- a/src/parxy_core/services/pdf_service.py +++ b/src/parxy_core/services/pdf_service.py @@ -1075,9 +1075,7 @@ def extract_outline(input_path: Path) -> Dict[str, Any]: for level, title, page in toc: page_number = page if page and page > 0 else None - entries.append( - {'title': title, 'page': page_number, 'level': level} - ) + entries.append({'title': title, 'page': page_number, 'level': level}) node = { 'title': title, @@ -1152,10 +1150,7 @@ def _xmp_property_value(cls, element): for container in ('Alt', 'Seq', 'Bag'): holder = element.find(rdf + container) if holder is not None: - items = [ - (li.text or '').strip() - for li in holder.findall(rdf + 'li') - ] + items = [(li.text or '').strip() for li in holder.findall(rdf + 'li')] items = [i for i in items if i] if container == 'Alt' and len(items) == 1: return items[0] From 301f5091f1895d8b5b0d2c78ba1a6af87ae8abcb Mon Sep 17 00:00:00 2001 From: Alessio Vertemati Date: Sat, 6 Jun 2026 19:05:37 +0200 Subject: [PATCH 3/4] Attempt improve load time --- src/parxy_core/drivers/__init__.py | 55 +++++++++++++++++++++--------- src/parxy_core/drivers/factory.py | 42 ++++++++++++++++------- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/src/parxy_core/drivers/__init__.py b/src/parxy_core/drivers/__init__.py index 0793dbc..3007837 100644 --- a/src/parxy_core/drivers/__init__.py +++ b/src/parxy_core/drivers/__init__.py @@ -1,18 +1,41 @@ from parxy_core.drivers.abstract_driver import Driver as Driver from parxy_core.drivers.factory 
import DriverFactory as DriverFactory -from parxy_core.drivers.llamaparse import LlamaParseDriver as LlamaParseDriver -from parxy_core.drivers.landingai import LandingAIADEDriver as LandingAIADEDriver -from parxy_core.drivers.llmwhisperer import LlmWhispererDriver as LlmWhispererDriver -from parxy_core.drivers.pdfact import PdfActDriver as PdfActDriver -from parxy_core.drivers.pymupdf import PyMuPdfDriver as PyMuPdfDriver -from parxy_core.drivers.unstructured_local import ( - UnstructuredLocalDriver as UnstructuredLocalDriver, -) -from parxy_core.drivers.pypdfium2 import ( - PyPDFium2Driver as PyPDFium2Driver, -) -from parxy_core.drivers.pdfplumber import PDFPlumberDriver as PDFPlumberDriver -from parxy_core.drivers.pdfminer import PDFMinerDriver as PDFMinerDriver -from parxy_core.drivers.docling import DoclingDriver as DoclingDriver -from parxy_core.drivers.liteparse import LiteParseDriver as LiteParseDriver -from parxy_core.drivers.reducto import ReductoDriver as ReductoDriver + +# Concrete driver classes are exposed lazily so that `import parxy_core.drivers` +# (which the facade and CLI do on startup) does not eagerly import every +# driver's dependencies. Some drivers pull in heavy ML stacks (e.g. docling -> +# transformers/torch, ~5s), which would otherwise be loaded on every CLI +# invocation even when those drivers are never used. 
+_LAZY_DRIVERS = { + 'LlamaParseDriver': 'parxy_core.drivers.llamaparse', + 'LandingAIADEDriver': 'parxy_core.drivers.landingai', + 'LlmWhispererDriver': 'parxy_core.drivers.llmwhisperer', + 'PdfActDriver': 'parxy_core.drivers.pdfact', + 'PyMuPdfDriver': 'parxy_core.drivers.pymupdf', + 'UnstructuredLocalDriver': 'parxy_core.drivers.unstructured_local', + 'PyPDFium2Driver': 'parxy_core.drivers.pypdfium2', + 'PDFPlumberDriver': 'parxy_core.drivers.pdfplumber', + 'PDFMinerDriver': 'parxy_core.drivers.pdfminer', + 'DoclingDriver': 'parxy_core.drivers.docling', + 'LiteParseDriver': 'parxy_core.drivers.liteparse', + 'ReductoDriver': 'parxy_core.drivers.reducto', +} + +__all__ = ['Driver', 'DriverFactory', *_LAZY_DRIVERS.keys()] + + +def __getattr__(name: str): + """Lazily import driver classes on first attribute access (PEP 562).""" + module_path = _LAZY_DRIVERS.get(name) + if module_path is None: + raise AttributeError(f'module {__name__!r} has no attribute {name!r}') + + import importlib + + driver_class = getattr(importlib.import_module(module_path), name) + globals()[name] = driver_class # cache for subsequent lookups + return driver_class + + +def __dir__(): + return sorted(__all__) diff --git a/src/parxy_core/drivers/factory.py b/src/parxy_core/drivers/factory.py index 0472dfe..eb3b78f 100644 --- a/src/parxy_core/drivers/factory.py +++ b/src/parxy_core/drivers/factory.py @@ -1,20 +1,14 @@ +from __future__ import annotations + import logging from typing import Dict, Optional, Callable, Self, List +# NOTE: Concrete driver classes are imported lazily inside their respective +# `_create_*_driver` methods. Importing them at module load would eagerly pull +# in every driver's dependencies (e.g. docling -> transformers/torch), adding +# ~10s to CLI startup even for commands that never touch those drivers. 
from parxy_core.drivers.abstract_driver import Driver -from parxy_core.drivers.landingai import LandingAIADEDriver -from parxy_core.drivers.pymupdf import PyMuPdfDriver -from parxy_core.drivers.pdfact import PdfActDriver -from parxy_core.drivers.llamaparse import LlamaParseDriver -from parxy_core.drivers.llmwhisperer import LlmWhispererDriver -from parxy_core.drivers.unstructured_local import UnstructuredLocalDriver -from parxy_core.drivers.pypdfium2 import PyPDFium2Driver -from parxy_core.drivers.pdfplumber import PDFPlumberDriver -from parxy_core.drivers.pdfminer import PDFMinerDriver -from parxy_core.drivers.docling import DoclingDriver -from parxy_core.drivers.liteparse import LiteParseDriver -from parxy_core.drivers.reducto import ReductoDriver from parxy_core.models import ( PdfActConfig, LandingAIConfig, @@ -167,6 +161,8 @@ def _create_pymupdf_driver(self) -> PyMuPdfDriver: PyMuPdfDriver A new instance """ + from parxy_core.drivers.pymupdf import PyMuPdfDriver + return PyMuPdfDriver(logger=self._logger) def _create_pdfact_driver(self) -> PdfActDriver: @@ -177,6 +173,8 @@ def _create_pdfact_driver(self) -> PdfActDriver: PdfActDriver A new instance """ + from parxy_core.drivers.pdfact import PdfActDriver + return PdfActDriver(config=PdfActConfig(), logger=self._logger) def _create_llamaparse_driver(self) -> LlamaParseDriver: @@ -187,6 +185,8 @@ def _create_llamaparse_driver(self) -> LlamaParseDriver: LlamaParseDriver A new instance """ + from parxy_core.drivers.llamaparse import LlamaParseDriver + return LlamaParseDriver( config=LlamaParseConfig(), logger=self._logger, @@ -200,6 +200,8 @@ def _create_llmwhisperer_driver(self) -> LlmWhispererDriver: LlmWhispererDriver A new instance """ + from parxy_core.drivers.llmwhisperer import LlmWhispererDriver + return LlmWhispererDriver( config=LlmWhispererConfig(), logger=self._logger, @@ -213,27 +215,41 @@ def _create_unstructured_local_driver(self) -> UnstructuredLocalDriver: UnstructuredLocalDriver A new instance 
""" + from parxy_core.drivers.unstructured_local import UnstructuredLocalDriver + return UnstructuredLocalDriver( config=UnstructuredLocalConfig(), logger=self._logger, ) def _create_pypdfium_driver(self) -> PyPDFium2Driver: + from parxy_core.drivers.pypdfium2 import PyPDFium2Driver + return PyPDFium2Driver(logger=self._logger) def _create_pdfplumber_driver(self) -> PDFPlumberDriver: + from parxy_core.drivers.pdfplumber import PDFPlumberDriver + return PDFPlumberDriver(logger=self._logger) def _create_pdfminer_driver(self) -> PDFMinerDriver: + from parxy_core.drivers.pdfminer import PDFMinerDriver + return PDFMinerDriver(logger=self._logger) def _create_docling_driver(self) -> DoclingDriver: + from parxy_core.drivers.docling import DoclingDriver + return DoclingDriver(config=DoclingConfig(), logger=self._logger) def _create_liteparse_driver(self) -> LiteParseDriver: + from parxy_core.drivers.liteparse import LiteParseDriver + return LiteParseDriver(config=LiteParseConfig(), logger=self._logger) def _create_reducto_driver(self) -> ReductoDriver: + from parxy_core.drivers.reducto import ReductoDriver + return ReductoDriver(config=ReductoConfig(), logger=self._logger) def _create_landingai_driver(self) -> LandingAIADEDriver: @@ -244,6 +260,8 @@ def _create_landingai_driver(self) -> LandingAIADEDriver: LandingAIADEDriver A new instance """ + from parxy_core.drivers.landingai import LandingAIADEDriver + return LandingAIADEDriver( config=LandingAIConfig(), logger=self._logger, From 11c794860d0031c61bc96f9e032a5f0fe85a0219 Mon Sep 17 00:00:00 2001 From: avvertix <5672748+avvertix@users.noreply.github.com> Date: Sat, 6 Jun 2026 19:16:50 +0000 Subject: [PATCH 4/4] docs: sync CLI and configuration reference --- docs/reference/cli.md | 838 ++++++++++++++++---------------- docs/reference/configuration.md | 208 ++++---- 2 files changed, 523 insertions(+), 523 deletions(-) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 1fac29e..0988787 100644 --- 
a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -1,419 +1,419 @@ ---- -title: CLI reference -description: Command line reference with all parxy commands, including arguments, options, types, and defaults. Prefer to run parxy --help and parxy --help if you have access to the terminal. ---- - - - - -# CLI reference - -## `parxy agents` - -Set up AI agent configuration files for Parxy projects. - -Creates or updates an AGENTS.md file with Parxy usage documentation. -If AGENTS.md exists, the Parxy section (marked with tags) is -added or updated while preserving other content. - -Optionally creates Claude Code skill files for common operations. - -``` -parxy agents [OPTIONS] -``` - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `path` | - | Output directory for agent files. Defaults to current directory. | -| `--overwrite` | `-f` | `flag` | `false` | Overwrite existing Parxy section without prompting. | - -## `parxy attach` - -Extract an attached file from a PDF - -``` -parxy attach [OPTIONS] INPUT_FILE NAME -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file containing the attachment | -| `NAME` | Yes | Name of attached file to extract | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path. If not specified, saves to current directory with original name. | -| `--stdout` | - | `flag` | `false` | Output content to stdout (text files only) | - -## `parxy attach:add` - -Add files as attachments to a PDF - -``` -parxy attach:add [OPTIONS] INPUT_FILE FILES... 
-``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to add attachments to | -| `FILES` | Yes | One or more files to attach | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_with_attachments.pdf | -| `--description` | `-d` | `text` | - | Description for attached file(s). Matched by position to files. | -| `--name` | `-n` | `text` | - | Custom name(s) for attached file(s). Matched by position to files. | -| `--overwrite` | - | `flag` | `false` | Overwrite existing attachments with same name | - -## `parxy attach:list` - -List attached files in a PDF - -``` -parxy attach:list [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to inspect | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--verbose` | `-v` | `flag` | `false` | Show detailed information | - -## `parxy attach:remove` - -Remove attached files from a PDF - -``` -parxy attach:remove [OPTIONS] INPUT_FILE NAMES... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to process | -| `NAMES` | No | Names of attachments to remove | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_no_attachments.pdf | -| `--all` | - | `flag` | `false` | Remove all attached files | - -## `parxy docker` - -Create a Docker Compose file to run self-hostable parsers (experimental). - -``` -parxy docker -``` - -## `parxy drivers` - -List supported drivers. 
- -``` -parxy drivers -``` - -## `parxy env` - -Create an environment file with Parxy configuration. - -``` -parxy env -``` - -## `parxy markdown` - -Parse documents to Markdown. - -Accepts PDF files (parsed on-the-fly) or pre-parsed JSON result files -(loaded directly from the Document model without re-parsing). - -``` -parxy markdown [OPTIONS] INPUTS... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. (default: pymupdf or PARXY_DEFAULT_DRIVER) | -| `--level` | `-l` | `page` | `block` | `line` | `span` | `character` | `block` | Extraction level | -| `--output` | `-o` | `text` | - | Directory to save markdown files. If not specified, files are saved next to the source files. | -| `--inline` | `-i` | `flag` | `false` | Output markdown to stdout with file name as YAML frontmatter. Only valid with a single file. | -| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | -| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | -| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | -| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | -| `--page-separators` | - | `flag` | `false` | Insert HTML comments before each page's content. | - -## `parxy parse` - -Parse documents using one or more drivers. - -This command processes PDF documents and extracts their content in various formats. 
-You can specify individual files or entire folders to process. - -``` -parxy parse [OPTIONS] INPUTS... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. (default: pymupdf or PARXY_DEFAULT_DRIVER) | -| `--level` | `-l` | `page` | `block` | `line` | `span` | `character` | `block` | Extraction level | -| `--mode` | `-m` | `json` | `plain` | `markdown` | `json` | Output mode: json (JSON serialization), plain (plain text), or markdown (markdown format) | -| `--output` | `-o` | `text` | - | Directory to save output files. If not specified, files will be saved in the same directory as the source files. | -| `--show` | `-s` | `flag` | `false` | Show document content in console in addition to saving to files | -| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | -| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | -| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | -| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | - -## `parxy pdf:merge` - -Merge multiple PDF files into a single PDF - -``` -parxy pdf:merge [OPTIONS] INPUTS... -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUTS` | Yes | One or more PDF files or folders to merge. Files support page ranges in square brackets (e.g., file.pdf[1:3]). Folders are processed non-recursively. 
| - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path for the merged PDF. If not specified, you will be prompted. | - -## `parxy pdf:outline` - -Print or export the outline (bookmarks / table of contents) of a PDF - -``` -parxy pdf:outline [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to inspect | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Write the outline as JSON to this file instead of printing a tree. | -| `--json` | - | `flag` | `false` | Print the outline as JSON to stdout. | -| `--flat` | - | `flag` | `false` | Print a flat, indented list instead of a tree. | - -## `parxy pdf:split` - -Split a PDF file into individual pages - -``` -parxy pdf:split [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to split | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output path. Without --combine: output directory for split files (default: folder next to input). With --combine: output file path (default: {stem}_pages_{from}-{to}.pdf next to input). | -| `--prefix` | `-p` | `text` | - | Prefix for output filenames. If not specified, uses the input filename. | -| `--pages` | - | `text` | - | Page range to extract (1-based). Examples: "1" (single page), "1:3" (pages 1-3), ":3" (up to page 3), "3:" (from page 3). If not specified, all pages are extracted. | -| `--combine` | - | `flag` | `false` | Combine extracted pages into a single PDF instead of one file per page. 
| -| `--every` | `-e` | `integer` | - | Split into chunks of N pages each. Cannot be used with --combine. | - -## `parxy pdf:split-by-text` - -Split a PDF into chunks whenever a page matches a text condition - -``` -parxy pdf:split-by-text [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to split | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--text` | `-t` | `text` | - | Text to match. Can be repeated for multiple patterns (OR logic). | -| `--mode` | `-m` | `text` | `contains` | Matching mode: "contains" (default) or "starts-with". | -| `--ignore-case` | `-i` | `flag` | `false` | Case-insensitive matching. | -| `--regex` | - | `flag` | `false` | Treat --text values as regular expressions. | -| `--discard-preamble` | - | `flag` | `false` | Discard pages that appear before the first matching page. | -| `--output` | `-o` | `text` | - | Output directory for chunk files (default: {stem}_split next to input). | -| `--prefix` | `-p` | `text` | - | Prefix for output filenames. Defaults to the input filename stem. | - -## `parxy pdf:tag-skeleton` - -Copy a tagged PDF keeping its tags but removing visible content - -``` -parxy pdf:tag-skeleton [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | Tagged PDF file to strip | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output path for the tags-only PDF (default: {stem}_tags.pdf next to input). 
| - -## `parxy pdf:tag-template` - -Create an empty tagged PDF skeleton for accessibility work - -``` -parxy pdf:tag-template [OPTIONS] -``` - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Output file path for the template PDF. If not specified, you will be prompted. | -| `--pages` | - | `integer` | `1` | Number of blank pages to create (default: 1). | -| `--lang` | - | `text` | `en-US` | Document language tag set on the catalog (default: en-US). | -| `--title` | - | `text` | - | Optional document title stored in the PDF metadata. | - -## `parxy pdf:tags` - -Extract the tag (structure) tree of a tagged PDF - -``` -parxy pdf:tags [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to inspect | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Write the extracted tags as JSON to this file instead of printing a tree. | -| `--json` | - | `flag` | `false` | Print the extracted tags as JSON to stdout. | -| `--text` | - | `flag` | `false` | Include the text content of each element. Rebuilds the tree per page; accessibility attributes (alt text, page refs) are not shown in this mode. | - -## `parxy pdf:tags-check` - -Check whether a PDF is a tagged (accessible) PDF - -``` -parxy pdf:tags-check [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to inspect | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--json` | - | `flag` | `false` | Output the detection result as JSON. 
| - -## `parxy pdf:xmp` - -Read and extract the XMP metadata of a PDF - -``` -parxy pdf:xmp [OPTIONS] INPUT_FILE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `INPUT_FILE` | Yes | PDF file to inspect | - -**Options:** - -| Option | Short | Type | Default | Description | -|--------|-------|------|---------|-------------| -| `--output` | `-o` | `text` | - | Write the metadata to this file. A .xml extension writes the raw XMP packet; any other extension writes parsed JSON. | -| `--json` | - | `flag` | `false` | Print the parsed metadata as JSON to stdout. | -| `--raw` | - | `flag` | `false` | Print the raw XMP XML packet to stdout. | - -## `parxy tui` - -Launch the Parxy TUI for interactive parser comparison - -``` -parxy tui WORKSPACE -``` - -**Arguments:** - -| Argument | Required | Description | -|----------|----------|-------------| -| `WORKSPACE` | No | Path to the workspace folder (optional — can be selected inside the TUI) | - -## `parxy version` - -Print Parxy version information. - -``` -parxy version -``` +--- +title: CLI reference +description: Command line reference with all parxy commands, including arguments, options, types, and defaults. Prefer to run parxy --help and parxy --help if you have access to the terminal. +--- + + + + +# CLI reference + +## `parxy agents` + +Set up AI agent configuration files for Parxy projects. + +Creates or updates an AGENTS.md file with Parxy usage documentation. +If AGENTS.md exists, the Parxy section (marked with tags) is +added or updated while preserving other content. + +Optionally creates Claude Code skill files for common operations. + +``` +parxy agents [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `path` | - | Output directory for agent files. Defaults to current directory. 
| +| `--overwrite` | `-f` | `flag` | `false` | Overwrite existing Parxy section without prompting. | + +## `parxy attach` + +Extract an attached file from a PDF + +``` +parxy attach [OPTIONS] INPUT_FILE NAME +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file containing the attachment | +| `NAME` | Yes | Name of attached file to extract | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. If not specified, saves to current directory with original name. | +| `--stdout` | - | `flag` | `false` | Output content to stdout (text files only) | + +## `parxy attach:add` + +Add files as attachments to a PDF + +``` +parxy attach:add [OPTIONS] INPUT_FILE FILES... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to add attachments to | +| `FILES` | Yes | One or more files to attach | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_with_attachments.pdf | +| `--description` | `-d` | `text` | - | Description for attached file(s). Matched by position to files. | +| `--name` | `-n` | `text` | - | Custom name(s) for attached file(s). Matched by position to files. 
| +| `--overwrite` | - | `flag` | `false` | Overwrite existing attachments with same name | + +## `parxy attach:list` + +List attached files in a PDF + +``` +parxy attach:list [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--verbose` | `-v` | `flag` | `false` | Show detailed information | + +## `parxy attach:remove` + +Remove attached files from a PDF + +``` +parxy attach:remove [OPTIONS] INPUT_FILE NAMES... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to process | +| `NAMES` | No | Names of attachments to remove | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path. If not specified, creates {input}_no_attachments.pdf | +| `--all` | - | `flag` | `false` | Remove all attached files | + +## `parxy docker` + +Create a Docker Compose file to run self-hostable parsers (experimental). + +``` +parxy docker +``` + +## `parxy drivers` + +List supported drivers. + +``` +parxy drivers +``` + +## `parxy env` + +Create an environment file with Parxy configuration. + +``` +parxy env +``` + +## `parxy markdown` + +Parse documents to Markdown. + +Accepts PDF files (parsed on-the-fly) or pre-parsed JSON result files +(loaded directly from the Document model without re-parsing). + +``` +parxy markdown [OPTIONS] INPUTS... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. 
| + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. (default: pymupdf or PARXY_DEFAULT_DRIVER) | +| `--level` | `-l` | `page` \| `block` \| `line` \| `span` \| `character` | `block` | Extraction level | +| `--output` | `-o` | `text` | - | Directory to save markdown files. If not specified, files are saved next to the source files. | +| `--inline` | `-i` | `flag` | `false` | Output markdown to stdout with file name as YAML frontmatter. Only valid with a single file. | +| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | +| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | +| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | +| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | +| `--page-separators` | - | `flag` | `false` | Insert HTML comments before each page's content. | + +## `parxy parse` + +Parse documents using one or more drivers. + +This command processes PDF documents and extracts their content in various formats. +You can specify individual files or entire folders to process. + +``` +parxy parse [OPTIONS] INPUTS... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more files or folders to parse. Use --recursive to search subdirectories. | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--driver` | `-d` | `text` | - | Driver(s) to use for parsing. Can be specified multiple times. 
(default: pymupdf or PARXY_DEFAULT_DRIVER) | +| `--level` | `-l` | `page` \| `block` \| `line` \| `span` \| `character` | `block` | Extraction level | +| `--mode` | `-m` | `json` \| `plain` \| `markdown` | `json` | Output mode: json (JSON serialization), plain (plain text), or markdown (markdown format) | +| `--output` | `-o` | `text` | - | Directory to save output files. If not specified, files will be saved in the same directory as the source files. | +| `--show` | `-s` | `flag` | `false` | Show document content in console in addition to saving to files | +| `--recursive` | `-r` | `flag` | `false` | Recursively search subdirectories when processing folders | +| `--max-depth` | - | `integer range` | - | Maximum depth to recurse into subdirectories (only applies with --recursive). 0 = current directory only, 1 = one level down, etc. | +| `--stop-on-failure` | - | `flag` | `false` | Stop processing files immediately if an error occurs with any file | +| `--workers` | `-w` | `integer range` | - | Number of parallel workers to use. Defaults to cpu count. | + +## `parxy pdf:merge` + +Merge multiple PDF files into a single PDF + +``` +parxy pdf:merge [OPTIONS] INPUTS... +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUTS` | Yes | One or more PDF files or folders to merge. Files support page ranges in square brackets (e.g., file.pdf[1:3]). Folders are processed non-recursively. | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path for the merged PDF. If not specified, you will be prompted. 
| + +## `parxy pdf:outline` + +Print or export the outline (bookmarks / table of contents) of a PDF + +``` +parxy pdf:outline [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Write the outline as JSON to this file instead of printing a tree. | +| `--json` | - | `flag` | `false` | Print the outline as JSON to stdout. | +| `--flat` | - | `flag` | `false` | Print a flat, indented list instead of a tree. | + +## `parxy pdf:split` + +Split a PDF file into individual pages + +``` +parxy pdf:split [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to split | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output path. Without --combine: output directory for split files (default: folder next to input). With --combine: output file path (default: {stem}_pages_{from}-{to}.pdf next to input). | +| `--prefix` | `-p` | `text` | - | Prefix for output filenames. If not specified, uses the input filename. | +| `--pages` | - | `text` | - | Page range to extract (1-based). Examples: "1" (single page), "1:3" (pages 1-3), ":3" (up to page 3), "3:" (from page 3). If not specified, all pages are extracted. | +| `--combine` | - | `flag` | `false` | Combine extracted pages into a single PDF instead of one file per page. | +| `--every` | `-e` | `integer` | - | Split into chunks of N pages each. Cannot be used with --combine. 
| + +## `parxy pdf:split-by-text` + +Split a PDF into chunks whenever a page matches a text condition + +``` +parxy pdf:split-by-text [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to split | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--text` | `-t` | `text` | - | Text to match. Can be repeated for multiple patterns (OR logic). | +| `--mode` | `-m` | `text` | `contains` | Matching mode: "contains" (default) or "starts-with". | +| `--ignore-case` | `-i` | `flag` | `false` | Case-insensitive matching. | +| `--regex` | - | `flag` | `false` | Treat --text values as regular expressions. | +| `--discard-preamble` | - | `flag` | `false` | Discard pages that appear before the first matching page. | +| `--output` | `-o` | `text` | - | Output directory for chunk files (default: {stem}_split next to input). | +| `--prefix` | `-p` | `text` | - | Prefix for output filenames. Defaults to the input filename stem. | + +## `parxy pdf:tag-skeleton` + +Copy a tagged PDF keeping its tags but removing visible content + +``` +parxy pdf:tag-skeleton [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | Tagged PDF file to strip | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output path for the tags-only PDF (default: {stem}_tags.pdf next to input). | + +## `parxy pdf:tag-template` + +Create an empty tagged PDF skeleton for accessibility work + +``` +parxy pdf:tag-template [OPTIONS] +``` + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Output file path for the template PDF. 
If not specified, you will be prompted. | +| `--pages` | - | `integer` | `1` | Number of blank pages to create (default: 1). | +| `--lang` | - | `text` | `en-US` | Document language tag set on the catalog (default: en-US). | +| `--title` | - | `text` | - | Optional document title stored in the PDF metadata. | + +## `parxy pdf:tags` + +Extract the tag (structure) tree of a tagged PDF + +``` +parxy pdf:tags [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Write the extracted tags as JSON to this file instead of printing a tree. | +| `--json` | - | `flag` | `false` | Print the extracted tags as JSON to stdout. | +| `--text` | - | `flag` | `false` | Include the text content of each element. Rebuilds the tree per page; accessibility attributes (alt text, page refs) are not shown in this mode. | + +## `parxy pdf:tags-check` + +Check whether a PDF is a tagged (accessible) PDF + +``` +parxy pdf:tags-check [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--json` | - | `flag` | `false` | Output the detection result as JSON. 
| + +## `parxy pdf:xmp` + +Read and extract the XMP metadata of a PDF + +``` +parxy pdf:xmp [OPTIONS] INPUT_FILE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `INPUT_FILE` | Yes | PDF file to inspect | + +**Options:** + +| Option | Short | Type | Default | Description | +|--------|-------|------|---------|-------------| +| `--output` | `-o` | `text` | - | Write the metadata to this file. A .xml extension writes the raw XMP packet; any other extension writes parsed JSON. | +| `--json` | - | `flag` | `false` | Print the parsed metadata as JSON to stdout. | +| `--raw` | - | `flag` | `false` | Print the raw XMP XML packet to stdout. | + +## `parxy tui` + +Launch the Parxy TUI for interactive parser comparison + +``` +parxy tui WORKSPACE +``` + +**Arguments:** + +| Argument | Required | Description | +|----------|----------|-------------| +| `WORKSPACE` | No | Path to the workspace folder (optional — can be selected inside the TUI) | + +## `parxy version` + +Print Parxy version information. + +``` +parxy version +``` diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index 0c97211..8ea71cb 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -1,104 +1,104 @@ ---- -title: Configuration reference -description: Configuration options for Parxy and the drivers. Settings are read from the environment or a .env file. Run parxy env to generate a starter .env with some default. ---- - - - - -# Configuration reference - -All settings are read from environment variables or a `.env` file in your project root. - -Run `parxy env` to generate a template `.env` with usual configuration options. - -## Core settings - -Prefix: `PARXY_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_DEFAULT_DRIVER` | `pymupdf` | The default driver to use in case nothing is specified. | -| `PARXY_LOGGING_LEVEL` | `INFO` | The logging level. 
| -| `PARXY_LOGGING_FILE` | - | The log file path. | -| `PARXY_THEME` | - | The console theme to use. One of: `light`, `dark`. | - -## Observability / tracing - -Prefix: `PARXY_TRACING_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_TRACING_ENABLE` | `false` | Enable sending traces to the observability service. | -| `PARXY_TRACING_API_KEY` | *(secret)* | The authentication key (used for both traces and metrics unless overridden). | -| `PARXY_TRACING_ENDPOINT` | `http://localhost:4318/` | The base url of the Open Telemetry collector endpoint. | -| `PARXY_TRACING_ENABLE_METRICS` | `false` | Enable sending metrics to the telemetry service. | -| `PARXY_TRACING_TRACES_ENDPOINT` | *(computed)* | The endpoint for the traces exporter. | -| `PARXY_TRACING_METRICS_ENDPOINT` | *(computed)* | The endpoint for the metrics exporter. | -| `PARXY_TRACING_TIMEOUT_SECONDS` | `10` | The client timeout when sending traces. | -| `PARXY_TRACING_USE_COMPRESSION` | `true` | The client should compress traces before send. | -| `PARXY_TRACING_VERBOSE` | `true` | Log when traces are sent. | -| `PARXY_TRACING_AUTHENTICATION_HEADER` | `Authorization` | The header in which the api key needs to be included for authentication purposes. | - -## PdfAct - -Prefix: `PARXY_PDFACT_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_PDFACT_BASE_URL` | `http://localhost:4567/` | The base URL of the PdfAct API. | -| `PARXY_PDFACT_API_KEY` | *(secret)* | The authentication key. | - -## LlamaParse - -Prefix: `PARXY_LLAMAPARSE_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_LLAMAPARSE_BASE_URL` | `https://api.cloud.eu.llamaindex.ai` | The base URL of the LlamaParse API. | -| `PARXY_LLAMAPARSE_API_KEY` | *(secret)* | The authentication key. | -| `PARXY_LLAMAPARSE_ORGANIZATION_ID` | - | The organization ID for the LlamaParse API. 
| -| `PARXY_LLAMAPARSE_PROJECT_ID` | - | The project ID for the LlamaParse API. | -| `PARXY_LLAMAPARSE_TIER` | - | Parsing tier to use. One of: `fast`, `cost_effective`, `agentic`, `agentic_plus`. | -| `PARXY_LLAMAPARSE_VERSION` | `latest` | API version string. | -| `PARXY_LLAMAPARSE_PARSE_MODE` | - | Legacy parsing mode. | -| `PARXY_LLAMAPARSE_PREMIUM_MODE` | `false` | If True, selects the 'agentic_plus' tier (legacy shorthand). | -| `PARXY_LLAMAPARSE_FAST_MODE` | `false` | If True, selects the 'fast' tier (legacy shorthand). | -| `PARXY_LLAMAPARSE_DISABLE_OCR` | `false` | Disable OCR on images embedded in the document. | -| `PARXY_LLAMAPARSE_SKIP_DIAGONAL_TEXT` | `false` | Skip text rotated at an angle (e.g. | -| `PARXY_LLAMAPARSE_LANGUAGE` | `en` | Primary language for OCR (e.g. | -| `PARXY_LLAMAPARSE_DO_NOT_UNROLL_COLUMNS` | `false` | Keep multi-column layout intact instead of linearising columns into sequential text. | -| `PARXY_LLAMAPARSE_DISABLE_IMAGE_EXTRACTION` | `false` | If True, skip image extraction. | -| `PARXY_LLAMAPARSE_CONTINUOUS_MODE` | `false` | Automatically merge tables that span multiple pages. | -| `PARXY_LLAMAPARSE_TARGET_PAGES` | - | Specific pages to extract. | -| `PARXY_LLAMAPARSE_MAX_PAGES` | - | Maximum number of pages to extract. | -| `PARXY_LLAMAPARSE_DO_NOT_CACHE` | `true` | If True, bypass result caching and force re-parsing. | -| `PARXY_LLAMAPARSE_VERBOSE` | `false` | Print progress indicators during parsing. | - -## LLMWhisperer - -Prefix: `PARXY_LLMWHISPERER_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_LLMWHISPERER_BASE_URL` | `https://llmwhisperer-api.eu-west.unstract.com/api/v2` | The base URL of the LlmWhisperer API v2. | -| `PARXY_LLMWHISPERER_API_KEY` | *(secret)* | The authentication key. | -| `PARXY_LLMWHISPERER_LOGGING_LEVEL` | `INFO` | The logging level for the client. | -| `PARXY_LLMWHISPERER_MODE` | `form` | Default parsing mode. 
| - -## Landing AI - -Prefix: `PARXY_LANDINGAI_` - -| Variable | Default | Description | -|----------|---------|-------------| -| `PARXY_LANDINGAI_API_KEY` | *(secret)* | The authentication key. | -| `PARXY_LANDINGAI_ENVIRONMENT` | `eu` | The environment to use. One of: `production`, `eu`. | -| `PARXY_LANDINGAI_BASE_URL` | - | The base URL of the Landing AI ADE API. | - -## Unstructured library - -Prefix: `PARXY_UNSTRUCTURED_LOCAL_` - -| Variable | Default | Description | -|----------|---------|-------------| +--- +title: Configuration reference +description: Configuration options for Parxy and the drivers. Settings are read from the environment or a .env file. Run parxy env to generate a starter .env with some defaults. +--- + + + + +# Configuration reference + +All settings are read from environment variables or a `.env` file in your project root. + +Run `parxy env` to generate a template `.env` with the usual configuration options. + +## Core settings + +Prefix: `PARXY_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_DEFAULT_DRIVER` | `pymupdf` | The default driver to use in case nothing is specified. | +| `PARXY_LOGGING_LEVEL` | `INFO` | The logging level. | +| `PARXY_LOGGING_FILE` | - | The log file path. | +| `PARXY_THEME` | - | The console theme to use. One of: `light`, `dark`. | + +## Observability / tracing + +Prefix: `PARXY_TRACING_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_TRACING_ENABLE` | `false` | Enable sending traces to the observability service. | +| `PARXY_TRACING_API_KEY` | *(secret)* | The authentication key (used for both traces and metrics unless overridden). | +| `PARXY_TRACING_ENDPOINT` | `http://localhost:4318/` | The base URL of the Open Telemetry collector endpoint. | +| `PARXY_TRACING_ENABLE_METRICS` | `false` | Enable sending metrics to the telemetry service. | +| `PARXY_TRACING_TRACES_ENDPOINT` | *(computed)* | The endpoint for the traces exporter. 
| +| `PARXY_TRACING_METRICS_ENDPOINT` | *(computed)* | The endpoint for the metrics exporter. | +| `PARXY_TRACING_TIMEOUT_SECONDS` | `10` | The client timeout when sending traces. | +| `PARXY_TRACING_USE_COMPRESSION` | `true` | The client should compress traces before sending. | +| `PARXY_TRACING_VERBOSE` | `true` | Log when traces are sent. | +| `PARXY_TRACING_AUTHENTICATION_HEADER` | `Authorization` | The header in which the API key needs to be included for authentication purposes. | + +## PdfAct + +Prefix: `PARXY_PDFACT_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_PDFACT_BASE_URL` | `http://localhost:4567/` | The base URL of the PdfAct API. | +| `PARXY_PDFACT_API_KEY` | *(secret)* | The authentication key. | + +## LlamaParse + +Prefix: `PARXY_LLAMAPARSE_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LLAMAPARSE_BASE_URL` | `https://api.cloud.eu.llamaindex.ai` | The base URL of the LlamaParse API. | +| `PARXY_LLAMAPARSE_API_KEY` | *(secret)* | The authentication key. | +| `PARXY_LLAMAPARSE_ORGANIZATION_ID` | - | The organization ID for the LlamaParse API. | +| `PARXY_LLAMAPARSE_PROJECT_ID` | - | The project ID for the LlamaParse API. | +| `PARXY_LLAMAPARSE_TIER` | - | Parsing tier to use. One of: `fast`, `cost_effective`, `agentic`, `agentic_plus`. | +| `PARXY_LLAMAPARSE_VERSION` | `latest` | API version string. | +| `PARXY_LLAMAPARSE_PARSE_MODE` | - | Legacy parsing mode. | +| `PARXY_LLAMAPARSE_PREMIUM_MODE` | `false` | If True, selects the 'agentic_plus' tier (legacy shorthand). | +| `PARXY_LLAMAPARSE_FAST_MODE` | `false` | If True, selects the 'fast' tier (legacy shorthand). | +| `PARXY_LLAMAPARSE_DISABLE_OCR` | `false` | Disable OCR on images embedded in the document. | +| `PARXY_LLAMAPARSE_SKIP_DIAGONAL_TEXT` | `false` | Skip text rotated at an angle. | +| `PARXY_LLAMAPARSE_LANGUAGE` | `en` | Primary language for OCR. 
| +| `PARXY_LLAMAPARSE_DO_NOT_UNROLL_COLUMNS` | `false` | Keep multi-column layout intact instead of linearising columns into sequential text. | +| `PARXY_LLAMAPARSE_DISABLE_IMAGE_EXTRACTION` | `false` | If True, skip image extraction. | +| `PARXY_LLAMAPARSE_CONTINUOUS_MODE` | `false` | Automatically merge tables that span multiple pages. | +| `PARXY_LLAMAPARSE_TARGET_PAGES` | - | Specific pages to extract. | +| `PARXY_LLAMAPARSE_MAX_PAGES` | - | Maximum number of pages to extract. | +| `PARXY_LLAMAPARSE_DO_NOT_CACHE` | `true` | If True, bypass result caching and force re-parsing. | +| `PARXY_LLAMAPARSE_VERBOSE` | `false` | Print progress indicators during parsing. | + +## LLMWhisperer + +Prefix: `PARXY_LLMWHISPERER_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LLMWHISPERER_BASE_URL` | `https://llmwhisperer-api.eu-west.unstract.com/api/v2` | The base URL of the LlmWhisperer API v2. | +| `PARXY_LLMWHISPERER_API_KEY` | *(secret)* | The authentication key. | +| `PARXY_LLMWHISPERER_LOGGING_LEVEL` | `INFO` | The logging level for the client. | +| `PARXY_LLMWHISPERER_MODE` | `form` | Default parsing mode. | + +## Landing AI + +Prefix: `PARXY_LANDINGAI_` + +| Variable | Default | Description | +|----------|---------|-------------| +| `PARXY_LANDINGAI_API_KEY` | *(secret)* | The authentication key. | +| `PARXY_LANDINGAI_ENVIRONMENT` | `eu` | The environment to use. One of: `production`, `eu`. | +| `PARXY_LANDINGAI_BASE_URL` | - | The base URL of the Landing AI ADE API. | + +## Unstructured library + +Prefix: `PARXY_UNSTRUCTURED_LOCAL_` + +| Variable | Default | Description | +|----------|---------|-------------|