From f6d0a597961a983130a739cd93e087f079da0b2e Mon Sep 17 00:00:00 2001
From: mhadica
Date: Thu, 12 Feb 2026 23:55:20 +0530
Subject: [PATCH] FEAT Add WordDocConverter

---
 pyproject.toml                               |   1 +
 pyrit/prompt_converter/__init__.py           |   2 +
 pyrit/prompt_converter/word_doc_converter.py | 316 +++++++++++++++++++
 3 files changed, 319 insertions(+)
 create mode 100644 pyrit/prompt_converter/word_doc_converter.py

diff --git a/pyproject.toml b/pyproject.toml
index aac2f2929..a758e3e21 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
     "python-dotenv>=1.0.1",
     "pypdf>=6.6.2",
     "reportlab>=4.4.4",
+    "python-docx>=1.1.0",
     "segno>=1.6.6",
     "scipy>=1.15.3",
     "SQLAlchemy>=2.0.41",
diff --git a/pyrit/prompt_converter/__init__.py b/pyrit/prompt_converter/__init__.py
index e4f85ba2a..40c65798f 100644
--- a/pyrit/prompt_converter/__init__.py
+++ b/pyrit/prompt_converter/__init__.py
@@ -97,6 +97,7 @@
 from pyrit.prompt_converter.unicode_sub_converter import UnicodeSubstitutionConverter
 from pyrit.prompt_converter.url_converter import UrlConverter
 from pyrit.prompt_converter.variation_converter import VariationConverter
+from pyrit.prompt_converter.word_doc_converter import WordDoc_Converter
 from pyrit.prompt_converter.zalgo_converter import ZalgoConverter
 from pyrit.prompt_converter.zero_width_converter import ZeroWidthConverter
 
@@ -177,6 +178,7 @@
     "UnicodeSubstitutionConverter",
     "UrlConverter",
     "VariationConverter",
     "VariationSelectorSmugglerConverter",
+    "WordDoc_Converter",
     "WordIndexSelectionStrategy",
     "WordKeywordSelectionStrategy",
diff --git a/pyrit/prompt_converter/word_doc_converter.py b/pyrit/prompt_converter/word_doc_converter.py
new file mode 100644
index 000000000..5b8693cda
--- /dev/null
+++ b/pyrit/prompt_converter/word_doc_converter.py
@@ -0,0 +1,316 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from __future__ import annotations
+
+import ast
+import hashlib
+from dataclasses import dataclass
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+from docx import Document  # type: ignore[import-untyped]
+
+from pyrit.common.logger import logger
+from pyrit.identifiers import ConverterIdentifier
+from pyrit.models import PromptDataType, SeedPrompt, data_serializer_factory
+from pyrit.models.data_type_serializer import DataTypeSerializer
+from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter
+
+
+@dataclass
+class _WordDocInjectionConfig:
+    """Configuration for how to inject content into a Word document."""
+
+    # Template document to inject into; None means a new document is generated.
+    existing_docx: Optional[Path]
+    # Literal text that gets replaced with the rendered prompt content.
+    placeholder: str
+
+
+# NOTE(review): sibling converters are named without underscores (for example
+# UrlConverter, ZalgoConverter); consider renaming to WordDocConverter in a
+# follow-up. The name is kept here because it is the exported public name.
+class WordDoc_Converter(PromptConverter):
+    """Convert a text prompt into a Word (.docx) document.
+
+    This converter supports two main modes:
+
+    1. **New document generation**
+       If no existing document is provided, the converter creates a simple `.docx`
+       containing the rendered prompt content in a single paragraph.
+
+    2. **Placeholder-based injection into an existing document**
+       If an ``existing_docx`` is provided, the converter searches for a literal
+       placeholder string (for example ``{{INJECTION_PLACEHOLDER}}``) in the
+       document's paragraphs. Every run that fully contains the placeholder has
+       it replaced with the rendered prompt content, while the other runs of the
+       paragraph and their formatting are left untouched.
+
+       .. important::
+          Placeholders must be fully contained within a single run. If a
+          placeholder spans multiple runs (for example due to mixed formatting),
+          this converter will not replace it. This limitation is intentional to
+          avoid collapsing mixed formatting or rewriting complex run structures.
+
+    Security note:
+        This converter does **not** render Jinja2 templates from arbitrary
+        ``.docx`` content. Templating is handled via ``SeedPrompt`` (if provided),
+        and only the already-rendered text is injected into the document. This
+        avoids executing untrusted Jinja2 templates from document bodies.
+    """
+
+    SUPPORTED_INPUT_TYPES = ("text",)
+    SUPPORTED_OUTPUT_TYPES = ("binary_path",)
+
+    def __init__(
+        self,
+        *,
+        prompt_template: Optional[SeedPrompt] = None,
+        existing_docx: Optional[Path] = None,
+        placeholder: str = "{{INJECTION_PLACEHOLDER}}",
+    ) -> None:
+        """Initialize the Word document converter.
+
+        Args:
+            prompt_template: Optional ``SeedPrompt`` template used to render the
+                final content before injection. If provided, ``prompt`` should
+                be a dict-like object (or string representation) whose keys map
+                to the template parameters.
+            existing_docx: Optional path to an existing `.docx` file. When
+                provided, the converter will search for ``placeholder`` inside
+                the document paragraphs and replace it with the rendered content.
+                If not provided, a new document is generated instead.
+            placeholder: Literal placeholder text to search for in the existing
+                document. This value must be fully contained within a single
+                run for the replacement to succeed.
+
+        Raises:
+            FileNotFoundError: If ``existing_docx`` is provided but is not an
+                existing file.
+            ValueError: If ``placeholder`` is empty.
+        """
+        super().__init__()
+
+        if not placeholder:
+            raise ValueError("Placeholder must be a non-empty string.")
+
+        if existing_docx is not None and not existing_docx.is_file():
+            raise FileNotFoundError(f"Word document not found at: {existing_docx}")
+
+        self._prompt_template = prompt_template
+        self._injection_config = _WordDocInjectionConfig(
+            existing_docx=existing_docx,
+            placeholder=placeholder,
+        )
+
+    def _build_identifier(self) -> ConverterIdentifier:
+        """Build identifier with template and document parameters."""
+        template_hash: Optional[str] = None
+        if self._prompt_template:
+            # A short digest identifies the template without embedding its full text.
+            template_hash = hashlib.sha256(str(self._prompt_template.value).encode("utf-8")).hexdigest()[:16]
+
+        existing_docx_path: Optional[str] = None
+        if self._injection_config.existing_docx is not None:
+            existing_docx_path = str(self._injection_config.existing_docx)
+
+        return self._create_identifier(
+            converter_specific_params={
+                "prompt_template_hash": template_hash,
+                "existing_docx_path": existing_docx_path,
+                "placeholder": self._injection_config.placeholder,
+            }
+        )
+
+    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
+        """Convert the given prompt into a Word document (.docx).
+
+        If ``prompt_template`` is provided, the prompt is first used to render the
+        template via ``SeedPrompt.render_template_value``. Otherwise, the raw
+        ``prompt`` string is used as the content.
+
+        - When ``existing_docx`` is set, this content is injected into the
+          document by replacing the configured placeholder string.
+        - When no ``existing_docx`` is provided, a new document with a single
+          paragraph containing the content is created.
+
+        Args:
+            prompt: The prompt or dynamic data used to generate the content.
+            input_type: The type of input data. Must be ``"text"``.
+
+        Returns:
+            ConverterResult: Contains the path to the generated `.docx` file in
+            ``output_text`` and ``output_type="binary_path"``.
+
+        Raises:
+            ValueError: If the input type is not supported, or if the prompt
+                cannot be rendered with the configured template.
+        """
+        if not self.input_supported(input_type):
+            raise ValueError("Input type not supported")
+
+        content = self._prepare_content(prompt)
+
+        if self._injection_config.existing_docx is not None:
+            doc_bytes = self._inject_into_existing_docx(content)
+        else:
+            doc_bytes = self._generate_new_docx(content)
+
+        serializer = await self._serialize_docx(doc_bytes)
+
+        return ConverterResult(output_text=serializer.value, output_type="binary_path")
+
+    def _prepare_content(self, prompt: str) -> str:
+        """Prepare the content to be injected or written to the document.
+
+        If a ``SeedPrompt`` template is provided, the ``prompt`` is parsed (if
+        necessary) as a dictionary and used to render the template. Otherwise,
+        the raw prompt string is used.
+
+        Args:
+            prompt: Raw prompt text, or a Python-literal dict (as a string) holding
+                the template parameters when a template is configured.
+
+        Returns:
+            str: The text to place into the document.
+
+        Raises:
+            ValueError: If the prompt cannot be parsed into a dict, if rendering
+                the template fails, or if ``prompt`` is not a string when no
+                template is configured.
+        """
+        if self._prompt_template:
+            logger.debug("Preparing Word content with template: %s", self._prompt_template.value)
+            try:
+                dynamic_data: Dict[str, Any]
+                if isinstance(prompt, str):
+                    # literal_eval only accepts Python literals, so no code is executed.
+                    dynamic_data = ast.literal_eval(prompt)
+                else:
+                    dynamic_data = prompt  # type: ignore[assignment]
+
+                if not isinstance(dynamic_data, dict):
+                    raise ValueError("Prompt must be a dictionary-compatible object after parsing.")
+
+                rendered_content = self._prompt_template.render_template_value(**dynamic_data)
+            except (ValueError, SyntaxError, KeyError, TypeError) as exc:
+                # TypeError covers unhashable dict keys raised by literal_eval and
+                # non-string keys rejected by the ``**`` expansion above.
+                logger.error("Error rendering Word template content: %s", exc)
+                raise ValueError(f"Failed to render the prompt for Word document: {exc}") from exc
+
+            logger.debug("Rendered Word template content successfully.")
+            return rendered_content
+
+        if isinstance(prompt, str):
+            logger.debug("No template provided for Word document. Using raw prompt content.")
+            return prompt
+
+        raise ValueError("Prompt must be a string when no template is provided.")
+
+    def _generate_new_docx(self, content: str) -> bytes:
+        """Generate a new `.docx` document containing the given content.
+
+        Args:
+            content: Text written as the single paragraph of the new document.
+
+        Returns:
+            bytes: The serialized `.docx` file.
+        """
+        document = Document()
+        document.add_paragraph(content)
+        return self._document_to_bytes(document)
+
+    def _inject_into_existing_docx(self, content: str) -> bytes:
+        """Inject content into an existing document by replacing the placeholder.
+
+        Only the paragraphs exposed by ``document.paragraphs`` are searched. The
+        placeholder must appear fully inside a single run; if it only exists
+        across multiple runs, it will not be replaced and a warning is logged
+        when nothing in the document was replaced.
+
+        Args:
+            content: Text that replaces every matching placeholder.
+
+        Returns:
+            bytes: The serialized `.docx` file, unchanged apart from replacements.
+
+        Raises:
+            ValueError: If no existing document is configured.
+        """
+        existing_docx = self._injection_config.existing_docx
+        if existing_docx is None:
+            # Explicit check instead of ``assert``: asserts are stripped under -O.
+            raise ValueError("No existing Word document configured for injection.")
+
+        document = Document(existing_docx)
+        placeholder = self._injection_config.placeholder
+        replaced_any = False
+
+        for paragraph in document.paragraphs:
+            # Cheap paragraph-level check before walking the individual runs.
+            if placeholder not in paragraph.text:
+                continue
+
+            if self._replace_placeholder_in_paragraph(paragraph, placeholder, content):
+                replaced_any = True
+
+        if not replaced_any:
+            logger.warning(
+                "No placeholder '%s' found in document '%s' or placeholder spanned multiple runs.",
+                placeholder,
+                existing_docx,
+            )
+
+        return self._document_to_bytes(document)
+
+    @staticmethod
+    def _replace_placeholder_in_paragraph(paragraph: Any, placeholder: str, content: str) -> bool:
+        """Replace the placeholder in every run of a paragraph that contains it.
+
+        Each run is rewritten independently with a plain string replacement, so
+        runs that do not contain the placeholder are never modified. A
+        placeholder that is split across several runs is not matched.
+
+        Args:
+            paragraph: Paragraph object exposing ``runs`` whose items have ``text``.
+            placeholder: Literal text to search for.
+            content: Replacement text.
+
+        Returns:
+            bool: True if at least one replacement was made, False otherwise.
+        """
+        replaced = False
+        for run in paragraph.runs:
+            run_text = run.text
+            if placeholder in run_text:
+                run.text = run_text.replace(placeholder, content)
+                replaced = True
+        return replaced
+
+    @staticmethod
+    def _document_to_bytes(document: Any) -> bytes:
+        """Save a document object into memory and return the raw bytes."""
+        buffer = BytesIO()
+        document.save(buffer)
+        # getvalue() returns the full buffer regardless of the stream position.
+        return buffer.getvalue()
+
+    async def _serialize_docx(self, docx_bytes: bytes) -> DataTypeSerializer:
+        """Serialize the generated document using a data serializer.
+
+        Args:
+            docx_bytes: Raw `.docx` content to hand to the serializer.
+
+        Returns:
+            DataTypeSerializer: The serializer after ``save_data`` has completed.
+        """
+        serializer = data_serializer_factory(
+            category="prompt-memory-entries",
+            data_type="binary_path",
+            extension="docx",
+        )
+        await serializer.save_data(docx_bytes)
+        return serializer