Skip to content

Commit 6e02870

Browse files
refactor(llm): simplify image processing with single-image transformation
- Replace image_preprocessor with image_transform_fn that works on single images
- Add process_images() generic infrastructure in document_extraction_pipeline
- Simplify extract_receipts_pipeline to only scale_receipt_image function
- Remove nested functions and wrapper layers
- Add image_output_dir parameter (separate from transform function)
- Update README with simpler examples showing direct function references

Benefits:
- No nested functions or imports inside functions
- Clear separation: generic I/O vs specific transformation
- Easy to customize: just pass lambda img: img.rotate(90)
- Simpler mental model: one function transforms one image
1 parent 467f572 commit 6e02870

File tree

3 files changed

+112
-69
lines changed

3 files changed

+112
-69
lines changed

llm/smart_data_extraction_llamaindex/README.md

Lines changed: 40 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,9 @@ class Invoice(BaseModel):
5656
total_amount: float = Field(description="Total amount")
5757

5858

59-
# 2. Optional: Define transformations
59+
# 2. Optional: Define data transformer
6060
def transform_invoice_data(df: pd.DataFrame) -> pd.DataFrame:
61+
"""Transform extracted invoice data."""
6162
df = df.copy()
6263
df["vendor_name"] = df["vendor_name"].str.upper()
6364
df["total_amount"] = pd.to_numeric(df["total_amount"], errors="coerce")
@@ -82,7 +83,7 @@ if __name__ == "__main__":
8283
output_cls=Invoice,
8384
prompt=INVOICE_PROMPT,
8485
id_column="invoice_id",
85-
transform_fn=transform_invoice_data,
86+
data_transformer=transform_invoice_data,
8687
)
8788

8889
print(result_df)
@@ -99,10 +100,9 @@ def extract_structured_data(
99100
prompt: str,
100101
id_column: str = "document_id",
101102
fields: Optional[List[str]] = None,
102-
preprocess: bool = False,
103-
output_dir: Optional[Path] = None,
104-
scale_factor: int = 3,
105-
transform_fn: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
103+
image_transform_fn: Optional[Callable[[Image.Image], Image.Image]] = None,
104+
image_output_dir: Optional[Path] = None,
105+
data_transformer: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
106106
) -> pd.DataFrame
107107
```
108108

@@ -114,10 +114,9 @@ def extract_structured_data(
114114
**Optional Parameters:**
115115
- `id_column`: Document ID column name (default: "document_id")
116116
- `fields`: Fields to extract (default: all model fields)
117-
- `preprocess`: Enable image preprocessing (default: False)
118-
- `output_dir`: Directory for preprocessed images
119-
- `scale_factor`: Image scaling factor (default: 3)
120-
- `transform_fn`: Custom transformation function
117+
- `image_transform_fn`: Optional function to transform images (takes PIL Image, returns PIL Image)
118+
- `image_output_dir`: Directory to save transformed images (required if image_transform_fn provided)
119+
- `data_transformer`: Optional function to transform the extracted DataFrame
121120

122121
**Returns:**
123122
- `pd.DataFrame`: Extracted data
@@ -142,28 +141,30 @@ result = extract_structured_data(
142141
)
143142
```
144143

145-
### With Image Preprocessing
144+
### With Image Transformation
146145

147146
```python
148147
from pathlib import Path
149-
from extract_receipts_pipeline import Receipt
148+
from PIL import Image
149+
from extract_receipts_pipeline import Receipt, scale_receipt_image
150150

151151
result = extract_structured_data(
152152
image_paths=["low_res.jpg"],
153153
output_cls=Receipt,
154154
prompt="Extract receipt: {context_str}",
155-
preprocess=True,
156-
output_dir=Path("processed_images"),
157-
scale_factor=3,
155+
image_transform_fn=scale_receipt_image, # Simple function reference
156+
image_output_dir=Path("processed_images"),
158157
)
159158
```
160159

161-
### With Custom Transformations
160+
### With Data Transformation
162161

163162
```python
164163
import pandas as pd
165164

166-
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
165+
def transform_form_data(df: pd.DataFrame) -> pd.DataFrame:
166+
"""Clean and normalize extracted form data."""
167+
df = df.copy()
167168
df["name"] = df["name"].str.title()
168169
df["email"] = df["email"].str.lower()
169170
return df
@@ -172,7 +173,7 @@ result = extract_structured_data(
172173
image_paths=["form.pdf"],
173174
output_cls=FormData,
174175
prompt="Extract: {context_str}",
175-
transform_fn=clean_data,
176+
data_transformer=transform_form_data,
176177
)
177178
```
178179

@@ -182,20 +183,36 @@ To create a new document extractor (like the receipt pipeline):
182183

183184
1. Import the generic `extract_structured_data` function from `document_extraction_pipeline`
184185
2. Define your Pydantic schema(s)
185-
3. (Optional) Create transformation function
186-
4. Define extraction prompt
187-
5. Add `__main__` block with example usage
186+
3. (Optional) Create `image_transform_fn` - a simple function that transforms one PIL Image
187+
4. (Optional) Create `data_transformer` function for data transformation
188+
5. Define extraction prompt
189+
6. Add `__main__` block with example usage
190+
191+
**Example image transformation:**
192+
```python
193+
from PIL import Image
194+
195+
def rotate_and_scale(img: Image.Image) -> Image.Image:
196+
"""Custom transformation: rotate 90 degrees and scale up."""
197+
rotated = img.rotate(90, expand=True)
198+
new_size = (rotated.width * 2, rotated.height * 2)
199+
return rotated.resize(new_size, Image.Resampling.LANCZOS)
200+
```
188201

189202
See [extract_receipts_pipeline.py](extract_receipts_pipeline.py) for a complete example.
190203

191204
## Dependencies
192205

193-
Both files include uv inline script dependencies. Required packages:
206+
### Generic Pipeline
207+
Required packages (in `document_extraction_pipeline.py`):
194208
- llama-index
195209
- llama-index-program-openai
196210
- llama-parse
197211
- python-dotenv
198212
- pandas
199-
- pillow
213+
214+
### Receipt Pipeline
215+
Additional packages (in `extract_receipts_pipeline.py`):
216+
- pillow (for image preprocessing)
200217

201218
Run with `uv run <script_name>.py` - dependencies will be automatically installed.

llm/smart_data_extraction_llamaindex/document_extraction_pipeline.py

Lines changed: 42 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -34,30 +34,39 @@ def configure_settings() -> None:
3434
Settings.context_window = 8000
3535

3636

37-
def scale_image(image_path: Path, output_dir: Path, scale_factor: int = 3) -> Path:
38-
"""Scale up an image using high-quality resampling.
37+
def process_images(
38+
image_paths: List[str],
39+
output_dir: Path,
40+
transform_image_fn: Callable[[Image.Image], Image.Image]
41+
) -> List[str]:
42+
"""Process images by applying a transformation function.
43+
44+
Generic infrastructure that loads images, applies transformation, and saves them.
3945
4046
Args:
41-
image_path: Path to the original image
42-
output_dir: Directory to save the scaled image
43-
scale_factor: Factor to scale up the image (default: 3x)
47+
image_paths: List of paths to images
48+
output_dir: Directory to save processed images
49+
transform_image_fn: Function that takes PIL Image and returns transformed PIL Image
4450
4551
Returns:
46-
Path to the scaled image
52+
List of paths to processed images
4753
"""
48-
# Load the image
49-
img = Image.open(image_path)
54+
output_dir.mkdir(parents=True, exist_ok=True)
55+
processed_paths = []
5056

51-
# Scale up the image using high-quality resampling
52-
new_size = (img.width * scale_factor, img.height * scale_factor)
53-
img_resized = img.resize(new_size, Image.Resampling.LANCZOS)
57+
for path in image_paths:
58+
# Load image
59+
img = Image.open(path)
5460

55-
# Save to output directory with same filename
56-
output_dir.mkdir(parents=True, exist_ok=True)
57-
output_path = output_dir / image_path.name
58-
img_resized.save(output_path, quality=95)
61+
# Apply transformation
62+
img_transformed = transform_image_fn(img)
63+
64+
# Save transformed image
65+
output_path = output_dir / Path(path).name
66+
img_transformed.save(output_path, quality=95)
67+
processed_paths.append(str(output_path))
5968

60-
return output_path
69+
return processed_paths
6170

6271

6372
def extract_documents(
@@ -111,15 +120,15 @@ def create_extracted_df(
111120
records: List[dict],
112121
id_column: str,
113122
fields: List[str],
114-
transform_fn: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None
123+
data_transformer: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None
115124
) -> pd.DataFrame:
116125
"""Create DataFrame from extracted records.
117126
118127
Args:
119128
records: List of extraction results with id and data
120129
id_column: Column name for document IDs
121130
fields: List of field names to extract from the Pydantic model
122-
transform_fn: Optional function to transform the DataFrame
131+
data_transformer: Optional function to transform the DataFrame
123132
124133
Returns:
125134
DataFrame with extracted fields
@@ -134,8 +143,8 @@ def create_extracted_df(
134143
]
135144
)
136145

137-
if transform_fn:
138-
df = transform_fn(df)
146+
if data_transformer:
147+
df = data_transformer(df)
139148

140149
return df
141150

@@ -146,10 +155,9 @@ def extract_structured_data(
146155
prompt: str,
147156
id_column: str = "document_id",
148157
fields: Optional[List[str]] = None,
149-
preprocess: bool = False,
150-
output_dir: Optional[Path] = None,
151-
scale_factor: int = 3,
152-
transform_fn: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
158+
image_transform_fn: Optional[Callable[[Image.Image], Image.Image]] = None,
159+
image_output_dir: Optional[Path] = None,
160+
data_transformer: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
153161
) -> pd.DataFrame:
154162
"""Extract structured data from documents using a generic pipeline.
155163
@@ -159,10 +167,9 @@ def extract_structured_data(
159167
prompt: Extraction prompt template (must include {context_str})
160168
id_column: Column name for document identifiers
161169
fields: List of field names to extract (if None, uses all model fields)
162-
preprocess: Whether to scale/preprocess images
163-
output_dir: Directory for preprocessed images
164-
scale_factor: Image scaling factor if preprocessing
165-
transform_fn: Optional transformation function for DataFrames
170+
image_transform_fn: Optional function to transform individual images (takes PIL Image, returns PIL Image)
171+
image_output_dir: Directory to save transformed images (required if image_transform_fn provided)
172+
data_transformer: Optional function to transform the extracted DataFrame
166173
167174
Returns:
168175
DataFrame with extracted data
@@ -173,22 +180,19 @@ def extract_structured_data(
173180
if fields is None:
174181
fields = list(output_cls.model_fields.keys())
175182

176-
# Preprocess images if requested
177-
if preprocess:
178-
if output_dir is None:
179-
raise ValueError("output_dir must be provided when preprocess=True")
180-
print("Preprocessing images...")
181-
paths_to_parse = [
182-
scale_image(Path(p), output_dir, scale_factor=scale_factor)
183-
for p in image_paths
184-
]
183+
# Process images if transformation function provided
184+
if image_transform_fn:
185+
if image_output_dir is None:
186+
raise ValueError("image_output_dir must be provided when image_transform_fn is specified")
187+
print("Processing images...")
188+
paths_to_parse = process_images(image_paths, image_output_dir, image_transform_fn)
185189
else:
186190
paths_to_parse = image_paths
187191

188192
# Extract documents
189193
structured_data = extract_documents(paths_to_parse, prompt, id_column, output_cls)
190194

191195
# Create extracted DataFrame
192-
extracted_df = create_extracted_df(structured_data, id_column, fields, transform_fn)
196+
extracted_df = create_extracted_df(structured_data, id_column, fields, data_transformer)
193197

194198
return extracted_df

llm/smart_data_extraction_llamaindex/extract_receipts_pipeline.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import pandas as pd
1919
from document_extraction_pipeline import extract_structured_data
20+
from PIL import Image
2021
from pydantic import BaseModel, Field
2122

2223

@@ -46,9 +47,31 @@ class Receipt(BaseModel):
4647
items: List[ReceiptItem] = Field(default_factory=list)
4748

4849

49-
# Receipt-specific transformations
50-
def transform_receipt_columns(df: pd.DataFrame) -> pd.DataFrame:
51-
"""Apply receipt-specific transformations."""
50+
# Receipt-specific image transformation
51+
def scale_receipt_image(img: Image.Image, scale_factor: int = 3) -> Image.Image:
52+
"""Scale up a receipt image for better OCR.
53+
54+
Args:
55+
img: PIL Image object
56+
scale_factor: Factor to scale up the image (default: 3x)
57+
58+
Returns:
59+
Transformed PIL Image
60+
"""
61+
new_size = (img.width * scale_factor, img.height * scale_factor)
62+
return img.resize(new_size, Image.Resampling.LANCZOS)
63+
64+
65+
# Receipt-specific data transformations
66+
def transform_receipt_data(df: pd.DataFrame) -> pd.DataFrame:
67+
"""Transform extracted receipt data (normalize text, convert types).
68+
69+
Args:
70+
df: DataFrame with extracted receipt data
71+
72+
Returns:
73+
Transformed DataFrame
74+
"""
5275
df = df.copy()
5376
df["company"] = df["company"].str.upper()
5477
df["total"] = pd.to_numeric(df["total"], errors="coerce")
@@ -78,17 +101,16 @@ def transform_receipt_columns(df: pd.DataFrame) -> pd.DataFrame:
78101
num_receipts = 10
79102
receipt_paths = sorted(receipt_dir.glob("*.jpg"))[:num_receipts]
80103

81-
# Run the pipeline
104+
# Run the pipeline - pass transformation function directly
82105
result_df = extract_structured_data(
83106
image_paths=receipt_paths,
84107
output_cls=Receipt,
85108
prompt=RECEIPT_PROMPT,
86109
id_column="receipt_id",
87110
fields=["company", "total", "purchase_date"],
88-
preprocess=True,
89-
output_dir=adjusted_receipt_dir,
90-
scale_factor=3,
91-
transform_fn=transform_receipt_columns,
111+
image_transform_fn=scale_receipt_image,
112+
image_output_dir=adjusted_receipt_dir,
113+
data_transformer=transform_receipt_data,
92114
)
93115

94116
print("\nExtraction complete!")

0 commit comments

Comments (0)