@@ -34,30 +34,39 @@ def configure_settings() -> None:
3434 Settings .context_window = 8000
3535
3636
def process_images(
    image_paths: List[str],
    output_dir: Path,
    transform_image_fn: Callable[[Image.Image], Image.Image],
) -> List[str]:
    """Apply a transformation to each image and save the results.

    Generic infrastructure that loads each image, applies
    ``transform_image_fn`` to it, and writes the result into ``output_dir``
    under the input's file name.

    Args:
        image_paths: Paths of the images to process.
        output_dir: Directory to save processed images; created (including
            parents) if it does not exist.
        transform_image_fn: Function that takes a PIL Image and returns the
            transformed PIL Image.

    Returns:
        Paths of the processed images as strings, in input order. Empty
        when ``image_paths`` is empty.

    Note:
        Output files are named after the input's base name only, so two
        inputs that share a file name overwrite each other in
        ``output_dir``.
    """
    # Accept a plain string as well as a Path for the output directory.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    processed_paths: List[str] = []

    for path in image_paths:
        output_path = output_dir / Path(path).name

        # Image.open keeps the source file open until the pixel data has
        # been read. The context manager releases the handle
        # deterministically instead of leaking one per image. Saving happens
        # inside the block so a transform that returns the same image object
        # can still read from the file.
        with Image.open(path) as img:
            img_transformed = transform_image_fn(img)
            img_transformed.save(output_path, quality=95)

        processed_paths.append(str(output_path))

    return processed_paths
6170
6271
6372def extract_documents (
@@ -111,15 +120,15 @@ def create_extracted_df(
111120 records : List [dict ],
112121 id_column : str ,
113122 fields : List [str ],
114- transform_fn : Optional [Callable [[pd .DataFrame ], pd .DataFrame ]] = None
123+ data_transformer : Optional [Callable [[pd .DataFrame ], pd .DataFrame ]] = None
115124) -> pd .DataFrame :
116125 """Create DataFrame from extracted records.
117126
118127 Args:
119128 records: List of extraction results with id and data
120129 id_column: Column name for document IDs
121130 fields: List of field names to extract from the Pydantic model
122- transform_fn : Optional function to transform the DataFrame
131+ data_transformer : Optional function to transform the DataFrame
123132
124133 Returns:
125134 DataFrame with extracted fields
@@ -134,8 +143,8 @@ def create_extracted_df(
134143 ]
135144 )
136145
137- if transform_fn :
138- df = transform_fn (df )
146+ if data_transformer :
147+ df = data_transformer (df )
139148
140149 return df
141150
@@ -146,10 +155,9 @@ def extract_structured_data(
146155 prompt : str ,
147156 id_column : str = "document_id" ,
148157 fields : Optional [List [str ]] = None ,
149- preprocess : bool = False ,
150- output_dir : Optional [Path ] = None ,
151- scale_factor : int = 3 ,
152- transform_fn : Optional [Callable [[pd .DataFrame ], pd .DataFrame ]] = None ,
158+ image_transform_fn : Optional [Callable [[Image .Image ], Image .Image ]] = None ,
159+ image_output_dir : Optional [Path ] = None ,
160+ data_transformer : Optional [Callable [[pd .DataFrame ], pd .DataFrame ]] = None ,
153161) -> pd .DataFrame :
154162 """Extract structured data from documents using a generic pipeline.
155163
@@ -159,10 +167,9 @@ def extract_structured_data(
159167 prompt: Extraction prompt template (must include {context_str})
160168 id_column: Column name for document identifiers
161169 fields: List of field names to extract (if None, uses all model fields)
162- preprocess: Whether to scale/preprocess images
163- output_dir: Directory for preprocessed images
164- scale_factor: Image scaling factor if preprocessing
165- transform_fn: Optional transformation function for DataFrames
170+ image_transform_fn: Optional function to transform individual images (takes PIL Image, returns PIL Image)
171+ image_output_dir: Directory to save transformed images (required if image_transform_fn provided)
172+ data_transformer: Optional function to transform the extracted DataFrame
166173
167174 Returns:
168175 DataFrame with extracted data
@@ -173,22 +180,19 @@ def extract_structured_data(
173180 if fields is None :
174181 fields = list (output_cls .model_fields .keys ())
175182
176- # Preprocess images if requested
177- if preprocess :
178- if output_dir is None :
179- raise ValueError ("output_dir must be provided when preprocess=True" )
180- print ("Preprocessing images..." )
181- paths_to_parse = [
182- scale_image (Path (p ), output_dir , scale_factor = scale_factor )
183- for p in image_paths
184- ]
183+ # Process images if transformation function provided
184+ if image_transform_fn :
185+ if image_output_dir is None :
186+ raise ValueError ("image_output_dir must be provided when image_transform_fn is specified" )
187+ print ("Processing images..." )
188+ paths_to_parse = process_images (image_paths , image_output_dir , image_transform_fn )
185189 else :
186190 paths_to_parse = image_paths
187191
188192 # Extract documents
189193 structured_data = extract_documents (paths_to_parse , prompt , id_column , output_cls )
190194
191195 # Create extracted DataFrame
192- extracted_df = create_extracted_df (structured_data , id_column , fields , transform_fn )
196+ extracted_df = create_extracted_df (structured_data , id_column , fields , data_transformer )
193197
194198 return extracted_df
0 commit comments