import hashlib
import os
from typing import Any
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)

-import boto3
+from boto3 import client as boto_client
from botocore.exceptions import ClientError


@@ -50,21 +51,32 @@ def generate_s3_file_path(
) -> str:
    """Generate S3 file path with proper naming convention.

-    Format: /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt
-    The commit-ID is always included in the URL. If no specific commit is provided,
-    the actual commit hash from the cloned repository is used.
-
-    Args:
-        source: Git host (github, gitlab, etc.)
-        user_name: Repository owner/user
-        repo_name: Repository name
-        branch: Branch name (if available)
-        commit: Commit hash (should always be available now)
-        include_patterns: Include patterns set
-        ignore_patterns: Ignore patterns set
-
-    Returns:
-        S3 file path string
+    The file path is formatted as:
+    /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt
+    The commit-ID is always included in the URL.
+    If no specific commit is provided, the actual commit hash from the cloned repository is used.
+
+    Parameters
+    ----------
+    source : str
+        Git host (e.g., github, gitlab, bitbucket).
+    user_name : str
+        Repository owner or user.
+    repo_name : str
+        Repository name.
+    branch : str | None
+        Branch name (if available).
+    commit : str | None
+        Commit hash (should always be available now).
+    include_patterns : set[str] | None
+        Set of patterns specifying which files to include.
+    ignore_patterns : set[str]
+        Set of patterns specifying which files to exclude.
+
+    Returns
+    -------
+    str
+        S3 file path string.

    """
    # Extract source from URL or default to "unknown"
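For reference, a minimal sketch of how the documented path format comes together. The `patterns_hash` computation is not shown in the hunks here, so the sorted-and-hashed pattern key below is an assumption, and the repository values are made up for illustration.

```python
# Hypothetical sketch, not part of this diff: the patterns_hash computation
# shown here is an assumption (the real one is outside the visible hunks).
import hashlib

include_patterns = {"*.py"}
ignore_patterns = {"*.lock", "node_modules"}

# Assumption: hash both pattern sets in a stable (sorted) order.
patterns_key = ",".join(sorted(include_patterns)) + "|" + ",".join(sorted(ignore_patterns))
patterns_hash = hashlib.sha256(patterns_key.encode("utf-8")).hexdigest()[:16]

# /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<hash>.txt
print(f"ingest/github/octocat/hello-world/main/abc123def456/{patterns_hash}.txt")
```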
@@ -89,29 +101,41 @@ def generate_s3_file_path(
    # Commit should always be available now, but provide fallback just in case
    commit_id = commit or "HEAD"

-    # Format: /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<hash>.txt
    return f"ingest/{git_source}/{user_name}/{repo_name}/{branch_name}/{commit_id}/{patterns_hash}.txt"


-def create_s3_client() -> boto3.client:
+def create_s3_client() -> boto_client:  # type: ignore[name-defined]
    """Create and return an S3 client with configuration from environment."""
    config = get_s3_config()
-    return boto3.client("s3", **config)
+    return boto_client("s3", **config)


-def upload_to_s3(content: str, s3_file_path: str, ingest_id: str) -> str:
+def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
    """Upload content to S3 and return the public URL.

-    Args:
-        content: The digest content to upload
-        s3_file_path: The S3 file path
-        ingest_id: The ingest ID to store as S3 object tag
-
-    Returns:
-        Public URL to access the uploaded file
-
-    Raises:
-        Exception: If upload fails
+    This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file.
+    The ingest ID is stored as an S3 object tag.
+
+    Parameters
+    ----------
+    content : str
+        The digest content to upload.
+    s3_file_path : str
+        The S3 file path where the content will be stored.
+    ingest_id : UUID
+        The ingest ID to store as an S3 object tag.
+
+    Returns
+    -------
+    str
+        Public URL to access the uploaded file.
+
+    Raises
+    ------
+    ValueError
+        If S3 is not enabled.
+    S3UploadError
+        If the upload to S3 fails.

    """
    if not is_s3_enabled():
@@ -128,7 +152,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: str) -> str:
        Key=s3_file_path,
        Body=content.encode("utf-8"),
        ContentType="text/plain",
-        Tagging=f"ingest_id={ingest_id}",
+        Tagging=f"ingest_id={ingest_id!s}",
    )

    # Generate public URL
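For reviewers, a rough usage sketch of the new UUID-typed signature; the keyword arguments follow the documented parameters, and the repository values are invented for illustration.

```python
# Hypothetical usage sketch, not part of this diff. Values are illustrative.
from uuid import uuid4

ingest_id = uuid4()
s3_file_path = generate_s3_file_path(
    source="github",
    user_name="octocat",
    repo_name="hello-world",
    branch="main",
    commit="abc123def456",
    include_patterns={"*.py"},
    ignore_patterns={"*.lock"},
)
# Uploads the digest and tags the object with ingest_id=<uuid>, which
# _check_object_tags later compares against str(target_ingest_id).
url = upload_to_s3(content="digest text...", s3_file_path=s3_file_path, ingest_id=ingest_id)
```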
@@ -160,27 +184,36 @@ def _build_s3_url(key: str) -> str:
    return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{key}"


-def _check_object_tags(s3_client: boto3.client, bucket_name: str, key: str, target_ingest_id: str) -> bool:
+def _check_object_tags(
+    s3_client: boto_client,  # type: ignore[name-defined]
+    bucket_name: str,
+    key: str,
+    target_ingest_id: UUID,
+) -> bool:
    """Check if an S3 object has the matching ingest_id tag."""
    try:
        tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
        tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])}
-        return tags.get("ingest_id") == target_ingest_id
+        return tags.get("ingest_id") == str(target_ingest_id)
    except ClientError:
        return False


-def get_s3_url_for_ingest_id(ingest_id: str) -> str | None:
+def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
    """Get S3 URL for a given ingest ID if it exists.

-    This is used by the download endpoint to redirect to S3 if available.
-    Searches for files using S3 object tags to find the matching ingest_id.
+    Search for files in S3 using object tags to find the matching ingest_id and return the S3 URL if found.
+    Used by the download endpoint to redirect to S3 if available.

-    Args:
-        ingest_id: The ingest ID
+    Parameters
+    ----------
+    ingest_id : UUID
+        The ingest ID to search for in S3 object tags.

-    Returns:
-        S3 URL if file exists, None otherwise
+    Returns
+    -------
+    str | None
+        S3 URL if file exists, None otherwise.

    """
    if not is_s3_enabled():
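The body of get_s3_url_for_ingest_id is truncated above. As a sketch of how a tag-based lookup like this typically works with boto3 (an assumption, not this PR's actual implementation), a helper could page through the bucket and reuse _check_object_tags:

```python
# Hypothetical sketch, not part of this diff: one way a tag-based lookup
# could iterate the bucket. The PR's actual listing logic is truncated above.
from uuid import UUID


def _find_key_by_ingest_id(s3_client, bucket_name: str, target_ingest_id: UUID) -> str | None:
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix="ingest/"):
        for obj in page.get("Contents", []):
            # Reuse the helper added in this PR to compare the ingest_id tag.
            if _check_object_tags(s3_client, bucket_name, obj["Key"], target_ingest_id):
                return obj["Key"]
    return None
```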