
Commit 86cb794

enforce UUID type for ingest_id, resolve comments

1 parent 85ff2ce

5 files changed: +91, -52 lines

src/gitingest/query_parser.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -138,9 +138,9 @@ async def _parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
     host = parsed_url.netloc.lower()
     user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path)
 
-    _id = str(uuid.uuid4())
+    _id = uuid.uuid4()
     slug = f"{user_name}-{repo_name}"
-    local_path = TMP_BASE_PATH / _id / slug
+    local_path = TMP_BASE_PATH / str(_id) / slug
     url = f"https://{host}/{user_name}/{repo_name}"
 
     parsed = IngestionQuery(
@@ -329,7 +329,7 @@ def _parse_local_dir_path(path_str: str) -> IngestionQuery:
     """
     path_obj = Path(path_str).resolve()
     slug = path_obj.name if path_str == "." else path_str.strip("/")
-    return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
+    return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4())
 
 
 async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: str | None = None) -> str:
```
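
The cast back to `str(_id)` at the path join is what keeps this change compatible with `pathlib`: path segments must be `str` or `os.PathLike`, and a raw `UUID` is neither. A standalone sketch, not code from the repo (`TMP_BASE_PATH` here is a made-up stand-in for the value in `gitingest.config`):

```python
import uuid
from pathlib import Path

TMP_BASE_PATH = Path("/tmp/gitingest")  # assumption: stand-in for gitingest.config

_id = uuid.uuid4()

# Path.__truediv__ accepts only str or os.PathLike, so joining the raw UUID
# would raise: TypeError: argument should be a str or an os.PathLike object
# local_path = TMP_BASE_PATH / _id / "owner-repo"

local_path = TMP_BASE_PATH / str(_id) / "owner-repo"  # UUID stringified first
print(local_path)
```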

src/gitingest/schemas/ingestion.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -4,6 +4,7 @@
 
 from dataclasses import dataclass
 from pathlib import Path  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
 from pydantic import BaseModel, Field
 
@@ -63,7 +64,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
         The URL of the repository.
     slug : str
         The slug of the repository.
-    id : str
+    id : UUID
         The ID of the repository.
     subpath : str
         The subpath to the repository or file (default: ``"/"``).
@@ -84,7 +85,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     include_submodules : bool
         Whether to include all Git submodules within the repository. (default: ``False``)
     s3_url : str | None
-        The S3 URL where the digest is stored if S3 is enabled (default: ``None``).
+        The S3 URL where the digest is stored if S3 is enabled.
 
     """
 
@@ -93,7 +94,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     local_path: Path
     url: str | None = None
     slug: str
-    id: str
+    id: UUID
     subpath: str = "/"
     type: str | None = None
     branch: str | None = None
```
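
With the field typed as `UUID`, pydantic now validates and coerces the value instead of accepting any string. A minimal sketch of that behavior, assuming pydantic v2 and using a toy model in place of `IngestionQuery`:

```python
from uuid import UUID, uuid4

from pydantic import BaseModel, ValidationError


class Query(BaseModel):  # hypothetical stand-in for IngestionQuery
    id: UUID


print(Query(id=uuid4()).id)                                 # UUID instances pass through
print(Query(id="12345678-1234-5678-1234-567812345678").id)  # valid strings are coerced to UUID

try:
    Query(id="not-a-uuid")  # malformed values are rejected at construction time
except ValidationError as exc:
    print(f"rejected with {exc.error_count()} validation error")
```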

src/gitingest/utils/s3_utils.py

Lines changed: 73 additions & 40 deletions
```diff
@@ -5,8 +5,9 @@
 import hashlib
 import os
 from typing import Any
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
-import boto3
+from boto3 import client as boto_client
 from botocore.exceptions import ClientError
 
 
@@ -50,21 +51,32 @@ def generate_s3_file_path(
 ) -> str:
     """Generate S3 file path with proper naming convention.
 
-    Format: /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt
-    The commit-ID is always included in the URL. If no specific commit is provided,
-    the actual commit hash from the cloned repository is used.
-
-    Args:
-        source: Git host (github, gitlab, etc.)
-        user_name: Repository owner/user
-        repo_name: Repository name
-        branch: Branch name (if available)
-        commit: Commit hash (should always be available now)
-        include_patterns: Include patterns set
-        ignore_patterns: Ignore patterns set
-
-    Returns:
-        S3 file path string
+    The file path is formatted as:
+    /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt
+    The commit-ID is always included in the URL.
+    If no specific commit is provided, the actual commit hash from the cloned repository is used.
+
+    Parameters
+    ----------
+    source : str
+        Git host (e.g., github, gitlab, bitbucket, etc.).
+    user_name : str
+        Repository owner or user.
+    repo_name : str
+        Repository name.
+    branch : str | None
+        Branch name (if available).
+    commit : str | None
+        Commit hash (should always be available now).
+    include_patterns : set[str] | None
+        Set of patterns specifying which files to include.
+    ignore_patterns : set[str]
+        Set of patterns specifying which files to exclude.
+
+    Returns
+    -------
+    str
+        S3 file path string.
 
     """
     # Extract source from URL or default to "unknown"
@@ -89,29 +101,41 @@ def generate_s3_file_path(
     # Commit should always be available now, but provide fallback just in case
     commit_id = commit or "HEAD"
 
-    # Format: /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<hash>.txt
     return f"ingest/{git_source}/{user_name}/{repo_name}/{branch_name}/{commit_id}/{patterns_hash}.txt"
 
 
-def create_s3_client() -> boto3.client:
+def create_s3_client() -> boto_client:  # type: ignore[name-defined]
     """Create and return an S3 client with configuration from environment."""
     config = get_s3_config()
-    return boto3.client("s3", **config)
+    return boto_client("s3", **config)
 
 
-def upload_to_s3(content: str, s3_file_path: str, ingest_id: str) -> str:
+def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
     """Upload content to S3 and return the public URL.
 
-    Args:
-        content: The digest content to upload
-        s3_file_path: The S3 file path
-        ingest_id: The ingest ID to store as S3 object tag
-
-    Returns:
-        Public URL to access the uploaded file
-
-    Raises:
-        Exception: If upload fails
+    This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file.
+    The ingest ID is stored as an S3 object tag.
+
+    Parameters
+    ----------
+    content : str
+        The digest content to upload.
+    s3_file_path : str
+        The S3 file path where the content will be stored.
+    ingest_id : UUID
+        The ingest ID to store as an S3 object tag.
+
+    Returns
+    -------
+    str
+        Public URL to access the uploaded file.
+
+    Raises
+    ------
+    ValueError
+        If S3 is not enabled.
+    S3UploadError
+        If the upload to S3 fails.
 
     """
     if not is_s3_enabled():
@@ -128,7 +152,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
         Key=s3_file_path,
         Body=content.encode("utf-8"),
         ContentType="text/plain",
-        Tagging=f"ingest_id={ingest_id}",
+        Tagging=f"ingest_id={ingest_id!s}",
     )
 
     # Generate public URL
```
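
The `!s` conversion in the `Tagging` value forces an explicit `str()` call inside the f-string. For a `UUID` the rendered tag is identical either way, since `format(uuid)` falls back to `str(uuid)`; the conversion just spells out the intent now that `ingest_id` is no longer a string. A tiny standalone sketch:

```python
from uuid import UUID

ingest_id = UUID("12345678-1234-5678-1234-567812345678")

# Both lines render the same tag string; !s makes the str() conversion explicit.
print(f"ingest_id={ingest_id}")
print(f"ingest_id={ingest_id!s}")
```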
```diff
@@ -160,27 +184,36 @@ def _build_s3_url(key: str) -> str:
     return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{key}"
 
 
-def _check_object_tags(s3_client: boto3.client, bucket_name: str, key: str, target_ingest_id: str) -> bool:
+def _check_object_tags(
+    s3_client: boto_client,  # type: ignore[name-defined]
+    bucket_name: str,
+    key: str,
+    target_ingest_id: UUID,
+) -> bool:
     """Check if an S3 object has the matching ingest_id tag."""
     try:
         tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
         tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])}
-        return tags.get("ingest_id") == target_ingest_id
+        return tags.get("ingest_id") == str(target_ingest_id)
     except ClientError:
         return False
 
 
-def get_s3_url_for_ingest_id(ingest_id: str) -> str | None:
+def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
     """Get S3 URL for a given ingest ID if it exists.
 
-    This is used by the download endpoint to redirect to S3 if available.
-    Searches for files using S3 object tags to find the matching ingest_id.
+    Search for files in S3 using object tags to find the matching ingest_id and returns the S3 URL if found.
+    Used by the download endpoint to redirect to S3 if available.
 
-    Args:
-        ingest_id: The ingest ID
+    Parameters
+    ----------
+    ingest_id : UUID
+        The ingest ID to search for in S3 object tags.
 
-    Returns:
-        S3 URL if file exists, None otherwise
+    Returns
+    -------
+    str | None
+        S3 URL if file exists, None otherwise.
 
     """
     if not is_s3_enabled():
```
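
The `str(target_ingest_id)` on the comparison side matters because S3 tag values always come back as plain strings, and a `UUID` never compares equal to its own string form. A standalone sketch, no S3 involved:

```python
from uuid import UUID

tag_value = "12345678-1234-5678-1234-567812345678"  # what a TagSet value looks like
target_ingest_id = UUID(tag_value)

print(tag_value == target_ingest_id)       # False: UUID.__eq__ rejects non-UUID operands
print(tag_value == str(target_ingest_id))  # True: string compared to string
```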

src/server/query_processor.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -150,7 +150,7 @@ async def process_query(
         repo_url=input_text,
         short_repo_url=short_repo_url,
         summary=summary,
-        ingest_id=query.id,
+        ingest_id=str(query.id),
         tree=tree,
         content=content,
         default_max_file_size=slider_position,
```

src/server/routers/ingest.py

Lines changed: 10 additions & 5 deletions
```diff
@@ -1,18 +1,23 @@
 """Ingest endpoint for the API."""
 
-from typing import Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
 from fastapi import APIRouter, HTTPException, Request, status
 from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
 from prometheus_client import Counter
 
 from gitingest.config import TMP_BASE_PATH
 from gitingest.utils.s3_utils import get_s3_url_for_ingest_id, is_s3_enabled
-from server.models import IngestRequest
 from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import limiter
 
+if TYPE_CHECKING:
+    from server.models import IngestRequest
+
 ingest_counter = Counter("gitingest_ingest_total", "Number of ingests", ["status", "url"])
 
 router = APIRouter()
```
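
The import shuffle here is the standard postponed-annotations pattern: `from __future__ import annotations` keeps every annotation a string at runtime (which also allows the `X | Y` union syntax used below on Pythons older than 3.10), so `IngestRequest` can move behind `TYPE_CHECKING` and never be imported when the module runs. `UUID` stays a real runtime import (hence the `noqa: TC003`) because FastAPI has to resolve it to validate the path parameter. A generic sketch of the pattern, using `Decimal` as a placeholder type:

```python
from __future__ import annotations  # annotations become strings, evaluated lazily

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Visible to type checkers only; never imported at runtime.
    from decimal import Decimal


def double(value: Decimal) -> Decimal:  # fine: the annotation is never evaluated
    return value * 2


print(double(21))  # runs without decimal ever being imported
```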
```diff
@@ -94,7 +99,7 @@ async def api_ingest_get(
 
 
 @router.get("/api/download/file/{ingest_id}", response_model=None)
-async def download_ingest(ingest_id: str) -> Union[RedirectResponse, FileResponse]:  # noqa: FA100
+async def download_ingest(ingest_id: UUID) -> RedirectResponse | FileResponse:
     """Download the first text file produced for an ingest ID.
 
     **This endpoint retrieves the first ``*.txt`` file produced during the ingestion process**
@@ -103,7 +108,7 @@ async def download_ingest(ingest_id: str) -> Union[RedirectResponse, FileResponse]:
 
     **Parameters**
 
-    - **ingest_id** (`str`): Identifier that the ingest step emitted
+    - **ingest_id** (`UUID`): Identifier that the ingest step emitted
 
     **Returns**
 
@@ -124,7 +129,7 @@ async def download_ingest(ingest_id: str) -> Union[RedirectResponse, FileResponse]:
 
     # Fall back to local file serving
     # Normalize and validate the directory path
-    directory = (TMP_BASE_PATH / ingest_id).resolve()
+    directory = (TMP_BASE_PATH / str(ingest_id)).resolve()
     if not str(directory).startswith(str(TMP_BASE_PATH.resolve())):
         raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
```
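
Typing the path parameter as `UUID` moves validation into FastAPI itself: malformed IDs are rejected with a 422 before the handler runs, and the handler receives a real `uuid.UUID` object. A self-contained sketch of that behavior (toy app, not the project's router; the test client needs `httpx` installed):

```python
from uuid import UUID

from fastapi import FastAPI
from fastapi.testclient import TestClient

app = FastAPI()


@app.get("/download/{ingest_id}")
async def download(ingest_id: UUID) -> dict:
    return {"ingest_id": str(ingest_id)}  # ingest_id arrives as a uuid.UUID


client = TestClient(app)
print(client.get("/download/12345678-1234-5678-1234-567812345678").status_code)  # 200
print(client.get("/download/not-a-uuid").status_code)                            # 422
```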
