
Commit 86cb794

enforce UUID type for ingest_id, resolve comments

1 parent 85ff2ce

5 files changed: +91, -52 lines

src/gitingest/query_parser.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -138,9 +138,9 @@ async def _parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
     host = parsed_url.netloc.lower()
     user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path)
 
-    _id = str(uuid.uuid4())
+    _id = uuid.uuid4()
     slug = f"{user_name}-{repo_name}"
-    local_path = TMP_BASE_PATH / _id / slug
+    local_path = TMP_BASE_PATH / str(_id) / slug
     url = f"https://{host}/{user_name}/{repo_name}"
 
     parsed = IngestionQuery(
@@ -329,7 +329,7 @@ def _parse_local_dir_path(path_str: str) -> IngestionQuery:
     """
     path_obj = Path(path_str).resolve()
     slug = path_obj.name if path_str == "." else path_str.strip("/")
-    return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
+    return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4())
 
 
 async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: str | None = None) -> str:
```
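
The cast back to `str(_id)` at the path join is what keeps this change compatible with `pathlib`: path segments must be `str` or `os.PathLike`, and a raw `UUID` is neither. A standalone sketch, not code from the repo (`TMP_BASE_PATH` here is a made-up stand-in for the value in `gitingest.config`):

```python
import uuid
from pathlib import Path

TMP_BASE_PATH = Path("/tmp/gitingest")  # assumption: stand-in for gitingest.config

_id = uuid.uuid4()

# Path.__truediv__ accepts only str or os.PathLike, so joining the raw UUID
# would raise: TypeError: argument should be a str or an os.PathLike object
# local_path = TMP_BASE_PATH / _id / "owner-repo"

local_path = TMP_BASE_PATH / str(_id) / "owner-repo"  # UUID stringified first
print(local_path)
```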

src/gitingest/schemas/ingestion.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -4,6 +4,7 @@
 
 from dataclasses import dataclass
 from pathlib import Path  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
 from pydantic import BaseModel, Field
 
@@ -63,7 +64,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
         The URL of the repository.
     slug : str
         The slug of the repository.
-    id : str
+    id : UUID
         The ID of the repository.
     subpath : str
         The subpath to the repository or file (default: ``"/"``).
@@ -84,7 +85,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     include_submodules : bool
         Whether to include all Git submodules within the repository. (default: ``False``)
     s3_url : str | None
-        The S3 URL where the digest is stored if S3 is enabled (default: ``None``).
+        The S3 URL where the digest is stored if S3 is enabled.
 
     """
 
@@ -93,7 +94,7 @@ class IngestionQuery(BaseModel):  # pylint: disable=too-many-instance-attributes
     local_path: Path
     url: str | None = None
     slug: str
-    id: str
+    id: UUID
     subpath: str = "/"
     type: str | None = None
     branch: str | None = None
```
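
With the field typed as `UUID`, pydantic now validates and coerces the value instead of accepting any string. A minimal sketch of that behavior, assuming pydantic v2 and using a toy model in place of `IngestionQuery`:

```python
from uuid import UUID, uuid4

from pydantic import BaseModel, ValidationError


class Query(BaseModel):  # hypothetical stand-in for IngestionQuery
    id: UUID


print(Query(id=uuid4()).id)                                 # UUID instances pass through
print(Query(id="12345678-1234-5678-1234-567812345678").id)  # valid strings are coerced to UUID

try:
    Query(id="not-a-uuid")  # malformed values are rejected at construction time
except ValidationError as exc:
    print(f"rejected with {exc.error_count()} validation error")
```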

src/gitingest/utils/s3_utils.py

Lines changed: 73 additions & 40 deletions
```diff
@@ -5,8 +5,9 @@
 import hashlib
 import os
 from typing import Any
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
-import boto3
+from boto3 import client as boto_client
 from botocore.exceptions import ClientError
 
 
@@ -50,21 +51,32 @@ def generate_s3_file_path(
 ) -> str:
     """Generate S3 file path with proper naming convention.
 
-    Format: /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt
-    The commit-ID is always included in the URL. If no specific commit is provided,
-    the actual commit hash from the cloned repository is used.
-
-    Args:
-        source: Git host (github, gitlab, etc.)
-        user_name: Repository owner/user
-        repo_name: Repository name
-        branch: Branch name (if available)
-        commit: Commit hash (should always be available now)
-        include_patterns: Include patterns set
-        ignore_patterns: Ignore patterns set
-
-    Returns:
-        S3 file path string
+    The file path is formatted as:
+    /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<exclude&include hash>.txt
+    The commit-ID is always included in the URL.
+    If no specific commit is provided, the actual commit hash from the cloned repository is used.
+
+    Parameters
+    ----------
+    source : str
+        Git host (e.g., github, gitlab, bitbucket, etc.).
+    user_name : str
+        Repository owner or user.
+    repo_name : str
+        Repository name.
+    branch : str | None
+        Branch name (if available).
+    commit : str | None
+        Commit hash (should always be available now).
+    include_patterns : set[str] | None
+        Set of patterns specifying which files to include.
+    ignore_patterns : set[str]
+        Set of patterns specifying which files to exclude.
+
+    Returns
+    -------
+    str
+        S3 file path string.
 
     """
     # Extract source from URL or default to "unknown"
@@ -89,29 +101,41 @@ def generate_s3_file_path(
     # Commit should always be available now, but provide fallback just in case
     commit_id = commit or "HEAD"
 
-    # Format: /ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/<hash>.txt
     return f"ingest/{git_source}/{user_name}/{repo_name}/{branch_name}/{commit_id}/{patterns_hash}.txt"
 
 
-def create_s3_client() -> boto3.client:
+def create_s3_client() -> boto_client:  # type: ignore[name-defined]
     """Create and return an S3 client with configuration from environment."""
     config = get_s3_config()
-    return boto3.client("s3", **config)
+    return boto_client("s3", **config)
 
 
-def upload_to_s3(content: str, s3_file_path: str, ingest_id: str) -> str:
+def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
     """Upload content to S3 and return the public URL.
 
-    Args:
-        content: The digest content to upload
-        s3_file_path: The S3 file path
-        ingest_id: The ingest ID to store as S3 object tag
-
-    Returns:
-        Public URL to access the uploaded file
-
-    Raises:
-        Exception: If upload fails
+    This function uploads the provided content to an S3 bucket and returns the public URL for the uploaded file.
+    The ingest ID is stored as an S3 object tag.
+
+    Parameters
+    ----------
+    content : str
+        The digest content to upload.
+    s3_file_path : str
+        The S3 file path where the content will be stored.
+    ingest_id : UUID
+        The ingest ID to store as an S3 object tag.
+
+    Returns
+    -------
+    str
+        Public URL to access the uploaded file.
+
+    Raises
+    ------
+    ValueError
+        If S3 is not enabled.
+    S3UploadError
+        If the upload to S3 fails.
 
     """
     if not is_s3_enabled():
@@ -128,7 +152,7 @@ def upload_to_s3(content: str, s3_file_path: str, ingest_id: UUID) -> str:
         Key=s3_file_path,
         Body=content.encode("utf-8"),
         ContentType="text/plain",
-        Tagging=f"ingest_id={ingest_id}",
+        Tagging=f"ingest_id={ingest_id!s}",
     )
 
     # Generate public URL
```
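
The `!s` conversion in the `Tagging` value forces an explicit `str()` call inside the f-string. For a `UUID` the rendered tag is identical either way, since `format(uuid)` falls back to `str(uuid)`; the conversion just spells out the intent now that `ingest_id` is no longer a string. A tiny standalone sketch:

```python
from uuid import UUID

ingest_id = UUID("12345678-1234-5678-1234-567812345678")

# Both lines render the same tag string; !s makes the str() conversion explicit.
print(f"ingest_id={ingest_id}")
print(f"ingest_id={ingest_id!s}")
```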
```diff
@@ -160,27 +184,36 @@ def _build_s3_url(key: str) -> str:
     return f"https://{bucket_name}.s3.{get_s3_config()['region_name']}.amazonaws.com/{key}"
 
 
-def _check_object_tags(s3_client: boto3.client, bucket_name: str, key: str, target_ingest_id: str) -> bool:
+def _check_object_tags(
+    s3_client: boto_client,  # type: ignore[name-defined]
+    bucket_name: str,
+    key: str,
+    target_ingest_id: UUID,
+) -> bool:
     """Check if an S3 object has the matching ingest_id tag."""
     try:
         tags_response = s3_client.get_object_tagging(Bucket=bucket_name, Key=key)
         tags = {tag["Key"]: tag["Value"] for tag in tags_response.get("TagSet", [])}
-        return tags.get("ingest_id") == target_ingest_id
+        return tags.get("ingest_id") == str(target_ingest_id)
     except ClientError:
         return False
 
 
-def get_s3_url_for_ingest_id(ingest_id: str) -> str | None:
+def get_s3_url_for_ingest_id(ingest_id: UUID) -> str | None:
     """Get S3 URL for a given ingest ID if it exists.
 
-    This is used by the download endpoint to redirect to S3 if available.
-    Searches for files using S3 object tags to find the matching ingest_id.
+    Search for files in S3 using object tags to find the matching ingest_id and returns the S3 URL if found.
+    Used by the download endpoint to redirect to S3 if available.
 
-    Args:
-        ingest_id: The ingest ID
+    Parameters
+    ----------
+    ingest_id : UUID
+        The ingest ID to search for in S3 object tags.
 
-    Returns:
-        S3 URL if file exists, None otherwise
+    Returns
+    -------
+    str | None
+        S3 URL if file exists, None otherwise.
 
     """
     if not is_s3_enabled():
```
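
The `str(target_ingest_id)` on the comparison side matters because S3 tag values always come back as plain strings, and a `UUID` never compares equal to its own string form. A standalone sketch, no S3 involved:

```python
from uuid import UUID

tag_value = "12345678-1234-5678-1234-567812345678"  # what a TagSet value looks like
target_ingest_id = UUID(tag_value)

print(tag_value == target_ingest_id)       # False: UUID.__eq__ rejects non-UUID operands
print(tag_value == str(target_ingest_id))  # True: string compared to string
```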

src/server/query_processor.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -150,7 +150,7 @@ async def process_query(
         repo_url=input_text,
         short_repo_url=short_repo_url,
         summary=summary,
-        ingest_id=query.id,
+        ingest_id=str(query.id),
         tree=tree,
         content=content,
         default_max_file_size=slider_position,
```

src/server/routers/ingest.py

Lines changed: 10 additions & 5 deletions
```diff
@@ -1,18 +1,23 @@
 """Ingest endpoint for the API."""
 
-from typing import Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from uuid import UUID  # noqa: TC003 (typing-only-standard-library-import) needed for type checking (pydantic)
 
 from fastapi import APIRouter, HTTPException, Request, status
 from fastapi.responses import FileResponse, JSONResponse, RedirectResponse
 from prometheus_client import Counter
 
 from gitingest.config import TMP_BASE_PATH
 from gitingest.utils.s3_utils import get_s3_url_for_ingest_id, is_s3_enabled
-from server.models import IngestRequest
 from server.routers_utils import COMMON_INGEST_RESPONSES, _perform_ingestion
 from server.server_config import MAX_DISPLAY_SIZE
 from server.server_utils import limiter
 
+if TYPE_CHECKING:
+    from server.models import IngestRequest
+
 ingest_counter = Counter("gitingest_ingest_total", "Number of ingests", ["status", "url"])
 
 router = APIRouter()
```
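
The import shuffle here is the standard postponed-annotations pattern: `from __future__ import annotations` keeps every annotation a string at runtime (which also allows the `X | Y` union syntax used below on Pythons older than 3.10), so `IngestRequest` can move behind `TYPE_CHECKING` and never be imported when the module runs. `UUID` stays a real runtime import (hence the `noqa: TC003`) because FastAPI has to resolve it to validate the path parameter. A generic sketch of the pattern, using `Decimal` as a placeholder type:

```python
from __future__ import annotations  # annotations become strings, evaluated lazily

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Visible to type checkers only; never imported at runtime.
    from decimal import Decimal


def double(value: Decimal) -> Decimal:  # fine: the annotation is never evaluated
    return value * 2


print(double(21))  # runs without decimal ever being imported
```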
```diff
@@ -94,7 +99,7 @@ async def api_ingest_get(
 
 
 @router.get("/api/download/file/{ingest_id}", response_model=None)
-async def download_ingest(ingest_id: str) -> Union[RedirectResponse, FileResponse]:  # noqa: FA100
+async def download_ingest(ingest_id: UUID) -> RedirectResponse | FileResponse:
     """Download the first text file produced for an ingest ID.
 
     **This endpoint retrieves the first ``*.txt`` file produced during the ingestion process**
@@ -103,7 +108,7 @@ async def download_ingest(ingest_id: str) -> Union[RedirectResponse, FileResponse]:
 
     **Parameters**
 
-    - **ingest_id** (`str`): Identifier that the ingest step emitted
+    - **ingest_id** (`UUID`): Identifier that the ingest step emitted
 
     **Returns**
 
@@ -124,7 +129,7 @@ async def download_ingest(ingest_id: str) -> Union[RedirectResponse, FileResponse]:
 
     # Fall back to local file serving
     # Normalize and validate the directory path
-    directory = (TMP_BASE_PATH / ingest_id).resolve()
+    directory = (TMP_BASE_PATH / str(ingest_id)).resolve()
     if not str(directory).startswith(str(TMP_BASE_PATH.resolve())):
         raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail=f"Invalid ingest ID: {ingest_id!r}")
```
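
Typing the path parameter as `UUID` moves validation into FastAPI itself: malformed IDs are rejected with a 422 before the handler runs, and the handler receives a real `uuid.UUID` object. A self-contained sketch of that behavior (toy app, not the project's router; the test client needs `httpx` installed):

```python
from uuid import UUID

from fastapi import FastAPI
from fastapi.testclient import TestClient

app = FastAPI()


@app.get("/download/{ingest_id}")
async def download(ingest_id: UUID) -> dict:
    return {"ingest_id": str(ingest_id)}  # ingest_id arrives as a uuid.UUID


client = TestClient(app)
print(client.get("/download/12345678-1234-5678-1234-567812345678").status_code)  # 200
print(client.get("/download/not-a-uuid").status_code)                            # 422
```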
