Skip to content

Commit 19c910e

Browse files
fix
1 parent 2b82344 commit 19c910e

18 files changed

+403
-435
lines changed

src/gitingest/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,5 @@
33
from gitingest.clone import clone_repo
44
from gitingest.entrypoint import ingest, ingest_async
55
from gitingest.ingestion import ingest_query
6-
from gitingest.query_parser import parse_query
76

8-
__all__ = ["clone_repo", "ingest", "ingest_async", "ingest_query", "parse_query"]
7+
__all__ = ["clone_repo", "ingest", "ingest_async", "ingest_query"]

src/gitingest/clone.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
5656
msg = "Repository not found. Make sure it is public or that you have provided a valid token."
5757
raise ValueError(msg)
5858

59-
commit = await resolve_commit(config, url=url, token=token)
59+
commit = await resolve_commit(config, token=token)
6060

6161
clone_cmd = ["git"]
6262
if token and is_github_host(url):
@@ -73,7 +73,7 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
7373

7474
# Checkout the subpath if it is a partial clone
7575
if partial_clone:
76-
await checkout_partial_clone(config, token)
76+
await checkout_partial_clone(config, token=token)
7777

7878
git = create_git_command(["git"], local_path, url, token)
7979

src/gitingest/entrypoint.py

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,21 @@
88
import warnings
99
from contextlib import asynccontextmanager
1010
from pathlib import Path
11-
from typing import AsyncGenerator
11+
from typing import TYPE_CHECKING, AsyncGenerator
12+
from urllib.parse import urlparse
1213

1314
from gitingest.clone import clone_repo
1415
from gitingest.config import MAX_FILE_SIZE
1516
from gitingest.ingestion import ingest_query
16-
from gitingest.query_parser import IngestionQuery, parse_query
17+
from gitingest.query_parser import parse_local_dir_path, parse_remote_repo
1718
from gitingest.utils.auth import resolve_token
19+
from gitingest.utils.compat_func import removesuffix
1820
from gitingest.utils.ignore_patterns import load_ignore_patterns
21+
from gitingest.utils.pattern_utils import process_patterns
22+
from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS
23+
24+
if TYPE_CHECKING:
25+
from gitingest.schemas import IngestionQuery
1926

2027

2128
async def ingest_async(
@@ -74,23 +81,28 @@ async def ingest_async(
7481
"""
7582
token = resolve_token(token)
7683

77-
query: IngestionQuery = await parse_query(
78-
source=source,
79-
max_file_size=max_file_size,
80-
from_web=False,
84+
source = removesuffix(source.strip(), ".git")
85+
86+
# Determine the parsing method based on the source type
87+
if urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
88+
# We either have a full URL or a domain-less slug
89+
query = await parse_remote_repo(source, token=token)
90+
query.include_submodules = include_submodules
91+
_override_branch_and_tag(query, branch=branch, tag=tag)
92+
93+
else:
94+
# Local path scenario
95+
query = parse_local_dir_path(source)
96+
97+
query.max_file_size = max_file_size
98+
query.ignore_patterns, query.include_patterns = process_patterns(
99+
exclude_patterns=exclude_patterns,
81100
include_patterns=include_patterns,
82-
ignore_patterns=exclude_patterns,
83-
token=token,
84101
)
85102

86103
if not include_gitignored:
87104
_apply_gitignores(query)
88105

89-
if query.url:
90-
_override_branch_and_tag(query, branch=branch, tag=tag)
91-
92-
query.include_submodules = include_submodules
93-
94106
async with _clone_repo_if_remote(query, token=token):
95107
summary, tree, content = ingest_query(query)
96108
await _write_output(tree, content=content, target=output)

src/gitingest/ingestion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
1212

1313
if TYPE_CHECKING:
14-
from gitingest.query_parser import IngestionQuery
14+
from gitingest.schemas import IngestionQuery
1515

1616

1717
def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:

src/gitingest/output_formatter.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from gitingest.utils.compat_func import readlink
1111

1212
if TYPE_CHECKING:
13-
from gitingest.query_parser import IngestionQuery
13+
from gitingest.schemas import IngestionQuery
1414

1515
_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
1616
(1_000_000, "M"),
@@ -84,6 +84,8 @@ def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False)
8484

8585
if query.commit:
8686
parts.append(f"Commit: {query.commit}")
87+
elif query.tag:
88+
parts.append(f"Tag: {query.tag}")
8789
elif query.branch and query.branch not in ("main", "master"):
8890
parts.append(f"Branch: {query.branch}")
8991

src/gitingest/query_parser.py

Lines changed: 34 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -2,101 +2,24 @@
22

33
from __future__ import annotations
44

5-
import re
65
import uuid
76
import warnings
87
from pathlib import Path
98
from urllib.parse import unquote, urlparse
109

1110
from gitingest.config import TMP_BASE_PATH
1211
from gitingest.schemas import IngestionQuery
13-
from gitingest.utils.exceptions import InvalidPatternError
1412
from gitingest.utils.git_utils import check_repo_exists, fetch_remote_branches_or_tags
15-
from gitingest.utils.ignore_patterns import DEFAULT_IGNORE_PATTERNS
1613
from gitingest.utils.query_parser_utils import (
1714
KNOWN_GIT_HOSTS,
1815
_get_user_and_repo_from_path,
1916
_is_valid_git_commit_hash,
20-
_is_valid_pattern,
2117
_validate_host,
2218
_validate_url_scheme,
2319
)
2420

2521

26-
async def parse_query(
27-
source: str,
28-
*,
29-
max_file_size: int,
30-
from_web: bool,
31-
include_patterns: set[str] | str | None = None,
32-
ignore_patterns: set[str] | str | None = None,
33-
token: str | None = None,
34-
) -> IngestionQuery:
35-
"""Parse the input source to extract details for the query and process the include and ignore patterns.
36-
37-
Parameters
38-
----------
39-
source : str
40-
The source URL or file path to parse.
41-
max_file_size : int
42-
The maximum file size in bytes to include.
43-
from_web : bool
44-
Flag indicating whether the source is a web URL.
45-
include_patterns : set[str] | str | None
46-
Patterns to include. Can be a set of strings or a single string.
47-
ignore_patterns : set[str] | str | None
48-
Patterns to ignore. Can be a set of strings or a single string.
49-
token : str | None
50-
GitHub personal access token (PAT) for accessing private repositories.
51-
52-
Returns
53-
-------
54-
IngestionQuery
55-
A dataclass object containing the parsed details of the repository or file path.
56-
57-
"""
58-
if source.endswith(".git"):
59-
source = source[:-4]
60-
61-
# Determine the parsing method based on the source type
62-
if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
63-
# We either have a full URL or a domain-less slug
64-
query = await _parse_remote_repo(source, token=token)
65-
else:
66-
# Local path scenario
67-
query = _parse_local_dir_path(source)
68-
69-
# Combine default ignore patterns + custom patterns
70-
ignore_patterns_set = DEFAULT_IGNORE_PATTERNS.copy()
71-
if ignore_patterns:
72-
ignore_patterns_set.update(_parse_patterns(ignore_patterns))
73-
74-
# Process include patterns and override ignore patterns accordingly
75-
if include_patterns:
76-
parsed_include = _parse_patterns(include_patterns)
77-
# Override ignore patterns with include patterns
78-
ignore_patterns_set = set(ignore_patterns_set) - set(parsed_include)
79-
else:
80-
parsed_include = None
81-
82-
return IngestionQuery(
83-
user_name=query.user_name,
84-
repo_name=query.repo_name,
85-
url=query.url,
86-
subpath=query.subpath,
87-
local_path=query.local_path,
88-
slug=query.slug,
89-
id=query.id,
90-
type=query.type,
91-
branch=query.branch,
92-
commit=query.commit,
93-
max_file_size=max_file_size,
94-
ignore_patterns=ignore_patterns_set,
95-
include_patterns=parsed_include,
96-
)
97-
98-
99-
async def _parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
22+
async def parse_remote_repo(source: str, token: str | None = None) -> IngestionQuery:
10023
"""Parse a repository URL into a structured query dictionary.
10124
10225
If source is:
@@ -146,7 +69,8 @@ async def _parse_remote_repo(source: str, token: str | None = None) -> Ingestion
14669
local_path = TMP_BASE_PATH / _id / slug
14770
url = f"https://{host}/{user_name}/{repo_name}"
14871

149-
parsed = IngestionQuery(
72+
query = IngestionQuery(
73+
host=host,
15074
user_name=user_name,
15175
repo_name=repo_name,
15276
url=url,
@@ -158,57 +82,76 @@ async def _parse_remote_repo(source: str, token: str | None = None) -> Ingestion
15882
remaining_parts = parsed_url.path.strip("/").split("/")[2:]
15983

16084
if not remaining_parts:
161-
return parsed
85+
return query
16286

16387
possible_type = remaining_parts.pop(0) # e.g. 'issues', 'pull', 'tree', 'blob'
16488

16589
# If no extra path parts, just return
16690
if not remaining_parts:
167-
return parsed
91+
return query
16892

16993
# If this is an issues page or pull requests, return early without processing subpath
17094
# TODO: Handle issues and pull requests
17195
if remaining_parts and possible_type in {"issues", "pull"}:
17296
msg = f"Warning: Issues and pull requests are not yet supported: {url}. Returning repository root."
17397
warnings.warn(msg, RuntimeWarning, stacklevel=2)
174-
return parsed
98+
return query
17599

176100
if possible_type not in {"tree", "blob"}:
177101
# TODO: Handle other types
178102
msg = f"Warning: Type '{possible_type}' is not yet supported: {url}. Returning repository root."
179103
warnings.warn(msg, RuntimeWarning, stacklevel=2)
180-
return parsed
104+
return query
181105

182-
parsed.type = possible_type # 'tree' or 'blob'
106+
query.type = possible_type
183107

184108
# Commit, branch, or tag
185109
commit_or_branch_or_tag = remaining_parts[0]
186110
if _is_valid_git_commit_hash(commit_or_branch_or_tag): # Commit
187-
parsed.commit = commit_or_branch_or_tag
111+
query.commit = commit_or_branch_or_tag
188112
remaining_parts.pop(0) # Consume the commit hash
189113
else: # Branch or tag
190114
# Try to resolve a tag
191-
parsed.tag = await _configure_branch_or_tag(
115+
query.tag = await _configure_branch_or_tag(
192116
remaining_parts,
193117
url=url,
194118
ref_type="tags",
195119
token=token,
196120
)
197121

198122
# If no tag found, try to resolve a branch
199-
if not parsed.tag:
200-
parsed.branch = await _configure_branch_or_tag(
123+
if not query.tag:
124+
query.branch = await _configure_branch_or_tag(
201125
remaining_parts,
202126
url=url,
203127
ref_type="branches",
204128
token=token,
205129
)
206130

207131
# Only configure subpath if we have identified a commit, branch, or tag.
208-
if remaining_parts and (parsed.commit or parsed.branch or parsed.tag):
209-
parsed.subpath += "/".join(remaining_parts)
132+
if remaining_parts and (query.commit or query.branch or query.tag):
133+
query.subpath += "/".join(remaining_parts)
134+
135+
return query
210136

211-
return parsed
137+
138+
def parse_local_dir_path(path_str: str) -> IngestionQuery:
139+
"""Parse the given file path into a structured query dictionary.
140+
141+
Parameters
142+
----------
143+
path_str : str
144+
The file path to parse.
145+
146+
Returns
147+
-------
148+
IngestionQuery
149+
A dictionary containing the parsed details of the file path.
150+
151+
"""
152+
path_obj = Path(path_str).resolve()
153+
slug = path_obj.name if path_str == "." else path_str.strip("/")
154+
return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
212155

213156

214157
async def _configure_branch_or_tag(
@@ -272,69 +215,6 @@ async def _configure_branch_or_tag(
272215
return None
273216

274217

275-
def _parse_patterns(pattern: set[str] | str) -> set[str]:
276-
"""Parse and validate file/directory patterns for inclusion or exclusion.
277-
278-
Takes either a single pattern string or set of pattern strings and processes them into a normalized list.
279-
Patterns are split on commas and spaces, validated for allowed characters, and normalized.
280-
281-
Parameters
282-
----------
283-
pattern : set[str] | str
284-
Pattern(s) to parse - either a single string or set of strings
285-
286-
Returns
287-
-------
288-
set[str]
289-
A set of normalized patterns.
290-
291-
Raises
292-
------
293-
InvalidPatternError
294-
If any pattern contains invalid characters. Only alphanumeric characters,
295-
dash (-), underscore (_), dot (.), forward slash (/), plus (+), and
296-
asterisk (*) are allowed.
297-
298-
"""
299-
patterns = pattern if isinstance(pattern, set) else {pattern}
300-
301-
parsed_patterns: set[str] = set()
302-
for p in patterns:
303-
parsed_patterns = parsed_patterns.union(set(re.split(",| ", p)))
304-
305-
# Remove empty string if present
306-
parsed_patterns = parsed_patterns - {""}
307-
308-
# Normalize Windows paths to Unix-style paths
309-
parsed_patterns = {p.replace("\\", "/") for p in parsed_patterns}
310-
311-
# Validate and normalize each pattern
312-
for p in parsed_patterns:
313-
if not _is_valid_pattern(p):
314-
raise InvalidPatternError(p)
315-
316-
return parsed_patterns
317-
318-
319-
def _parse_local_dir_path(path_str: str) -> IngestionQuery:
320-
"""Parse the given file path into a structured query dictionary.
321-
322-
Parameters
323-
----------
324-
path_str : str
325-
The file path to parse.
326-
327-
Returns
328-
-------
329-
IngestionQuery
330-
A dictionary containing the parsed details of the file path.
331-
332-
"""
333-
path_obj = Path(path_str).resolve()
334-
slug = path_obj.name if path_str == "." else path_str.strip("/")
335-
return IngestionQuery(local_path=path_obj, slug=slug, id=str(uuid.uuid4()))
336-
337-
338218
async def try_domains_for_user_and_repo(user_name: str, repo_name: str, token: str | None = None) -> str:
339219
"""Attempt to find a valid repository host for the given ``user_name`` and ``repo_name``.
340220

0 commit comments

Comments
 (0)