22
33from __future__ import annotations
44
5- import re
65import uuid
76import warnings
87from pathlib import Path
98from urllib .parse import unquote , urlparse
109
1110from gitingest .config import TMP_BASE_PATH
1211from gitingest .schemas import IngestionQuery
13- from gitingest .utils .exceptions import InvalidPatternError
1412from gitingest .utils .git_utils import check_repo_exists , fetch_remote_branches_or_tags
15- from gitingest .utils .ignore_patterns import DEFAULT_IGNORE_PATTERNS
1613from gitingest .utils .query_parser_utils import (
1714 KNOWN_GIT_HOSTS ,
1815 _get_user_and_repo_from_path ,
1916 _is_valid_git_commit_hash ,
20- _is_valid_pattern ,
2117 _validate_host ,
2218 _validate_url_scheme ,
2319)
2420
2521
async def parse_query(
    source: str,
    *,
    max_file_size: int,
    from_web: bool,
    include_patterns: set[str] | str | None = None,
    ignore_patterns: set[str] | str | None = None,
    token: str | None = None,
) -> IngestionQuery:
    """Turn a repository URL/slug or a local path into an ``IngestionQuery``.

    The source is first classified as remote or local and parsed accordingly;
    the include/ignore patterns are then normalised and attached to the result.

    Parameters
    ----------
    source : str
        The source URL or file path to parse. A trailing ``.git`` is dropped.
    max_file_size : int
        The maximum file size in bytes to include; copied onto the result as-is.
    from_web : bool
        When true, the source is always parsed as a remote repository.
    include_patterns : set[str] | str | None
        Patterns to include. Can be a set of strings or a single string.
    ignore_patterns : set[str] | str | None
        Patterns to ignore, added on top of ``DEFAULT_IGNORE_PATTERNS``.
    token : str | None
        Access token forwarded to the remote-repository parser.

    Returns
    -------
    IngestionQuery
        The parsed query with ``max_file_size`` and both pattern sets filled in.

    """
    # Clone-style URLs end in ".git"; strip it so both URL flavours parse alike.
    if source.endswith(".git"):
        source = source[:-4]

    # Remote if the caller says so, if there is an http(s) scheme, or if a known
    # git host name appears anywhere in the string (covers scheme-less URLs).
    is_remote = (
        from_web
        or urlparse(source).scheme in ("https", "http")
        or any(h in source for h in KNOWN_GIT_HOSTS)
    )
    if is_remote:
        query = await _parse_remote_repo(source, token=token)
    else:
        query = _parse_local_dir_path(source)

    # Defaults first, then any caller-supplied ignore patterns on top.
    effective_ignores = DEFAULT_IGNORE_PATTERNS.copy()
    if ignore_patterns:
        effective_ignores.update(_parse_patterns(ignore_patterns))

    # An explicitly included pattern must not also be ignored.
    parsed_include = _parse_patterns(include_patterns) if include_patterns else None
    if parsed_include is not None:
        effective_ignores = set(effective_ignores) - set(parsed_include)

    return IngestionQuery(
        user_name=query.user_name,
        repo_name=query.repo_name,
        url=query.url,
        subpath=query.subpath,
        local_path=query.local_path,
        slug=query.slug,
        id=query.id,
        type=query.type,
        branch=query.branch,
        commit=query.commit,
        max_file_size=max_file_size,
        ignore_patterns=effective_ignores,
        include_patterns=parsed_include,
    )
97-
98-
99- async def _parse_remote_repo (source : str , token : str | None = None ) -> IngestionQuery :
22+ async def parse_remote_repo (source : str , token : str | None = None ) -> IngestionQuery :
10023 """Parse a repository URL into a structured query dictionary.
10124
10225 If source is:
@@ -146,7 +69,8 @@ async def _parse_remote_repo(source: str, token: str | None = None) -> Ingestion
14669 local_path = TMP_BASE_PATH / _id / slug
14770 url = f"https://{ host } /{ user_name } /{ repo_name } "
14871
149- parsed = IngestionQuery (
72+ query = IngestionQuery (
73+ host = host ,
15074 user_name = user_name ,
15175 repo_name = repo_name ,
15276 url = url ,
@@ -158,57 +82,76 @@ async def _parse_remote_repo(source: str, token: str | None = None) -> Ingestion
15882 remaining_parts = parsed_url .path .strip ("/" ).split ("/" )[2 :]
15983
16084 if not remaining_parts :
161- return parsed
85+ return query
16286
16387 possible_type = remaining_parts .pop (0 ) # e.g. 'issues', 'pull', 'tree', 'blob'
16488
16589 # If no extra path parts, just return
16690 if not remaining_parts :
167- return parsed
91+ return query
16892
16993 # If this is an issues page or pull requests, return early without processing subpath
17094 # TODO: Handle issues and pull requests
17195 if remaining_parts and possible_type in {"issues" , "pull" }:
17296 msg = f"Warning: Issues and pull requests are not yet supported: { url } . Returning repository root."
17397 warnings .warn (msg , RuntimeWarning , stacklevel = 2 )
174- return parsed
98+ return query
17599
176100 if possible_type not in {"tree" , "blob" }:
177101 # TODO: Handle other types
178102 msg = f"Warning: Type '{ possible_type } ' is not yet supported: { url } . Returning repository root."
179103 warnings .warn (msg , RuntimeWarning , stacklevel = 2 )
180- return parsed
104+ return query
181105
182- parsed .type = possible_type # 'tree' or 'blob'
106+ query .type = possible_type
183107
184108 # Commit, branch, or tag
185109 commit_or_branch_or_tag = remaining_parts [0 ]
186110 if _is_valid_git_commit_hash (commit_or_branch_or_tag ): # Commit
187- parsed .commit = commit_or_branch_or_tag
111+ query .commit = commit_or_branch_or_tag
188112 remaining_parts .pop (0 ) # Consume the commit hash
189113 else : # Branch or tag
190114 # Try to resolve a tag
191- parsed .tag = await _configure_branch_or_tag (
115+ query .tag = await _configure_branch_or_tag (
192116 remaining_parts ,
193117 url = url ,
194118 ref_type = "tags" ,
195119 token = token ,
196120 )
197121
198122 # If no tag found, try to resolve a branch
199- if not parsed .tag :
200- parsed .branch = await _configure_branch_or_tag (
123+ if not query .tag :
124+ query .branch = await _configure_branch_or_tag (
201125 remaining_parts ,
202126 url = url ,
203127 ref_type = "branches" ,
204128 token = token ,
205129 )
206130
207131 # Only configure subpath if we have identified a commit, branch, or tag.
208- if remaining_parts and (parsed .commit or parsed .branch or parsed .tag ):
209- parsed .subpath += "/" .join (remaining_parts )
132+ if remaining_parts and (query .commit or query .branch or query .tag ):
133+ query .subpath += "/" .join (remaining_parts )
134+
135+ return query
210136
211- return parsed
137+
def parse_local_dir_path(path_str: str) -> IngestionQuery:
    """Build an ``IngestionQuery`` describing a local directory.

    Parameters
    ----------
    path_str : str
        The file path to parse.

    Returns
    -------
    IngestionQuery
        A query whose ``local_path`` is the resolved path, whose ``slug`` is derived
        from ``path_str`` and whose ``id`` is a freshly generated UUID string.

    """
    resolved = Path(path_str).resolve()

    # "." carries no usable name, so fall back to the resolved directory's name;
    # otherwise keep the path as given, minus leading/trailing slashes.
    if path_str == ".":
        slug = resolved.name
    else:
        slug = path_str.strip("/")

    return IngestionQuery(local_path=resolved, slug=slug, id=str(uuid.uuid4()))
212155
213156
214157async def _configure_branch_or_tag (
@@ -272,69 +215,6 @@ async def _configure_branch_or_tag(
272215 return None
273216
274217
def _parse_patterns(pattern: set[str] | str) -> set[str]:
    """Parse and validate file/directory patterns for inclusion or exclusion.

    Accepts a single pattern string or a set of pattern strings. Every input
    string is split on commas and spaces, empty fragments are discarded,
    backslashes are rewritten to forward slashes, and each resulting pattern
    is validated.

    Parameters
    ----------
    pattern : set[str] | str
        Pattern(s) to parse - either a single string or set of strings.

    Returns
    -------
    set[str]
        A set of normalized patterns.

    Raises
    ------
    InvalidPatternError
        If any pattern is rejected by ``_is_valid_pattern``.

    """
    raw_inputs = pattern if isinstance(pattern, set) else {pattern}

    # Split, drop empty fragments (produced by ", " or doubled separators) and
    # normalize Windows separators to Unix-style in a single pass. The replace
    # target is a bare backslash so that paths like "src\\utils" are converted.
    normalized = {
        fragment.replace("\\", "/")
        for raw in raw_inputs
        for fragment in re.split(",| ", raw)
        if fragment
    }

    # Validate after normalization so the check sees the final form.
    for candidate in normalized:
        if not _is_valid_pattern(candidate):
            raise InvalidPatternError(candidate)

    return normalized
317-
318-
def _parse_local_dir_path(path_str: str) -> IngestionQuery:
    """Build an ``IngestionQuery`` describing a local directory.

    Parameters
    ----------
    path_str : str
        The file path to parse.

    Returns
    -------
    IngestionQuery
        A query whose ``local_path`` is the resolved path, whose ``slug`` is derived
        from ``path_str`` and whose ``id`` is a freshly generated UUID string.

    """
    local_path = Path(path_str).resolve()
    # For "." use the resolved directory name; otherwise the given path without
    # leading/trailing slashes.
    use_dir_name = path_str == "."
    slug = local_path.name if use_dir_name else path_str.strip("/")
    query_id = str(uuid.uuid4())
    return IngestionQuery(local_path=local_path, slug=slug, id=query_id)
336-
337-
338218async def try_domains_for_user_and_repo (user_name : str , repo_name : str , token : str | None = None ) -> str :
339219 """Attempt to find a valid repository host for the given ``user_name`` and ``repo_name``.
340220
0 commit comments