Skip to content

Commit 202f779

Browse files
resolve commit
1 parent 340edb6 commit 202f779

File tree

5 files changed

+170
-47
lines changed

5 files changed

+170
-47
lines changed

src/gitingest/clone.py

Lines changed: 17 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
from gitingest.config import DEFAULT_TIMEOUT
99
from gitingest.utils.git_utils import (
1010
check_repo_exists,
11+
checkout_partial_clone,
1112
create_git_auth_header,
1213
create_git_command,
1314
ensure_git_installed,
1415
is_github_host,
16+
resolve_commit,
1517
run_command,
1618
)
17-
from gitingest.utils.os_utils import ensure_directory
19+
from gitingest.utils.os_utils import ensure_directory_exists_or_create
1820
from gitingest.utils.timeout_wrapper import async_timeout
1921

2022
if TYPE_CHECKING:
@@ -45,71 +47,42 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None:
4547
# Extract and validate query parameters
4648
url: str = config.url
4749
local_path: str = config.local_path
48-
commit: str | None = config.commit
49-
branch: str | None = config.branch
50-
tag: str | None = config.tag
5150
partial_clone: bool = config.subpath != "/"
5251

53-
# Create parent directory if it doesn't exist
54-
await ensure_directory(Path(local_path).parent)
52+
await ensure_git_installed()
53+
await ensure_directory_exists_or_create(Path(local_path).parent)
5554

56-
# Check if the repository exists
5755
if not await check_repo_exists(url, token=token):
5856
msg = "Repository not found. Make sure it is public or that you have provided a valid token."
5957
raise ValueError(msg)
6058

59+
commit = await resolve_commit(config, url=url, token=token)
60+
6161
clone_cmd = ["git"]
6262
if token and is_github_host(url):
6363
clone_cmd += ["-c", create_git_auth_header(token, url=url)]
6464

65-
clone_cmd += ["clone", "--single-branch"]
66-
67-
if config.include_submodules:
68-
clone_cmd += ["--recurse-submodules"]
69-
65+
clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"]
7066
if partial_clone:
7167
clone_cmd += ["--filter=blob:none", "--sparse"]
7268

73-
# Shallow clone unless a specific commit is requested
74-
if not commit:
75-
clone_cmd += ["--depth=1"]
76-
77-
# Prefer tag over branch when both are provided
78-
if tag:
79-
clone_cmd += ["--branch", tag]
80-
elif branch and branch.lower() not in ("main", "master"):
81-
clone_cmd += ["--branch", branch]
82-
8369
clone_cmd += [url, local_path]
8470

8571
# Clone the repository
86-
await ensure_git_installed()
8772
await run_command(*clone_cmd)
8873

8974
# Checkout the subpath if it is a partial clone
9075
if partial_clone:
91-
await _checkout_partial_clone(config, token)
76+
await checkout_partial_clone(config, token)
9277

93-
# Checkout the commit if it is provided
94-
if commit:
95-
checkout_cmd = create_git_command(["git"], local_path, url, token)
96-
await run_command(*checkout_cmd, "checkout", commit)
78+
git = create_git_command(["git"], local_path, url, token)
9779

80+
# Ensure the commit is locally available
81+
await run_command(*git, "fetch", "--depth=1", "origin", commit)
9882

99-
async def _checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
100-
"""Configure sparse-checkout for a partially cloned repository.
83+
# Write the work-tree at that commit
84+
await run_command(*git, "checkout", commit)
10185

102-
Parameters
103-
----------
104-
config : CloneConfig
105-
The configuration for cloning the repository, including subpath and blob flag.
106-
token : str | None
107-
GitHub personal access token (PAT) for accessing private repositories.
108-
109-
"""
110-
subpath = config.subpath.lstrip("/")
111-
if config.blob:
112-
# Remove the file name from the subpath when ingesting from a file url (e.g. blob/branch/path/file.txt)
113-
subpath = str(Path(subpath).parent.as_posix())
114-
checkout_cmd = create_git_command(["git"], config.local_path, config.url, token)
115-
await run_command(*checkout_cmd, "sparse-checkout", "set", subpath)
86+
# Update submodules
87+
if config.include_submodules:
88+
await run_command(*git, "submodule", "update", "--init", "--recursive", "--depth=1")

src/gitingest/query_parser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ async def parse_query(
5555
A dataclass object containing the parsed details of the repository or file path.
5656
5757
"""
58+
if source.endswith(".git"):
59+
source = source[:-4]
60+
5861
# Determine the parsing method based on the source type
5962
if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
6063
# We either have a full URL or a domain-less slug

src/gitingest/utils/git_utils.py

Lines changed: 148 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import base64
77
import re
88
import sys
9-
from typing import Final
9+
from pathlib import Path
10+
from typing import TYPE_CHECKING, Final, Iterable, Literal
1011
from urllib.parse import urlparse
1112

1213
import httpx
@@ -16,6 +17,9 @@
1617
from gitingest.utils.exceptions import InvalidGitHubTokenError
1718
from server.server_utils import Colors
1819

20+
if TYPE_CHECKING:
21+
from gitingest.schemas import CloneConfig
22+
1923
# GitHub Personal-Access tokens (classic + fine-grained).
2024
# - ghp_ / gho_ / ghu_ / ghs_ / ghr_ → 36 alphanumerics
2125
# - github_pat_ → 22 alphanumerics + "_" + 59 alphanumerics
@@ -321,3 +325,146 @@ def validate_github_token(token: str) -> None:
321325
"""
322326
if not re.fullmatch(_GITHUB_PAT_PATTERN, token):
323327
raise InvalidGitHubTokenError
328+
329+
330+
async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None:
    """Restrict the work-tree of a partial clone to the configured subpath.

    Runs ``git sparse-checkout set <subpath>`` through the command built by
    ``create_git_command`` for ``config.local_path`` / ``config.url``.

    Parameters
    ----------
    config : CloneConfig
        Clone settings; ``subpath``, ``blob``, ``local_path`` and ``url`` are read.
    token : str | None
        GitHub personal access token (PAT), forwarded to ``create_git_command``.

    """
    sparse_path = config.subpath.lstrip("/")
    if config.blob:
        # A blob URL names a file (e.g. blob/branch/path/file.txt); sparse-checkout
        # the directory that contains it rather than the file itself.
        sparse_path = Path(sparse_path).parent.as_posix()

    git_base = create_git_command(["git"], config.local_path, config.url, token)
    await run_command(*git_base, "sparse-checkout", "set", sparse_path)
347+
348+
349+
async def resolve_commit(config: CloneConfig, url: str, token: str | None) -> str:
    """Pick the commit the clone should be checked out at.

    Precedence is ``config.commit``, then ``config.tag``, then ``config.branch``,
    and finally the remote ``HEAD``. An explicit commit is returned as-is; the
    other cases are looked up on the remote via ``_resolve_ref_to_sha``.

    Parameters
    ----------
    config : CloneConfig
        The configuration for cloning the repository.
    url : str
        The URL of the remote repository.
    token : str | None
        GitHub personal access token (PAT) for accessing private repositories.

    Returns
    -------
    str
        ``config.commit`` when set, otherwise the value returned by
        ``_resolve_ref_to_sha`` for the tag, branch or ``HEAD``.

    """
    if config.commit:
        return config.commit
    if config.tag:
        return await _resolve_ref_to_sha(url, ref=config.tag, kind="tag", token=token)
    if config.branch:
        return await _resolve_ref_to_sha(url, ref=config.branch, kind="branch", token=token)
    return await _resolve_ref_to_sha(url, ref="HEAD", kind="branch", token=token)
375+
376+
377+
async def _resolve_ref_to_sha(
    url: str,
    ref: str,
    kind: Literal["branch", "tag"],
    *,
    token: str | None = None,
) -> str:
    """Return the commit SHA that <kind>/<ref> points to in <url>.

    * Branch / ``HEAD`` → the line of ``git ls-remote`` naming exactly that ref.
    * Tag → if annotated, prefer the peeled ``^{}`` line (commit).

    Only lines whose ref name is exactly the requested ref (or its peeled
    ``^{}`` form) are considered. The tag query uses a trailing ``*`` so the
    peeled entry is listed, but that glob also matches longer tag names
    (asking for ``v1`` lists ``v1.0`` as well); those are discarded here so a
    neighbouring tag's SHA is never returned.

    Parameters
    ----------
    url : str
        The URL of the remote repository.
    ref : str
        The reference to resolve to a commit SHA.
    kind : Literal["branch", "tag"]
        The kind of reference to resolve to a commit SHA.
    token : str | None
        GitHub personal access token (PAT) for accessing private repositories.

    Returns
    -------
    str
        The commit SHA.

    Raises
    ------
    ValueError
        If the ref does not exist in the remote repository.

    """
    await ensure_git_installed()

    # Build: git [-c http.<host>/.extraheader=Auth...] ls-remote <url> <pattern>
    cmd: list[str] = ["git"]
    if token and is_github_host(url):
        cmd += ["-c", create_git_auth_header(token, url=url)]

    if ref == "HEAD":
        full_ref = "HEAD"
        pattern = full_ref
    elif kind == "branch":
        full_ref = f"refs/heads/{ref}"
        pattern = full_ref
    else:  # tag
        full_ref = f"refs/tags/{ref}"
        # The trailing "*" makes ls-remote also list the peeled "<tag>^{}" entry.
        pattern = f"{full_ref}*"

    cmd += ["ls-remote", url, pattern]
    stdout, _ = await run_command(*cmd)

    # Keep only exact matches: the ref itself and, for annotated tags, its peeled form.
    wanted = {full_ref, f"{full_ref}^{{}}"}
    matching: list[str] = []
    for ln in stdout.decode().splitlines():
        fields = ln.split()
        if len(fields) == 2 and fields[1] in wanted:
            matching.append(ln)

    sha = _pick_commit_sha(matching)
    if not sha:
        msg = f"{kind} {ref!r} not found in {url}"
        raise ValueError(msg)

    return sha
436+
437+
438+
def _pick_commit_sha(lines: Iterable[str]) -> str | None:
439+
"""Return a commit SHA from ``git ls-remote`` output.
440+
441+
• Annotated tag → prefer the peeled line (<sha> refs/tags/x^{})
442+
• Branch / lightweight tag → first non-peeled line
443+
444+
445+
Parameters
446+
----------
447+
lines : Iterable[str]
448+
The lines of a ``git ls-remote`` output.
449+
450+
Returns
451+
-------
452+
str | None
453+
The commit SHA, or ``None`` if no commit SHA is found.
454+
455+
"""
456+
first_non_peeled: str | None = None
457+
458+
for ln in lines:
459+
if not ln.strip():
460+
continue
461+
462+
sha, ref = ln.split(maxsplit=1)
463+
464+
if ref.endswith("^{}"): # peeled commit of annotated tag
465+
return sha # ← best match, done
466+
467+
if first_non_peeled is None: # remember the first ordinary line
468+
first_non_peeled = sha
469+
470+
return first_non_peeled # branch or lightweight tag (or None)

src/gitingest/utils/os_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pathlib import Path
44

55

6-
async def ensure_directory(path: Path) -> None:
6+
async def ensure_directory_exists_or_create(path: Path) -> None:
77
"""Ensure the directory exists, creating it if necessary.
88
99
Parameters

tests/test_clone.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ async def test_clone_with_commit(repo_exists_true: AsyncMock, run_command_mock:
3333
When ``clone_repo`` is called,
3434
Then the repository should be cloned and checked out at that commit.
3535
"""
36-
expected_call_count = 2
36+
expected_call_count = 3 # clone + fetch + checkout
3737
clone_config = CloneConfig(
3838
url=DEMO_URL,
3939
local_path=LOCAL_REPO_PATH,

0 commit comments

Comments
 (0)