Skip to content

Commit 740c3d3

Browse files
committed
Enhance configuration options in README and CLI
- Added examples for configuring processing limits in README.md. - Introduced new CLI options for max files, max total size, and max directory depth. - Updated environment variable support in config.py for various limits. - Modified ingestion functions to accept new parameters for file processing limits. - Enhanced limit checks in ingestion logic to utilize new configuration options.
1 parent 3e83ba3 commit 740c3d3

File tree

8 files changed

+190
-26
lines changed

8 files changed

+190
-26
lines changed

README.md

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,12 +135,58 @@ By default, the digest is written to a text file (`digest.txt`) in your current
135135
- Use `--output/-o <filename>` to write to a specific file.
136136
- Use `--output/-o -` to output directly to `STDOUT` (useful for piping to other tools).
137137

138+
Configure processing limits:
139+
140+
```bash
141+
# Set higher limits for large repositories
142+
gitingest https://github.com/torvalds/linux \
143+
--max-files 100000 \
144+
--max-total-size 2147483648 \
145+
--max-directory-depth 25
146+
147+
# Process only Python files up to 1MB each
148+
gitingest /path/to/project \
149+
--include-pattern "*.py" \
150+
--max-size 1048576 \
151+
--max-files 1000
152+
```
153+
138154
See more options and usage details with:
139155

140156
```bash
141157
gitingest --help
142158
```
143159

160+
### 🔧 Configuration via Environment Variables
161+
162+
You can configure various limits and settings using environment variables. All configuration environment variables start with the `GITINGEST_` prefix:
163+
164+
**File Processing Configuration:**
165+
- `GITINGEST_MAX_FILE_SIZE` - Maximum size of a single file to process (default: 10485760 bytes, 10MB)
166+
- `GITINGEST_MAX_FILES` - Maximum number of files to process (default: 10000)
167+
- `GITINGEST_MAX_TOTAL_SIZE_BYTES` - Maximum total size of all processed files (default: 524288000 bytes, 500MB)
168+
- `GITINGEST_MAX_DIRECTORY_DEPTH` - Maximum depth of directory traversal (default: 20)
169+
- `GITINGEST_DEFAULT_TIMEOUT` - Default operation timeout in seconds (default: 60)
170+
- `GITINGEST_OUTPUT_FILE_NAME` - Default output filename (default: "digest.txt")
171+
- `GITINGEST_TMP_BASE_PATH` - Base path for temporary files; a `gitingest` subdirectory of this path is used (default: system temp directory)
172+
173+
**Server Configuration (for self-hosting):**
174+
- `GITINGEST_MAX_DISPLAY_SIZE` - Maximum size of content to display in UI (default: 300000 bytes)
175+
- `GITINGEST_DELETE_REPO_AFTER` - Repository cleanup timeout in seconds (default: 3600, 1 hour)
176+
- `GITINGEST_MAX_FILE_SIZE_KB` - Maximum file size for UI slider in KB (default: 102400, 100MB)
177+
- `GITINGEST_MAX_SLIDER_POSITION` - Maximum slider position in UI (default: 500)
178+
179+
**Example usage:**
180+
181+
```bash
182+
# Configure for large scientific repositories
183+
export GITINGEST_MAX_FILES=50000
184+
export GITINGEST_MAX_FILE_SIZE=20971520 # 20MB
185+
export GITINGEST_MAX_TOTAL_SIZE_BYTES=1073741824 # 1GB
186+
187+
gitingest https://github.com/some/large-repo
188+
```
189+
144190
## 🐍 Python package usage
145191

146192
```python
@@ -169,6 +215,15 @@ summary, tree, content = ingest("https://github.com/username/private-repo")
169215

170216
# Include repository submodules
171217
summary, tree, content = ingest("https://github.com/username/repo-with-submodules", include_submodules=True)
218+
219+
# Configure limits programmatically
220+
summary, tree, content = ingest(
221+
"https://github.com/username/large-repo",
222+
max_file_size=20 * 1024 * 1024, # 20MB per file
223+
max_files=50000, # 50k files max
224+
max_total_size_bytes=1024**3, # 1GB total
225+
max_directory_depth=30 # 30 levels deep
226+
)
172227
```
173228

174229
By default, this won't write a file but can be enabled with the `output` argument.

src/gitingest/cli.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,16 @@
99
import click
1010
from typing_extensions import Unpack
1111

12-
from gitingest.config import MAX_FILE_SIZE, OUTPUT_FILE_NAME
12+
from gitingest.config import MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MAX_DIRECTORY_DEPTH, OUTPUT_FILE_NAME
1313
from gitingest.entrypoint import ingest_async
1414

1515

1616
class _CLIArgs(TypedDict):
1717
source: str
1818
max_size: int
19+
max_files: int
20+
max_total_size: int
21+
max_directory_depth: int
1922
exclude_pattern: tuple[str, ...]
2023
include_pattern: tuple[str, ...]
2124
branch: str | None
@@ -34,6 +37,24 @@ class _CLIArgs(TypedDict):
3437
show_default=True,
3538
help="Maximum file size to process in bytes",
3639
)
40+
@click.option(
41+
"--max-files",
42+
default=MAX_FILES,
43+
show_default=True,
44+
help="Maximum number of files to process",
45+
)
46+
@click.option(
47+
"--max-total-size",
48+
default=MAX_TOTAL_SIZE_BYTES,
49+
show_default=True,
50+
help="Maximum total size of all files in bytes",
51+
)
52+
@click.option(
53+
"--max-directory-depth",
54+
default=MAX_DIRECTORY_DEPTH,
55+
show_default=True,
56+
help="Maximum depth of directory traversal",
57+
)
3758
@click.option("--exclude-pattern", "-e", multiple=True, help="Shell-style patterns to exclude.")
3859
@click.option(
3960
"--include-pattern",
@@ -112,6 +133,9 @@ async def _async_main(
112133
source: str,
113134
*,
114135
max_size: int = MAX_FILE_SIZE,
136+
max_files: int = MAX_FILES,
137+
max_total_size: int = MAX_TOTAL_SIZE_BYTES,
138+
max_directory_depth: int = MAX_DIRECTORY_DEPTH,
115139
exclude_pattern: tuple[str, ...] | None = None,
116140
include_pattern: tuple[str, ...] | None = None,
117141
branch: str | None = None,
@@ -170,6 +194,9 @@ async def _async_main(
170194
summary, _, _ = await ingest_async(
171195
source,
172196
max_file_size=max_size,
197+
max_files=max_files,
198+
max_total_size_bytes=max_total_size,
199+
max_directory_depth=max_directory_depth,
173200
include_patterns=include_patterns,
174201
exclude_patterns=exclude_patterns,
175202
branch=branch,

src/gitingest/config.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,34 @@
11
"""Configuration file for the project."""
22

3+
import os
34
import tempfile
45
from pathlib import Path
56

6-
MAX_FILE_SIZE = 10 * 1024 * 1024 # Maximum size of a single file to process (10 MB)
7-
MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal
8-
MAX_FILES = 10_000 # Maximum number of files to process
9-
MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # Maximum size of output file (500 MB)
10-
DEFAULT_TIMEOUT = 60 # seconds
7+
# Helper function to get environment variables with type conversion
8+
def _get_env_var(key: str, default, cast_func=None):
    """Read a ``GITINGEST_``-prefixed environment variable with optional type casting.

    Parameters
    ----------
    key : str
        Variable name without the ``GITINGEST_`` prefix (e.g. ``"MAX_FILES"``).
    default
        Value returned when the variable is unset, blank, or cannot be converted.
    cast_func : callable, optional
        Function applied to the raw string value (e.g. ``int``). If ``None``, the raw string is returned.

    Returns
    -------
    object
        The converted value, the raw string (when no ``cast_func`` is given), or ``default``.

    """
    import sys  # local import: only needed for the warning path

    env_key = f"GITINGEST_{key}"
    value = os.environ.get(env_key)

    # An unset or blank variable means "not configured". A blank string must not
    # override string defaults such as the output file name or the temp base path.
    if value is None or not value.strip():
        return default

    if cast_func is None:
        return value

    try:
        return cast_func(value)
    except (ValueError, TypeError):
        # Warn on stderr so the message never mixes with a digest written to stdout.
        print(f"Warning: Invalid value for {env_key}: {value}. Using default: {default}", file=sys.stderr)
        return default
1124

12-
OUTPUT_FILE_NAME = "digest.txt"
25+
# Configuration with environment variable support
26+
MAX_FILE_SIZE = _get_env_var("MAX_FILE_SIZE", 10 * 1024 * 1024, int) # Maximum size of a single file to process (10 MB)
27+
MAX_DIRECTORY_DEPTH = _get_env_var("MAX_DIRECTORY_DEPTH", 20, int) # Maximum depth of directory traversal
28+
MAX_FILES = _get_env_var("MAX_FILES", 10_000, int) # Maximum number of files to process
29+
MAX_TOTAL_SIZE_BYTES = _get_env_var("MAX_TOTAL_SIZE_BYTES", 500 * 1024 * 1024, int) # Maximum size of output file (500 MB)
30+
DEFAULT_TIMEOUT = _get_env_var("DEFAULT_TIMEOUT", 60, int) # seconds
1331

14-
TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest"
32+
OUTPUT_FILE_NAME = _get_env_var("OUTPUT_FILE_NAME", "digest.txt")
33+
34+
TMP_BASE_PATH = Path(_get_env_var("TMP_BASE_PATH", tempfile.gettempdir())) / "gitingest"

src/gitingest/entrypoint.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ async def ingest_async(
2222
source: str,
2323
*,
2424
max_file_size: int = MAX_FILE_SIZE,
25+
max_files: int | None = None,
26+
max_total_size_bytes: int | None = None,
27+
max_directory_depth: int | None = None,
2528
include_patterns: str | set[str] | None = None,
2629
exclude_patterns: str | set[str] | None = None,
2730
branch: str | None = None,
@@ -77,6 +80,9 @@ async def ingest_async(
7780
query: IngestionQuery = await parse_query(
7881
source=source,
7982
max_file_size=max_file_size,
83+
max_files=max_files,
84+
max_total_size_bytes=max_total_size_bytes,
85+
max_directory_depth=max_directory_depth,
8086
from_web=False,
8187
include_patterns=include_patterns,
8288
ignore_patterns=exclude_patterns,
@@ -101,6 +107,9 @@ def ingest(
101107
source: str,
102108
*,
103109
max_file_size: int = MAX_FILE_SIZE,
110+
max_files: int | None = None,
111+
max_total_size_bytes: int | None = None,
112+
max_directory_depth: int | None = None,
104113
include_patterns: str | set[str] | None = None,
105114
exclude_patterns: str | set[str] | None = None,
106115
branch: str | None = None,
@@ -122,6 +131,12 @@ def ingest(
122131
The source to analyze, which can be a URL (for a Git repository) or a local directory path.
123132
max_file_size : int
124133
Maximum allowed file size for file ingestion. Files larger than this size are ignored (default: 10 MB).
134+
max_files : int | None
135+
Maximum number of files to process. If ``None``, uses the default from config (default: 10,000).
136+
max_total_size_bytes : int | None
137+
Maximum total size of all files to process in bytes. If ``None``, uses the default from config (default: 500 MB).
138+
max_directory_depth : int | None
139+
Maximum depth of directory traversal. If ``None``, uses the default from config (default: 20).
125140
include_patterns : str | set[str] | None
126141
Pattern or set of patterns specifying which files to include. If ``None``, all files are included.
127142
exclude_patterns : str | set[str] | None
@@ -159,6 +174,9 @@ def ingest(
159174
ingest_async(
160175
source=source,
161176
max_file_size=max_file_size,
177+
max_files=max_files,
178+
max_total_size_bytes=max_total_size_bytes,
179+
max_directory_depth=max_directory_depth,
162180
include_patterns=include_patterns,
163181
exclude_patterns=exclude_patterns,
164182
branch=branch,

src/gitingest/ingestion.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
9797
Statistics tracking object for the total file count and size.
9898
9999
"""
100-
if limit_exceeded(stats, depth=node.depth):
100+
if limit_exceeded(stats, depth=node.depth, query=query):
101101
return
102102

103103
for sub_path in node.path.iterdir():
@@ -113,7 +113,7 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem
113113
if sub_path.stat().st_size > query.max_file_size:
114114
print(f"Skipping file {sub_path}: would exceed max file size limit")
115115
continue
116-
_process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
116+
_process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path, query=query)
117117
elif sub_path.is_dir():
118118
child_directory_node = FileSystemNode(
119119
name=sub_path.name,
@@ -167,7 +167,7 @@ def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemS
167167
parent_node.file_count += 1
168168

169169

170-
def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None:
170+
def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path, query: IngestionQuery) -> None:
171171
"""Process a file in the file system.
172172
173173
This function checks the file's size, increments the statistics, and reads its content.
@@ -183,14 +183,16 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
183183
Statistics tracking object for the total file count and size.
184184
local_path : Path
185185
The base path of the repository or directory being processed.
186+
query : IngestionQuery
187+
The query object containing the limit configurations.
186188
187189
"""
188-
if stats.total_files + 1 > MAX_FILES:
189-
print(f"Maximum file limit ({MAX_FILES}) reached")
190+
if stats.total_files + 1 > query.max_files:
191+
print(f"Maximum file limit ({query.max_files}) reached")
190192
return
191193

192194
file_size = path.stat().st_size
193-
if stats.total_size + file_size > MAX_TOTAL_SIZE_BYTES:
195+
if stats.total_size + file_size > query.max_total_size_bytes:
194196
print(f"Skipping file {path}: would exceed total size limit")
195197
return
196198

@@ -212,7 +214,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
212214
parent_node.file_count += 1
213215

214216

215-
def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:
217+
def limit_exceeded(stats: FileSystemStats, depth: int, query: IngestionQuery) -> bool:
216218
"""Check if any of the traversal limits have been exceeded.
217219
218220
This function checks if the current traversal has exceeded any of the configured limits:
@@ -224,23 +226,25 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:
224226
Statistics tracking object for the total file count and size.
225227
depth : int
226228
The current depth of directory traversal.
229+
query : IngestionQuery
230+
The query object containing the limit configurations.
227231
228232
Returns
229233
-------
230234
bool
231235
``True`` if any limit has been exceeded, ``False`` otherwise.
232236
233237
"""
234-
if depth > MAX_DIRECTORY_DEPTH:
235-
print(f"Maximum depth limit ({MAX_DIRECTORY_DEPTH}) reached")
238+
if depth > query.max_directory_depth:
239+
print(f"Maximum depth limit ({query.max_directory_depth}) reached")
236240
return True
237241

238-
if stats.total_files >= MAX_FILES:
239-
print(f"Maximum file limit ({MAX_FILES}) reached")
242+
if stats.total_files >= query.max_files:
243+
print(f"Maximum file limit ({query.max_files}) reached")
240244
return True # TODO: end recursion
241245

242-
if stats.total_size >= MAX_TOTAL_SIZE_BYTES:
243-
print(f"Maxumum total size limit ({MAX_TOTAL_SIZE_BYTES / 1024 / 1024:.1f}MB) reached")
246+
if stats.total_size >= query.max_total_size_bytes:
247+
print(f"Maxumum total size limit ({query.max_total_size_bytes / 1024 / 1024:.1f}MB) reached")
244248
return True # TODO: end recursion
245249

246250
return False

src/gitingest/query_parser.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ async def parse_query(
2828
source: str,
2929
*,
3030
max_file_size: int,
31+
max_files: int | None = None,
32+
max_total_size_bytes: int | None = None,
33+
max_directory_depth: int | None = None,
3134
from_web: bool,
3235
include_patterns: set[str] | str | None = None,
3336
ignore_patterns: set[str] | str | None = None,
@@ -41,6 +44,12 @@ async def parse_query(
4144
The source URL or file path to parse.
4245
max_file_size : int
4346
The maximum file size in bytes to include.
47+
max_files : int | None
48+
The maximum number of files to process. If None, uses default from config.
49+
max_total_size_bytes : int | None
50+
The maximum total size of all files in bytes. If None, uses default from config.
51+
max_directory_depth : int | None
52+
The maximum depth of directory traversal. If None, uses default from config.
4453
from_web : bool
4554
Flag indicating whether the source is a web URL.
4655
include_patterns : set[str] | str | None
@@ -89,6 +98,9 @@ async def parse_query(
8998
branch=query.branch,
9099
commit=query.commit,
91100
max_file_size=max_file_size,
101+
max_files=max_files if max_files is not None else query.max_files,
102+
max_total_size_bytes=max_total_size_bytes if max_total_size_bytes is not None else query.max_total_size_bytes,
103+
max_directory_depth=max_directory_depth if max_directory_depth is not None else query.max_directory_depth,
92104
ignore_patterns=ignore_patterns_set,
93105
include_patterns=parsed_include,
94106
)

src/gitingest/schemas/ingestion.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from pydantic import BaseModel, Field
99

10-
from gitingest.config import MAX_FILE_SIZE
10+
from gitingest.config import MAX_FILE_SIZE, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MAX_DIRECTORY_DEPTH
1111

1212

1313
@dataclass
@@ -77,6 +77,12 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
7777
The tag of the repository.
7878
max_file_size : int
7979
The maximum file size to ingest (default: 10 MB).
80+
max_files : int
81+
The maximum number of files to process (default: 10,000).
82+
max_total_size_bytes : int
83+
The maximum total size of all files in bytes (default: 500 MB).
84+
max_directory_depth : int
85+
The maximum depth of directory traversal (default: 20).
8086
ignore_patterns : set[str]
8187
The patterns to ignore (default: ``set()``).
8288
include_patterns : set[str] | None
@@ -98,6 +104,9 @@ class IngestionQuery(BaseModel): # pylint: disable=too-many-instance-attributes
98104
commit: str | None = None
99105
tag: str | None = None
100106
max_file_size: int = Field(default=MAX_FILE_SIZE)
107+
max_files: int = Field(default=MAX_FILES)
108+
max_total_size_bytes: int = Field(default=MAX_TOTAL_SIZE_BYTES)
109+
max_directory_depth: int = Field(default=MAX_DIRECTORY_DEPTH)
101110
ignore_patterns: set[str] = set() # TODO: ignore_patterns and include_patterns have the same type
102111
include_patterns: set[str] | None = None
103112
include_submodules: bool = False

0 commit comments

Comments
 (0)