diff --git a/.claude/accuracy-sweep-state.json b/.claude/accuracy-sweep-state.json index f4b62063..f4a1bb6d 100644 --- a/.claude/accuracy-sweep-state.json +++ b/.claude/accuracy-sweep-state.json @@ -4,6 +4,15 @@ "focal": { "last_inspected": "2026-03-30T13:00:00Z", "issue": 1092 }, "multispectral": { "last_inspected": "2026-03-30T14:00:00Z", "issue": 1094 }, "proximity": { "last_inspected": "2026-03-30T15:00:00Z", "issue": null, "notes": "Direction >= boundary fragile but works due to truncated constant. Float32 truncation is design choice. No wrong-results bugs found." }, - "curvature": { "last_inspected": "2026-03-30T15:00:00Z", "issue": null, "notes": "Formula matches ArcGIS reference. Backends consistent. No issues found." } + "curvature": { "last_inspected": "2026-03-30T15:00:00Z", "issue": null, "notes": "Formula matches ArcGIS reference. Backends consistent. No issues found." }, + "hillshade": { "last_inspected": "2026-04-10T12:00:00Z", "issue": null, "notes": "Horn's method correct. All backends consistent. NaN propagation correct. float32 adequate for [0,1] output." }, + "terrain": { "last_inspected": "2026-04-10T12:00:00Z", "issue": null, "notes": "Perlin/Worley/ridged noise correct. Dask chunk boundaries produce bit-identical results. No precision issues." }, + "perlin": { "last_inspected": "2026-04-10T12:00:00Z", "issue": null, "notes": "Improved Perlin noise implementation correct. Fade/gradient functions verified. Backend-consistent. Continuous at cell boundaries." }, + "sieve": { "last_inspected": "2026-04-13T12:00:00Z", "issue": null, "notes": "Union-find CCL correct. NaN excluded from labeling. All backends funnel through _sieve_numpy." }, + "polygonize": { "last_inspected": "2026-04-13T12:00:00Z", "issue": 1190, "notes": "NaN pixels not masked in numpy/dask backends. Fix in PR #1194." }, + "cost_distance": { "last_inspected": "2026-04-13T12:00:00Z", "issue": 1191, "notes": "CuPy Bellman-Ford max_iterations = h+w instead of h*w. Fix in PR #1192." 
}, + "visibility": { "last_inspected": "2026-04-13T12:00:00Z", "issue": null, "notes": "Bresenham line, LOS kernel, Fresnel zone all correct. All backends converge to numpy." }, + "kde": { "last_inspected": "2026-04-13T12:00:00Z", "issue": 1198, "notes": "kde/line_density return zeros for descending-y templates. Fix in PR #1199." }, + "polygon_clip": { "last_inspected": "2026-04-13T12:00:00Z", "issue": 1197, "notes": "crop=True + all_touched=True drops boundary pixels. Fix in PR #1200." } } } diff --git a/.claude/commands/sweep-accuracy.md b/.claude/commands/sweep-accuracy.md new file mode 100644 index 00000000..e763a9ca --- /dev/null +++ b/.claude/commands/sweep-accuracy.md @@ -0,0 +1,168 @@ +# Accuracy Sweep: Dispatch subagents to audit under-inspected modules + +Analyze xrspatial modules by recency and inspection history, then launch +subagents to audit the highest-priority modules in parallel. + +Optional arguments: $ARGUMENTS +(e.g. `--top 3`, `--exclude slope,aspect`, `--only-terrain`, `--reset-state`) + +--- + +## Step 1 -- Gather module metadata via git + +For every `.py` file directly under `xrspatial/` (skip `__init__.py`, +`_version.py`, `__main__.py`, `utils.py`, `accessor.py`, `preview.py`, +`dataset_support.py`, `diagnostics.py`, `analytics.py`), collect: + +| Field | How | +|-------|-----| +| **last_modified** | `git log -1 --format=%aI -- xrspatial/{module}.py` | +| **first_commit** | `git log --diff-filter=A --format=%aI -- xrspatial/{module}.py` | +| **total_commits** | `git log --oneline -- xrspatial/{module}.py \| wc -l` | +| **recent_accuracy_commits** | `git log --oneline --grep='accuracy\|precision\|numerical\|geodesic' -- xrspatial/{module}.py` | + +Store results in a temporary variable -- do NOT write intermediate files. + +## Step 2 -- Load inspection state + +Read the state file at `.claude/accuracy-sweep-state.json`. + +If it does not exist, treat every module as never-inspected. 
+ +If `$ARGUMENTS` contains `--reset-state`, delete the file and treat +everything as never-inspected. + +The state file schema: + +```json +{ + "inspections": { + "slope": { "last_inspected": "2026-03-28T14:00:00Z", "issue": 1042 }, + "aspect": { "last_inspected": "2026-03-28T15:30:00Z", "issue": 1043 } + } +} +``` + +## Step 3 -- Score each module + +Compute a priority score for each module. Higher = more urgent. + +``` +days_since_inspected = (today - last_inspected).days # 9999 if never inspected +days_since_modified = (today - last_modified).days +total_commits = from Step 1 +has_recent_accuracy_work = 1 if recent_accuracy_commits is non-empty, else 0 + +score = (days_since_inspected * 3) + + (total_commits * 0.5) + - (days_since_modified * 0.2) + - (has_recent_accuracy_work * 500) +``` + +Rationale: +- Modules never inspected dominate (9999 * 3) +- More commits = more complex = more likely to have bugs +- Recently modified modules slightly deprioritized (someone just touched them) +- Modules with existing accuracy work heavily deprioritized + +## Step 4 -- Apply filters from $ARGUMENTS + +- `--top N` -- only include the top N modules (default: 3) +- `--exclude mod1,mod2` -- remove named modules from the list +- `--only-terrain` -- restrict to slope, aspect, curvature, terrain, + terrain_metrics, hillshade, sky_view_factor +- `--only-focal` -- restrict to focal, convolution, morphology, bilateral, + edge_detection, glcm +- `--only-hydro` -- restrict to flood, cost_distance, geodesic, + surface_distance, viewshed, erosion, diffusion + +## Step 5 -- Print the ranked table and launch subagents + +### 5a. 
Print the ranked table + +Print a markdown table showing ALL scored modules (not just the selected ones), +sorted by score descending: + +``` +| Rank | Module | Score | Last Inspected | Last Modified | Commits | +|------|-----------------|--------|----------------|---------------|---------| +| 1 | viewshed | 30012 | never | 45 days ago | 23 | +| 2 | flood | 29998 | never | 120 days ago | 18 | +| ... | ... | ... | ... | ... | ... | +``` + +### 5b. Launch subagents for the top N modules + +For each of the top N modules (default 3), launch an Agent in parallel using +`isolation: "worktree"` and `mode: "auto"`. All N agents must be dispatched +in a single message so they run concurrently. + +Each agent's prompt must be self-contained and follow this template (adapt +the module name, path, and metadata): + +``` +You are auditing xrspatial/{module}.py for numerical accuracy issues. + +This module has {commits} commits and has never been inspected for accuracy. + +**Your task:** + +1. Read xrspatial/{module}.py thoroughly. + +2. Identify any potential accuracy issues: + - Floating point precision loss (e.g. catastrophic cancellation, summing + small numbers into large accumulators, unnecessary float32 intermediates) + - Incorrect NaN/nodata propagation (NaN inputs silently producing finite + outputs, or valid inputs wrongly becoming NaN) + - Off-by-one errors in neighborhood/window operations + - Missing or wrong Earth curvature corrections (for geographic CRS ops) + - Backend inconsistencies (numpy vs cupy vs dask producing different results + for the same input) + - Incorrect algorithm implementation vs. reference literature + +3. For each real issue found, run /rockout to fix it end-to-end (creates a + GitHub issue, worktree branch, fix, tests, and PR). + +4. If you find NO accuracy issues after thorough review, report that the module + is clean -- no action needed. + +5. 
After finishing (whether you found issues or not), update the inspection + state file .claude/accuracy-sweep-state.json by reading its current contents + and adding/updating the entry for "{module}" with today's ISO date and the + issue number (or null if no issue was created). + +Important: +- Be rigorous. Only flag real bugs that produce wrong numerical results, not + style issues or theoretical concerns. +- Read the tests in xrspatial/tests/ for this module to understand expected + behavior before flagging issues. +- This repo uses ArrayTypeFunctionMapping to dispatch across numpy/cupy/dask + backends. Check all backend paths, not just numpy. +``` + +### 5c. Print a status line + +After dispatching, print: + +``` +Launched {N} accuracy audit agents: {module1}, {module2}, {module3} +``` + +## Step 6 -- State updates + +State is updated by the subagents themselves (see agent prompt step 5). +After completion, the user can verify state with: + +``` +cat .claude/accuracy-sweep-state.json +``` + +To reset all tracking: `/sweep-accuracy --reset-state` + +--- + +## General Rules + +- Do NOT modify any source files directly. Subagents handle fixes via /rockout. +- Keep the output concise -- the table and agent dispatch are the deliverables. +- If $ARGUMENTS is empty, use defaults: top 3, no category filter, no exclusions. diff --git a/.claude/commands/sweep-security.md b/.claude/commands/sweep-security.md new file mode 100644 index 00000000..91dd5cb4 --- /dev/null +++ b/.claude/commands/sweep-security.md @@ -0,0 +1,250 @@ +# Security Sweep: Dispatch subagents to audit modules for security vulnerabilities + +Audit xrspatial modules for security issues specific to numeric/GPU raster +libraries: unbounded allocations, integer overflow, NaN logic bombs, GPU +kernel bounds, file path injection, and dtype confusion. Subagents fix +CRITICAL/HIGH issues via /rockout. + +Optional arguments: $ARGUMENTS +(e.g. 
`--top 3`, `--exclude slope,aspect`, `--only-io`, `--reset-state`) + +--- + +## Step 1 -- Gather module metadata via git and grep + +Enumerate candidate modules: + +**Single-file modules:** Every `.py` file directly under `xrspatial/`, excluding +`__init__.py`, `_version.py`, `__main__.py`, `utils.py`, `accessor.py`, +`preview.py`, `dataset_support.py`, `diagnostics.py`, `analytics.py`. + +**Subpackage modules:** `geotiff/`, `reproject/`, and `hydro/` directories under +`xrspatial/`. Treat each as a single audit unit. List all `.py` files within +each (excluding `__init__.py`). + +For every module, collect: + +| Field | How | +|-------|-----| +| **last_modified** | `git log -1 --format=%aI -- {module_path}` (for subpackages, most recent file) | +| **total_commits** | `git log --oneline -- {module_path} \| wc -l` | +| **loc** | `wc -l < {module_path}` (for subpackages, sum all files) | +| **has_cuda_kernels** | grep file(s) for `@cuda.jit` | +| **has_file_io** | grep file(s) for `open(`, `mkstemp`, `os.path`, `pathlib` | +| **has_numba_jit** | grep file(s) for `@ngjit`, `@njit`, `@jit`, `numba.jit` | +| **allocates_from_dims** | grep file(s) for `np.empty(height`, `np.zeros(height`, `np.empty(H`, `np.empty(h `, `cp.empty(`, and width variants | +| **has_shared_memory** | grep file(s) for `cuda.shared.array` | + +Store results in memory -- do NOT write intermediate files. + +## Step 2 -- Load inspection state + +Read `.claude/security-sweep-state.json`. + +If it does not exist, treat every module as never-inspected. + +If `$ARGUMENTS` contains `--reset-state`, delete the file and treat +everything as never-inspected. 
+ +State file schema: + +```json +{ + "inspections": { + "cost_distance": { + "last_inspected": "2026-04-10T14:00:00Z", + "issue": 1150, + "severity_max": "HIGH", + "categories_found": [1, 2] + } + } +} +``` + +## Step 3 -- Score each module + +``` +days_since_inspected = (today - last_inspected).days # 9999 if never +days_since_modified = (today - last_modified).days + +score = (days_since_inspected * 3) + + (has_file_io * 400) + + (allocates_from_dims * 300) + + (has_cuda_kernels * 250) + + (has_shared_memory * 200) + + (has_numba_jit * 100) + + (loc * 0.05) + - (days_since_modified * 0.2) +``` + +Rationale: +- File I/O is the only external-escape vector (400) +- Unbounded allocation is a DoS vector across all backends (300) +- CUDA bugs cause silent memory corruption (250) +- Shared memory overflow is a CUDA sub-risk (200) +- Numba JIT is ubiquitous -- lower weight avoids noise (100) +- Larger files have more surface area (0.05 per line) +- Recently modified code slightly deprioritized + +## Step 4 -- Apply filters from $ARGUMENTS + +- `--top N` -- only audit the top N modules (default: 3) +- `--exclude mod1,mod2` -- remove named modules from the list +- `--only-terrain` -- restrict to: slope, aspect, curvature, terrain, + terrain_metrics, hillshade, sky_view_factor +- `--only-focal` -- restrict to: focal, convolution, morphology, bilateral, + edge_detection, glcm +- `--only-hydro` -- restrict to: flood, cost_distance, geodesic, + surface_distance, viewshed, erosion, diffusion, hydro (subpackage) +- `--only-io` -- restrict to: geotiff, reproject, rasterize, polygonize + +## Step 5 -- Print the ranked table and launch subagents + +### 5a. 
Print the ranked table + +Print a markdown table showing ALL scored modules (not just selected ones), +sorted by score descending: + +``` +| Rank | Module | Score | Last Inspected | CUDA | FileIO | Alloc | Numba | LOC | +|------|-----------------|--------|----------------|------|--------|-------|-------|------| +| 1 | geotiff | 30600 | never | yes | yes | no | yes | 1400 | +| 2 | hydro | 30300 | never | yes | no | yes | yes | 8200 | +| ... | ... | ... | ... | ... | ... | ... | ... | ... | +``` + +### 5b. Launch subagents for the top N modules + +For each of the top N modules (default 3), launch an Agent in parallel using +`isolation: "worktree"` and `mode: "auto"`. All N agents must be dispatched +in a single message so they run concurrently. + +Each agent's prompt must be self-contained and follow this template (adapt +the module name, paths, and metadata): + +``` +You are auditing the xrspatial module "{module}" for security vulnerabilities. + +This module has {commits} commits and {loc} lines of code. + +Read these files: {module_files} + +Also read xrspatial/utils.py to understand _validate_raster() behavior. + +**Your task:** + +1. Read all listed files thoroughly. + +2. Audit for these 6 security categories. For each, look for the specific + patterns described. Only flag issues ACTUALLY present in the code. 
+ + **Cat 1 — Unbounded Allocation / Denial of Service** + - np.empty(), np.zeros(), np.full() where size comes from array dimensions + (height*width, H*W, nrows*ncols) without a configurable max or memory check + - CuPy equivalents (cp.empty, cp.zeros) + - Queue/heap arrays sized at height*width without bounds validation + Severity: HIGH if no memory guard exists; MEDIUM if a partial guard exists + + **Cat 2 — Integer Overflow in Index Math** + - height*width multiplication in int32 (overflows silently at ~46340x46340) + - Flat index calculations (r*width + c) in numba JIT without overflow check + - Queue index variables in int32 that could overflow for large arrays + Severity: HIGH for int32 overflow in production paths; MEDIUM for int64 + overflow only possible with unrealistic dimensions (>3 billion pixels) + + **Cat 3 — NaN/Inf as Logic Errors** + - Division without zero-check in numba kernels + - log/sqrt of potentially negative values without guard + - Accumulation loops that could hit Inf (summing many large values) + - Missing NaN propagation: NaN input silently produces finite output + - Incorrect NaN check: using == instead of != for NaN detection in numba + Severity: HIGH if in flood routing, erosion, viewshed, or cost_distance + (safety-critical modules); MEDIUM otherwise + + **Cat 4 — GPU Kernel Bounds Safety** + - CUDA kernels missing `if i >= H or j >= W: return` bounds guard + - cuda.shared.array with fixed size that could overflow with adversarial + input parameters + - Missing cuda.syncthreads() after shared memory writes before reads + - Thread block dimensions that could cause register spill or launch failure + Severity: CRITICAL if bounds guard is missing (out-of-bounds GPU write); + HIGH for shared memory overflow or missing syncthreads + + **Cat 5 — File Path Injection** + - File paths constructed from user strings without os.path.realpath() or + os.path.abspath() canonicalization + - Path traversal via ../ not prevented + - Temporary file 
creation in user-controlled directories + Severity: CRITICAL if user-provided path is used without any + canonicalization; HIGH if partial canonicalization is bypassable + + **Cat 6 — Dtype Confusion** + - Public API functions that do NOT call _validate_raster() on their inputs + - Numba kernels that assume float64 but could receive float32 or int arrays + - Operations where dtype mismatch causes silent wrong results (not an error) + - CuPy/NumPy backend inconsistency in dtype handling + Severity: HIGH if wrong results are silent; MEDIUM if an error occurs but + the error message is misleading + +3. For each real issue found, assign a severity (CRITICAL/HIGH/MEDIUM/LOW) + and note the exact file and line number. + +4. If any CRITICAL or HIGH issue is found, run /rockout to fix it end-to-end + (GitHub issue, worktree branch, fix, tests, and PR). + For MEDIUM/LOW issues, document them but do not fix. + +5. After finishing (whether you found issues or not), update the inspection + state file .claude/security-sweep-state.json by reading its current + contents and adding/updating the entry for "{module}" with: + - "last_inspected": today's ISO date + - "issue": the issue number from rockout (or null if clean / MEDIUM-only) + - "severity_max": highest severity found (or null if clean) + - "categories_found": list of category numbers that had findings (e.g. [1, 2]) + +Important: +- Only flag real, exploitable issues. False positives waste time. +- Read the tests for this module to understand expected behavior. +- For CUDA code, verify bounds guards are truly missing -- many kernels already + have `if i >= H or j >= W: return`. +- Do NOT flag the use of numba @jit itself as a security issue. Focus on what + the JIT code does, not that it uses JIT. +- For the hydro subpackage: focus on one representative variant (d8) in detail, + then note which dinf/mfd files share the same pattern. Do not read all 29 + files line by line. 
+- This repo uses ArrayTypeFunctionMapping to dispatch across numpy/cupy/dask + backends. Check all backend paths, not just numpy. +``` + +### 5c. Print a status line + +After dispatching, print: + +``` +Launched {N} security audit agents: {module1}, {module2}, {module3} +``` + +## Step 6 -- State updates + +State is updated by the subagents themselves (see agent prompt step 5). +After completion, verify state with: + +``` +cat .claude/security-sweep-state.json +``` + +To reset all tracking: `/sweep-security --reset-state` + +--- + +## General Rules + +- Do NOT modify any source files directly. Subagents handle fixes via /rockout. +- Keep the output concise -- the table and agent dispatch are the deliverables. +- If $ARGUMENTS is empty, use defaults: top 3, no category filter, no exclusions. +- State file (`.claude/security-sweep-state.json`) is gitignored by convention -- + do not add it to git. +- For subpackage modules (geotiff, reproject, hydro), the subagent should read + ALL `.py` files in the subpackage directory, not just `__init__.py`. +- Only flag patterns that are ACTUALLY present in the code. Do not report + hypothetical issues or patterns that "could" occur with imaginary inputs. +- False positives are worse than missed issues. When in doubt, skip. 
diff --git a/.claude/security-sweep-state.json b/.claude/security-sweep-state.json new file mode 100644 index 00000000..6ec530bd --- /dev/null +++ b/.claude/security-sweep-state.json @@ -0,0 +1,22 @@ +{ + "inspections": { + "reproject": { + "last_inspected": "2026-04-13", + "issue": null, + "severity_max": null, + "categories_found": [] + }, + "geotiff": { + "last_inspected": "2026-04-13", + "issue": 1195, + "severity_max": "HIGH", + "categories_found": [1] + }, + "hydro": { + "last_inspected": "2026-04-13", + "issue": null, + "severity_max": "MEDIUM", + "categories_found": [1, 6] + } + } +} diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 338e7d06..d991f123 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -19,6 +19,27 @@ from ._geotags import GeoInfo, GeoTransform, extract_geo_info from ._header import IFD, TIFFHeader, parse_all_ifds, parse_header +# --------------------------------------------------------------------------- +# Allocation guard: reject TIFF dimensions that would exhaust memory +# --------------------------------------------------------------------------- + +#: Default maximum total pixel count (width * height * samples). +#: ~1 billion pixels, which is ~4 GB for float32 single-band. +#: Override per-call via the ``max_pixels`` keyword argument. +MAX_PIXELS_DEFAULT = 1_000_000_000 + + +def _check_dimensions(width, height, samples, max_pixels): + """Raise ValueError if the requested allocation exceeds *max_pixels*.""" + total = width * height * samples + if total > max_pixels: + raise ValueError( + f"TIFF image dimensions ({width} x {height} x {samples} = " + f"{total:,} pixels) exceed the safety limit of " + f"{max_pixels:,} pixels. Pass a larger max_pixels value to " + f"read_to_array() if this file is legitimate." 
+ ) + # --------------------------------------------------------------------------- # Data source abstraction @@ -292,7 +313,8 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples, # --------------------------------------------------------------------------- def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, - dtype: np.dtype, window=None) -> np.ndarray: + dtype: np.dtype, window=None, + max_pixels: int = MAX_PIXELS_DEFAULT) -> np.ndarray: """Read a strip-organized TIFF image. Parameters @@ -307,6 +329,8 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, Output pixel dtype. window : tuple or None (row_start, col_start, row_stop, col_stop) or None for full image. + max_pixels : int + Maximum allowed pixel count (width * height * samples). Returns ------- @@ -344,6 +368,8 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, out_h = r1 - r0 out_w = c1 - c0 + _check_dimensions(out_w, out_h, samples, max_pixels) + if samples > 1: result = np.empty((out_h, out_w, samples), dtype=dtype) else: @@ -408,7 +434,8 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, # --------------------------------------------------------------------------- def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, - dtype: np.dtype, window=None) -> np.ndarray: + dtype: np.dtype, window=None, + max_pixels: int = MAX_PIXELS_DEFAULT) -> np.ndarray: """Read a tile-organized TIFF image. Parameters @@ -423,6 +450,8 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, Output pixel dtype. window : tuple or None (row_start, col_start, row_stop, col_stop) or None for full image. + max_pixels : int + Maximum allowed pixel count (width * height * samples). 
Returns ------- @@ -462,6 +491,8 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, out_h = r1 - r0 out_w = c1 - c0 + _check_dimensions(out_w, out_h, samples, max_pixels) + _alloc = np.zeros if window is not None else np.empty if samples > 1: result = _alloc((out_h, out_w, samples), dtype=dtype) @@ -545,7 +576,9 @@ def _decode_one(job): # --------------------------------------------------------------------------- def _read_cog_http(url: str, overview_level: int | None = None, - band: int | None = None) -> tuple[np.ndarray, GeoInfo]: + band: int | None = None, + max_pixels: int = MAX_PIXELS_DEFAULT, + ) -> tuple[np.ndarray, GeoInfo]: """Read a COG via HTTP range requests. Parameters @@ -556,6 +589,8 @@ def _read_cog_http(url: str, overview_level: int | None = None, Which overview to read (0 = full res, 1 = first overview, etc.). band : int Band index (0-based, for multi-band files). + max_pixels : int + Maximum allowed pixel count (width * height * samples). Returns ------- @@ -613,6 +648,8 @@ def _read_cog_http(url: str, overview_level: int | None = None, tiles_across = math.ceil(width / tw) tiles_down = math.ceil(height / th) + _check_dimensions(width, height, samples, max_pixels) + if samples > 1: result = np.empty((height, width, samples), dtype=dtype) else: @@ -653,7 +690,9 @@ def _read_cog_http(url: str, overview_level: int | None = None, # --------------------------------------------------------------------------- def read_to_array(source: str, *, window=None, overview_level: int | None = None, - band: int | None = None) -> tuple[np.ndarray, GeoInfo]: + band: int | None = None, + max_pixels: int = MAX_PIXELS_DEFAULT, + ) -> tuple[np.ndarray, GeoInfo]: """Read a GeoTIFF/COG to a numpy array. Parameters @@ -666,13 +705,18 @@ def read_to_array(source: str, *, window=None, overview_level: int | None = None Overview level (0 = full res). band : int Band index for multi-band files. 
+ max_pixels : int + Maximum allowed total pixel count (width * height * samples). + Prevents memory exhaustion from crafted TIFF headers. + Default is 1 billion (~4 GB for float32 single-band). Returns ------- (np.ndarray, GeoInfo) tuple """ if source.startswith(('http://', 'https://')): - return _read_cog_http(source, overview_level=overview_level, band=band) + return _read_cog_http(source, overview_level=overview_level, band=band, + max_pixels=max_pixels) # Local file or cloud storage: read all bytes then parse if _is_fsspec_uri(source): @@ -701,9 +745,11 @@ def read_to_array(source: str, *, window=None, overview_level: int | None = None geo_info = extract_geo_info(ifd, data, header.byte_order) if ifd.is_tiled: - arr = _read_tiles(data, ifd, header, dtype, window) + arr = _read_tiles(data, ifd, header, dtype, window, + max_pixels=max_pixels) else: - arr = _read_strips(data, ifd, header, dtype, window) + arr = _read_strips(data, ifd, header, dtype, window, + max_pixels=max_pixels) # For multi-band with band selection, extract single band if arr.ndim == 3 and ifd.samples_per_pixel > 1 and band is not None: diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index ca817f14..616c84b1 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -136,6 +136,8 @@ def parse_vrt(xml_str: str, vrt_dir: str = '.') -> VRTDataset: relative.get('relativeToVRT', '0') == '1') if is_relative and not os.path.isabs(filename): filename = os.path.join(vrt_dir, filename) + # Canonicalize to prevent path traversal (e.g. 
../) + filename = os.path.realpath(filename) src_band = int(_text(src_elem, 'SourceBand') or '1') diff --git a/xrspatial/geotiff/tests/test_features.py b/xrspatial/geotiff/tests/test_features.py index 06d76cfd..8aabc689 100644 --- a/xrspatial/geotiff/tests/test_features.py +++ b/xrspatial/geotiff/tests/test_features.py @@ -1,6 +1,8 @@ """Tests for new features: multi-band, integer nodata, packbits, zstd, dask, BigTIFF.""" from __future__ import annotations +import os + import numpy as np import pytest import xarray as xr @@ -823,12 +825,10 @@ def test_vrt_parser(self): assert vrt.bands[0].nodata == 0.0 assert len(vrt.bands[0].sources) == 1 src = vrt.bands[0].sources[0] - assert src.filename == '/data/tile.tif' + assert src.filename == os.path.realpath('/data/tile.tif') assert src.src_rect.x_off == 10 -import os - class TestCloudStorage: def test_cloud_scheme_detection(self): diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py new file mode 100644 index 00000000..a3dd8c88 --- /dev/null +++ b/xrspatial/geotiff/tests/test_security.py @@ -0,0 +1,194 @@ +"""Security tests for the geotiff subpackage. 
+ +Tests for: +- Unbounded allocation guard (issue #1184) +- VRT path traversal prevention (issue #1185) +""" +from __future__ import annotations + +import os +import struct +import tempfile + +import numpy as np +import pytest + +from xrspatial.geotiff._reader import ( + MAX_PIXELS_DEFAULT, + _check_dimensions, + _read_strips, + _read_tiles, + read_to_array, +) +from xrspatial.geotiff._header import parse_header, parse_all_ifds +from xrspatial.geotiff._dtypes import tiff_dtype_to_numpy +from .conftest import make_minimal_tiff + + +# --------------------------------------------------------------------------- +# Cat 1: Unbounded allocation guard +# --------------------------------------------------------------------------- + +class TestDimensionGuard: + def test_check_dimensions_rejects_oversized(self): + """_check_dimensions raises when total pixels exceed the limit.""" + with pytest.raises(ValueError, match="exceed the safety limit"): + _check_dimensions(100_000, 100_000, 1, MAX_PIXELS_DEFAULT) + + def test_check_dimensions_accepts_normal(self): + """_check_dimensions does not raise for normal sizes.""" + _check_dimensions(1000, 1000, 1, MAX_PIXELS_DEFAULT) + + def test_check_dimensions_considers_samples(self): + """Multi-band images multiply the pixel budget.""" + # 50_000 x 50_000 x 3 = 7.5 billion, should be rejected + with pytest.raises(ValueError, match="exceed the safety limit"): + _check_dimensions(50_000, 50_000, 3, MAX_PIXELS_DEFAULT) + + def test_custom_limit(self): + """A custom max_pixels lets callers tighten or relax the limit.""" + # Tight limit: 100 pixels + with pytest.raises(ValueError, match="exceed the safety limit"): + _check_dimensions(20, 20, 1, max_pixels=100) + + # Relaxed: passes with large limit + _check_dimensions(100_000, 100_000, 1, max_pixels=100_000_000_000) + + def test_read_strips_rejects_huge_header(self): + """_read_strips refuses to allocate when header claims huge dims.""" + # Build a valid TIFF with small pixel data but huge 
header dimensions. + # We fake the header to claim 100000x100000 but only provide 4x4 data. + data = make_minimal_tiff(4, 4, np.dtype('float32')) + header = parse_header(data) + ifds = parse_all_ifds(data, header) + ifd = ifds[0] + + # Monkey-patch the IFD width/height to simulate a crafted header + from xrspatial.geotiff._header import IFDEntry + ifd.entries[256] = IFDEntry(tag=256, type_id=3, count=1, value=100_000) + ifd.entries[257] = IFDEntry(tag=257, type_id=3, count=1, value=100_000) + + dtype = tiff_dtype_to_numpy(ifd.bits_per_sample, ifd.sample_format) + + with pytest.raises(ValueError, match="exceed the safety limit"): + _read_strips(data, ifd, header, dtype, max_pixels=1_000_000) + + def test_read_tiles_rejects_huge_header(self): + """_read_tiles refuses to allocate when header claims huge dims.""" + data = make_minimal_tiff(8, 8, np.dtype('float32'), tiled=True, tile_size=4) + header = parse_header(data) + ifds = parse_all_ifds(data, header) + ifd = ifds[0] + + from xrspatial.geotiff._header import IFDEntry + ifd.entries[256] = IFDEntry(tag=256, type_id=3, count=1, value=100_000) + ifd.entries[257] = IFDEntry(tag=257, type_id=3, count=1, value=100_000) + + dtype = tiff_dtype_to_numpy(ifd.bits_per_sample, ifd.sample_format) + + with pytest.raises(ValueError, match="exceed the safety limit"): + _read_tiles(data, ifd, header, dtype, max_pixels=1_000_000) + + def test_read_to_array_max_pixels_kwarg(self, tmp_path): + """read_to_array passes max_pixels through to the internal readers.""" + expected = np.arange(16, dtype=np.float32).reshape(4, 4) + data = make_minimal_tiff(4, 4, np.dtype('float32'), pixel_data=expected) + path = str(tmp_path / "small.tif") + with open(path, 'wb') as f: + f.write(data) + + # Should succeed with a generous limit + arr, _ = read_to_array(path, max_pixels=1_000_000) + np.testing.assert_array_equal(arr, expected) + + # Should fail with a tiny limit + with pytest.raises(ValueError, match="exceed the safety limit"): + 
read_to_array(path, max_pixels=10) + + def test_normal_read_unaffected(self, tmp_path): + """Normal reads within the default limit are not affected.""" + expected = np.arange(64, dtype=np.float32).reshape(8, 8) + data = make_minimal_tiff(8, 8, np.dtype('float32'), pixel_data=expected) + path = str(tmp_path / "normal.tif") + with open(path, 'wb') as f: + f.write(data) + + arr, _ = read_to_array(path) + np.testing.assert_array_equal(arr, expected) + + +# --------------------------------------------------------------------------- +# Cat 5: VRT path traversal +# --------------------------------------------------------------------------- + +class TestVRTPathTraversal: + def test_relative_path_canonicalized(self, tmp_path): + """Relative paths in VRT SourceFilename are canonicalized.""" + from xrspatial.geotiff._vrt import parse_vrt + + vrt_xml = ''' + + + ../../../etc/shadow + 1 + + + + +''' + + vrt_dir = str(tmp_path / "subdir") + os.makedirs(vrt_dir) + + vrt = parse_vrt(vrt_xml, vrt_dir) + source_path = vrt.bands[0].sources[0].filename + + # After canonicalization, the path should NOT contain ".." + assert ".." 
not in source_path + # It should be an absolute path + assert os.path.isabs(source_path) + # Verify it was resolved through realpath + assert source_path == os.path.realpath(source_path) + + def test_normal_relative_path_still_works(self, tmp_path): + """Normal relative paths without traversal still resolve correctly.""" + from xrspatial.geotiff._vrt import parse_vrt + + vrt_xml = ''' + + + data/tile.tif + 1 + + + + +''' + + vrt_dir = str(tmp_path) + vrt = parse_vrt(vrt_xml, vrt_dir) + source_path = vrt.bands[0].sources[0].filename + + expected = os.path.realpath(os.path.join(vrt_dir, "data", "tile.tif")) + assert source_path == expected + + def test_absolute_path_also_canonicalized(self, tmp_path): + """Absolute paths in VRT are also canonicalized.""" + from xrspatial.geotiff._vrt import parse_vrt + + vrt_xml = ''' + + + /tmp/../tmp/test.tif + 1 + + + + +''' + + vrt = parse_vrt(vrt_xml, str(tmp_path)) + source_path = vrt.bands[0].sources[0].filename + + assert ".." not in source_path + assert source_path == os.path.realpath("/tmp/../tmp/test.tif")