Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]

- Sort converter output by Hilbert distance against the CRS's total bounds instead of by raw WKB byte order.

## [v0.2.15] - 2026-02-16

- Add a `get_default_collection` to the Registry
Expand Down
43 changes: 43 additions & 0 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import sys
from pathlib import Path

import geopandas as gpd
import numpy as np
import pytest
from loguru import logger

from vecorel_cli.conversion.base import BaseConverter
from vecorel_cli.convert import ConvertData
from vecorel_cli.registry import Registry
from vecorel_cli.validate import ValidateData
from vecorel_cli.vecorel.hilbert import crs_total_bounds, hilbert_distances_from_bounds


@pytest.fixture(autouse=True)
Expand Down Expand Up @@ -41,6 +44,46 @@ def test_converter(tmp_folder, choice):
assert validation_result.errors == []


@pytest.mark.parametrize("choice", ["example"])
def test_converter_hilbert_curve_order(tmp_folder, choice):
"""The converter must emit rows in Hilbert-curve order against the CRS's
total bounds. This is what makes per-partition outputs of the same dataset
mergeable without re-sorting: they all share a CRS-derived Hilbert grid."""
dest = tmp_folder / "converted.parquet"
ConvertData(choice).convert(dest, cache=(test_path / choice))

gdf = gpd.read_parquet(dest)
assert len(gdf) > 1, "Need at least 2 rows to verify ordering."

total_bounds = crs_total_bounds(gdf.crs)
bounds = gdf.geometry.bounds.to_numpy(dtype=np.float64, copy=False)
keys = hilbert_distances_from_bounds(bounds, total_bounds)

diffs = np.diff(keys)
if np.any(diffs < 0):
bad = int(np.argmax(diffs < 0))
raise AssertionError(
"Output rows are not in non-decreasing Hilbert order against the "
f"CRS total bounds ({total_bounds}).\n"
f" First out-of-order pair at index {bad}->{bad + 1}: "
f"key {int(keys[bad])} > {int(keys[bad + 1])}."
)


def test_crs_total_bounds_geographic():
# Geographic CRS without a more restrictive area_of_use falls back to world.
assert crs_total_bounds("EPSG:4326") == (-180.0, -90.0, 180.0, 90.0)


def test_crs_total_bounds_etrs89():
# EPSG:4258 is geographic with a European area_of_use; the returned bounds
# must reflect that AoU rather than world bounds (otherwise Hilbert keys
# for European data would all be quantised into one tiny cell).
bounds = crs_total_bounds("EPSG:4258")
assert bounds[0] > -180.0 and bounds[2] < 180.0, bounds
assert bounds[1] > -90.0 and bounds[3] < 90.0, bounds


def test_not_existing_converter(tmp_folder):
with pytest.raises(Exception, match="Converter 'not_existing' not found"):
converter = ConvertData("not_existing")
Expand Down
6 changes: 6 additions & 0 deletions vecorel_cli/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@

GEOPARQUET_VERSIONS = ["1.0.0", "1.1.0"]
GEOPARQUET_DEFAULT_VERSION = "1.1.0"

# Default level used when sorting features along a Hilbert curve. A level of N
# splits each axis of the reference bounds into 2**N cells; 16 gives a
# 65,536 x 65,536 grid, fine enough for world-scale references at sub-km
# resolution near the equator.
HILBERT_DEFAULT_LEVEL = 16
8 changes: 7 additions & 1 deletion vecorel_cli/conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ..cli.util import display_pandas_unrestricted
from ..encoding.geoparquet import GeoParquet
from ..vecorel.collection import Collection
from ..vecorel.hilbert import hilbert_sort_geodataframe
from ..vecorel.schemas import Schemas
from ..vecorel.typing import Sources
from ..vecorel.util import get_fs, name_from_uri, stream_file
Expand Down Expand Up @@ -408,7 +409,12 @@ def convert(
self.info("Removing Z geometry dimension")
gdf.geometry = gdf.geometry.force_2d()

gdf.sort_values("geometry", inplace=True, ignore_index=True)
# Sort by Hilbert distance against the CRS's total bounds. This gives
# row groups good spatial locality, and — crucially — produces the same
# ordering for any independently-converted partition of the same dataset
# (they share a CRS, therefore a Hilbert reference grid), so per-file
# outputs can be merged later without re-sorting.
gdf = hilbert_sort_geodataframe(gdf)

# 8. Remove all columns that are not listed
drop_columns = list(set(gdf.columns) - set(actual_columns.values()))
Expand Down
111 changes: 111 additions & 0 deletions vecorel_cli/vecorel/hilbert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""Hilbert curve sorting against CRS-stable reference bounds.

Sorting by Hilbert distance gives row groups good spatial locality, which
improves bbox-pushdown query performance and downstream compression. We
deliberately sort against the *CRS's* declared extent rather than the
dataframe's own bbox: that way, two independently produced files (for
example two provincial parts that will be merged later) end up using the
same Hilbert reference grid, and their orderings are comparable.
"""

from __future__ import annotations

import numpy as np

from ..const import HILBERT_DEFAULT_LEVEL

# Geographic world bounds, used as the fallback when a CRS exposes no
# `area_of_use` (e.g. some custom or aggregated CRSs).
_GEOGRAPHIC_WORLD_BOUNDS = (-180.0, -90.0, 180.0, 90.0)


def crs_total_bounds(crs) -> tuple[float, float, float, float]:
"""Return a stable, deterministic ``(xmin, ymin, xmax, ymax)`` covering the
CRS's valid extent expressed in the CRS's own coordinate units.

For geographic CRSs this returns the CRS's declared ``area_of_use`` in
degrees. For projected CRSs the geographic ``area_of_use`` is projected
onto the CRS axes (sampled along a 25x25 grid to capture curvature).

The result is independent of any particular dataset: it depends only on
the CRS. Two independently-converted parts of the same dataset therefore
share the same Hilbert reference grid and their orderings can be merged
without recomputation.
"""
from pyproj import CRS, Transformer

if crs is None:
return _GEOGRAPHIC_WORLD_BOUNDS

c = CRS.from_user_input(crs)
aou = c.area_of_use

if c.is_geographic:
if aou is None:
return _GEOGRAPHIC_WORLD_BOUNDS
return (float(aou.west), float(aou.south), float(aou.east), float(aou.north))

if aou is None:
raise ValueError(
f"Cannot derive total bounds for projected CRS {crs!r}: no area_of_use defined"
)

transformer = Transformer.from_crs("EPSG:4326", c, always_xy=True)
n = 25
xs = np.linspace(aou.west, aou.east, n)
ys = np.linspace(aou.south, aou.north, n)
grid_x, grid_y = np.meshgrid(xs, ys)
px, py = transformer.transform(grid_x.ravel(), grid_y.ravel())
px = np.asarray(px, dtype=np.float64)
py = np.asarray(py, dtype=np.float64)
mask = np.isfinite(px) & np.isfinite(py)
if not mask.any():
raise ValueError(f"Could not project area_of_use into CRS {crs!r}")
return (
float(px[mask].min()),
float(py[mask].min()),
float(px[mask].max()),
float(py[mask].max()),
)


def hilbert_distances_from_bounds(
bounds: np.ndarray,
total_bounds: tuple[float, float, float, float],
level: int = HILBERT_DEFAULT_LEVEL,
) -> np.ndarray:
"""Compute per-row Hilbert distances from feature bbox rows + CRS bounds.

``bounds`` must be an ``(N, 4)`` float64 array of ``[xmin, ymin, xmax, ymax]``
per feature. Returns a ``(N,)`` uint64 array of Hilbert keys; sorting by
this key gives the Hilbert-curve ordering at the given resolution.
"""
from geopandas.tools.hilbert_curve import _continuous_to_discrete_coords, _encode

x, y = _continuous_to_discrete_coords(bounds, level, list(total_bounds))
return _encode(level, x, y).astype(np.uint64, copy=False)


def hilbert_sort_geodataframe(gdf, level: int = HILBERT_DEFAULT_LEVEL):
"""Return a copy of ``gdf`` sorted by Hilbert distance against the CRS's
total bounds. The original index is reset.

If ``gdf.crs`` is None or has no area_of_use, falls back to geographic
world bounds; in that case the resulting order is still stable but Hilbert
keys may not be comparable to other independently-produced outputs.
"""
if gdf.empty:
return gdf.reset_index(drop=True)

total = crs_total_bounds(gdf.crs)
bounds_2d: np.ndarray = gdf.geometry.bounds.to_numpy(dtype=np.float64, copy=False)
keys = hilbert_distances_from_bounds(bounds_2d, total, level=level)
order = np.argsort(keys, kind="stable")
return gdf.iloc[order].reset_index(drop=True)


__all__ = [
"crs_total_bounds",
"hilbert_distances_from_bounds",
"hilbert_sort_geodataframe",
]
Loading