From 28f01f473b135a2f0a0b95451ee09d20f86a538a Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Mon, 20 Apr 2026 01:03:59 +0900 Subject: [PATCH 01/30] docs: add admin UI and key visualizer design MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Proposes a standalone cmd/elastickv-admin binary and a TiKV-style key visualizer heatmap. Avoids the Prometheus client dependency in the initial phases by adding an in-process LiveSummary alongside the existing observers, and keeps sampler hot-path overhead below the benchmark noise floor via adaptive 1-in-N sampling with a ≥95% capture SLO. --- docs/admin_ui_key_visualizer_design.md | 260 +++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 docs/admin_ui_key_visualizer_design.md diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md new file mode 100644 index 00000000..3536cf62 --- /dev/null +++ b/docs/admin_ui_key_visualizer_design.md @@ -0,0 +1,260 @@ +# Admin UI and Key Visualizer Design for Elastickv + +## 1. Background + +Elastickv currently exposes four data-plane surfaces (gRPC `RawKV`/`TransactionalKV`, Redis, DynamoDB, S3) and one control-plane surface (`Distribution.ListRoutes`, `SplitRange`). Operational insight is provided today by: + +- Prometheus metrics on `--metricsAddress` (default `:9090`), backed by `monitoring.Registry` (`monitoring/registry.go:12`). +- Pre-built Grafana dashboards under `monitoring/grafana/`. +- `grpcurl` against the `Distribution` and `RaftAdmin` services. +- `cmd/raftadmin` and `cmd/client` CLIs. + +There is no first-party Web UI, and — critically — no per-key or per-route traffic signal. Operators cannot answer questions such as "which key range is hot right now?", "is the load skewed across Raft groups?", or "did the last `SplitRange` actually relieve the hotspot?" 
without building ad-hoc Prometheus queries, and even those queries cannot drill below the Raft-group aggregate. + +This document proposes a built-in admin Web UI, shipped as a separate binary `cmd/elastickv-admin`, and a TiKV-style **Key Visualizer** that renders a time × key-range heatmap of load. The design reuses existing control-plane gRPC APIs (routes, Raft status) and adds a minimal, hot-path-safe sampler for per-route traffic. The initial milestones intentionally avoid depending on the Prometheus client library so that the admin binary remains independently buildable and shippable. + +## 2. Goals and Non-goals + +### 2.1 Goals + +1. Ship a standalone admin binary `cmd/elastickv-admin` that connects to one or more elastickv nodes over gRPC and serves a Web UI. +2. Provide a single UI that covers cluster overview, routes, Raft groups, adapter throughput, and the key visualizer. +3. Produce a time × key-space heatmap with at least four switchable series: read count, write count, read bytes, write bytes. +4. Follow hotspot shards across `SplitRange` / merge events so the heatmap stays continuous. +5. Keep the sampler's hot-path overhead within the measurement noise floor; define success as "≥95% of operations captured with no detectable regression in coordinator benchmarks." +6. Stay off the Prometheus client library in Phases 0–3. Traffic counters used by the UI are maintained by the in-process sampler and a small adapter-side aggregator that already exists on the hot path. +7. Make the admin binary easy to deploy: a single Go binary with the SPA embedded via `go:embed`, producing one artifact per platform in CI. + +### 2.2 Non-goals + +1. Replacement of the existing Grafana dashboards. The admin UI focuses on cluster state and the keyspace view; long-horizon trend analysis remains a Prometheus/Grafana concern. +2. Per-individual-key statistics. The visualizer operates on route-level buckets, not on a `GET` / `PUT` trace. +3. 
Authentication or authorization in the initial milestones. The admin binary binds to localhost by default and expects operators to layer their own access control (SSH tunnel, reverse proxy, network ACL). Authentication is out of scope for Phases 0–4. +4. Query console (SQL/Redis/DynamoDB REPL) inside the UI. Deferred. +5. Multi-cluster federation. Scope is a single cluster; the admin binary may target any single node. + +## 3. High-level Architecture + +```mermaid +flowchart LR + Browser["Browser (Svelte SPA, embedded)"] + + subgraph AdminHost["Operator machine or sidecar"] + Admin["cmd/elastickv-admin :8080"] + end + + subgraph Cluster["Elastickv Cluster"] + Node1["Node A"] + Node2["Node B"] + Node3["Node C"] + end + + Browser -- "HTTP/JSON + WebSocket" --> Admin + Admin -- "gRPC: Distribution, RaftAdmin, Admin.KeyViz" --> Node1 + Admin -- "gRPC" --> Node2 + Admin -- "gRPC" --> Node3 + + subgraph NodeInternal["Inside each Node"] + Sampler["keyviz.Sampler"] + Coord["kv.ShardedCoordinator"] + Dist["distribution.Engine"] + Raft["raftengine.StatusReader"] + AdminSvc["Admin gRPC Service"] + end + + Coord -- "Observe(routeID, op, size)" --> Sampler + AdminSvc --> Sampler + AdminSvc --> Dist + AdminSvc --> Raft +``` + +The admin binary holds no authoritative state. All data is fetched on demand from nodes via a new `Admin` gRPC service. The sampler's ring buffer lives inside each node's process, rebuildable after restart (see §5.5). + +### 3.1 Why a separate binary + +- Release cadence for the UI is decoupled from the data plane. +- The admin binary can be placed on an operator workstation or a sidecar pod, so a compromised UI does not imply a compromised data node. +- Node binaries remain free of the Prometheus client (goal §2.1-6) and of any SPA assets. +- `cmd/elastickv-admin --node=host:50051` is the full invocation; no config files are required for the default use case. + +## 4. 
API Surface + +Two layers: + +**Layer A — gRPC, node → admin binary.** A new `Admin` service on each node, registered on the same gRPC port as `RawKV` (`--address`, default `:50051`). All methods are read-only in Phases 0–3. + +| RPC | Purpose | +|---|---| +| `GetClusterOverview` | Node identity, Raft leader map per group, aggregate QPS | +| `ListRoutes` | Existing `Distribution.ListRoutes` (reused, not duplicated) | +| `GetRaftGroups` | Per-group state (leader, term, commit/applied, last contact) | +| `GetAdapterSummary` | Per-adapter QPS and latency quantiles from the in-process aggregator | +| `GetKeyVizMatrix` | Heatmap matrix (see §5.4) | +| `GetRouteDetail` | Time series for one route (drill-down) | +| `StreamEvents` | Server-stream of route-state transitions and fresh matrix columns | + +**Layer B — HTTP/JSON, browser → admin binary.** Thin pass-through wrappers over the gRPC calls, plus static asset serving. + +| Method | Path | Purpose | +|---|---|---| +| GET | `/` (and `/assets/*`) | Embedded SPA | +| GET | `/api/cluster/overview` | Wraps `GetClusterOverview` | +| GET | `/api/routes` | Wraps `ListRoutes` + derived size/leader | +| GET | `/api/raft/groups` | Wraps `GetRaftGroups` | +| GET | `/api/adapters/summary` | Wraps `GetAdapterSummary` | +| GET | `/api/keyviz/matrix` | Wraps `GetKeyVizMatrix` | +| GET | `/api/keyviz/routes/{routeID}` | Wraps `GetRouteDetail` | +| WS | `/api/stream` | Multiplexes `StreamEvents` from all targeted nodes | + +HTTP errors use a minimal `{code, message}` envelope. No caching headers on read endpoints. 
+ +### 4.1 `GetKeyVizMatrix` parameters + +| Field | Type | Default | Notes | +|---|---|---|---| +| `series` | enum(`reads`,`writes`,`readBytes`,`writeBytes`) | `writes` | Selects which counter is returned | +| `from` | timestamp | now−1h | Inclusive | +| `to` | timestamp | now | Exclusive | +| `rows` | int | 256 | Target Y-axis resolution (server may return fewer) | + +Response matrix format: `matrix[i][j]` is the value for bucket `i` at time column `j`. Keys in `start`/`end` are raw bytes; the server supplies `label` as a printable preview (§5.6). + +## 5. Key Visualizer + +### 5.1 Sampling point + +A single call site is added at the dispatch entry of `kv.ShardedCoordinator` (see `kv/sharded_coordinator.go`), immediately after the request is resolved to a `RouteID`: + +```go +sampler.Observe(routeID, op, keyLen, valueLen) +``` + +`sampler` is an interface; the default implementation is lock-free and nil-safe, so a nil sampler compiles to a predictable branch and no allocation. The hook runs *before* Raft proposal so it measures offered load, not applied load. + +Reads and writes are both sampled on the leader. Followers do not sample, because follower-local reads flow through the same coordinator path on the follower. + +### 5.2 Adaptive sub-sampling and the 95% SLO + +Observing every call is cheap but not free. To guarantee "no detectable regression," the sampler uses **adaptive 1-in-N sampling per route**: + +- Each route maintains an atomic `sampleRate` counter. +- Under low load (below a threshold QPS per route), `sampleRate = 1` (every op counted). +- As per-route QPS rises, `sampleRate` doubles stepwise, and the increment applied is multiplied by `sampleRate` so counters remain unbiased estimators. +- The controller targets **≥95% capture rate** in steady state: `sampleRate` is only raised when the write contention on the atomic increment itself crosses a noise-floor CPU threshold, measured at flush time. 
+- Worst-case error on per-bucket totals is bounded by `1/sqrt(observedSamples)`, so buckets with fewer samples are tagged in the response for the UI to hatch them. + +Benchmark gate in CI: run `BenchmarkCoordinatorDispatch` with the sampler disabled and with it enabled; the delta must be within the benchmark's own run-to-run variance (noise floor), not a fixed percentage. If a future change inflates variance, the gate fails until the noise floor is reduced or the sampler is made cheaper. + +### 5.3 In-memory representation + +``` +Sampler + ├─ routes map[RouteID]*routeCounters // current 1s window (reads,writes,readBytes,writeBytes, plus sampleRate) + └─ history *ringBuffer[matrixColumn] // one column per stepSeconds (default 60s) +``` + +Every `stepSeconds` a flush goroutine swaps the map into a new column of the ring buffer. + +The ring buffer default is **24 hours of 60 s columns = 1440 columns**. Memory estimate: `1440 × routes × 4 × 8B`. For 10 k routes: ~460 MiB. The flush goroutine compacts columns beyond 1 hour into 5-minute aggregates, bringing the steady-state cost to under 80 MiB for the same route count. + +### 5.4 Keeping up with splits and merges + +`distribution.Engine` already emits a watch stream on route-state transitions. The sampler subscribes and, on a split, copies the parent route's historical column values into both children so the heatmap stays visually continuous across the event. On a merge, child columns are summed into the surviving parent. The current `routeCounters` map is updated atomically under the watch callback; in-flight `Observe` calls that raced with the transition are attributed to whichever route is visible at observe time — acceptable because the loss is bounded by a single step window. + +### 5.5 Bucketing for the response + +The API's `rows` parameter is a *target*, not a guarantee. The server walks the route list in lexicographic order of `start` and greedily merges adjacent routes until the row count fits. 
Merge priority: lowest total activity across the requested window, so hotspots stay un-merged and visible. + +### 5.6 Persistence + +Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptable for an MVP and keeps the Raft critical path untouched. + +Phase 3 writes compacted columns to the default Raft group under reserved keys `!admin|keyviz||...`. Writes are batched once per flush and are not part of user transactions, so they cannot stall the data plane. A TTL of 7 days is applied via the existing HLC-based expiry (`store/lsm_store.go:24`). + +### 5.7 Key preview labels + +Raw keys are binary. The UI needs a printable hint per bucket. Strategy: + +1. If all keys in the bucket's `[start, end)` are valid UTF-8 with no control characters, return the common byte prefix truncated to 24 chars. +2. Otherwise, return a hex preview of the common prefix plus `…`. +3. Internal reserved prefixes (`!txn|`, `!dist|*`, `!admin|*`) are labelled explicitly and rendered with a distinct color in the UI, so system traffic is never confused with user traffic. + +## 6. Adapter Summary Without Prometheus + +The existing `monitoring.Registry` observers record into Prometheus counters/histograms — useful for Grafana, but not readable back without pulling in the Prometheus client library. To keep the admin binary and node binary free of that dependency during Phases 0–3: + +- A small sibling struct `monitoring.LiveSummary` is added alongside each observer. It maintains, in parallel with the existing Prometheus writes, an in-process rolling window (10-second buckets, 5-minute history) of request count and latency reservoir samples per adapter and per operation. +- `LiveSummary` is read-only from the outside and lock-free on the write path (atomic counters + a tiny t-digest per op). +- `GetAdapterSummary` reads directly from `LiveSummary`. The Prometheus exposition remains unchanged and untouched. 
+ +This adds roughly a dozen integer fields per tracked operation and avoids both the Prometheus dependency and the need to scrape `/metrics` from within the admin binary. + +## 7. Frontend + +- **Stack**: SvelteKit (static adapter) + TypeScript + Tailwind + ECharts (`heatmap` series). +- **Why Svelte**: smaller bundle (~150 KB gzipped for the full app vs ~350 KB for React + equivalent libs), fewer transitive dependency updates to audit, trivial static build that embeds cleanly with `go:embed`. Selected explicitly to favour maintenance simplicity and deployment size. +- **Layout**: left nav with Overview / Routes / Raft / Adapters / Key Visualizer. +- **Key Visualizer page**: + - X-axis time, Y-axis route buckets, brush-to-zoom on both axes. + - Series switcher (reads / writes / readBytes / writeBytes). + - Range selection opens a drawer with the underlying route list, current leader, size, and a link to the Raft group page. + - Live mode: a WebSocket push appends a new column every `stepSeconds` without refetching history. + - Buckets below the 95%-capture threshold are hatched to signal estimation uncertainty. +- **Build**: `web/` at repo root, `pnpm build` output copied to `cmd/elastickv-admin/dist/`, embedded with `//go:embed dist`. +- **Dev flow**: Vite dev server on `:5173` proxies `/api` and `/stream` to a locally running `cmd/elastickv-admin`. + +## 8. Integration Points + +| File | Change | +|---|---| +| `cmd/elastickv-admin/` (new) | Main, HTTP server, gRPC clients, embedded SPA. | +| `adapter/admin_grpc.go` (new) | Server-side implementation of the `Admin` gRPC service, registered in `main.go`. | +| `proto/admin.proto` (new) | Service definition for `Admin`. | +| `kv/sharded_coordinator.go` | One-line `sampler.Observe(...)` at dispatch entry; `sampler` is `keyviz.Sampler` injected via constructor, nil-safe. | +| `keyviz/` (new) | `Sampler`, adaptive sub-sampler, ring buffer, route-watch subscriber, preview logic, tests. 
| +| `monitoring/live_summary.go` (new) | Rolling-window adapter counters, hooked into existing observers. | +| `main.go` | Register `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers. No new flags on the node binary. | +| `web/` (new) | Svelte SPA source. | + +No changes to Raft, FSM, MVCC, or any protocol adapter beyond the single sampler call site and the `LiveSummary` hook that sits next to the existing Prometheus writes. + +## 9. Deployment and Operation + +- The admin binary is not intended to be exposed on the public network in its initial form. It has no auth. Default bind is `127.0.0.1:8080`. +- Typical operator workflow: `ssh -L 8080:localhost:8080 operator@host` then `elastickv-admin --node=localhost:50051`, or run the binary on a laptop and point it at a reachable node. +- The admin binary is stateless; it can be killed and restarted without coordination. +- CI produces release artifacts for `linux/amd64`, `linux/arm64`, `darwin/arm64`, and `windows/amd64`. + +## 10. Performance Considerations + +- Sampler fast path: one atomic read of `sampleRate`, a modulo check, and four atomic increments on a hit. No allocation per call. +- The coordinator already holds the `RouteID` at the hook site, so the sampler does not re-resolve. +- The flush goroutine takes a write lock on the route-counters map exactly once per `stepSeconds`. A CAS-based double-buffer is the first optimisation if profiling shows contention. +- API endpoints cap `to − from` at 7 days and `rows` at 1024 to bound server work. +- `LiveSummary` adds a second atomic increment alongside each existing Prometheus `Inc()`; its cost is on the order of a nanosecond and well below the noise floor in §5.2. + +## 11. Testing + +1. Unit tests for `keyviz.Sampler` (concurrent Observe, adaptive sub-sampling correctness, flush, split/merge reshaping, 95% capture SLO under synthetic load). +2. 
Integration test in `kv/` that drives synthetic traffic through the coordinator and asserts the matrix reflects the skew. +3. gRPC handler tests with a fake engine and fake Raft status reader. +4. Benchmark gate: `BenchmarkCoordinatorDispatch` with sampler off vs on. CI fails if the difference exceeds the benchmark's own run-to-run variance. +5. Playwright smoke test against the embedded SPA to catch build-time regressions. + +## 12. Phased Delivery + +| Phase | Scope | Exit criteria | +|---|---|---| +| 0 | `cmd/elastickv-admin` skeleton, `Admin` gRPC service stub, empty SPA shell, CI wiring. | Binary builds, `/api/cluster/overview` returns live data from a real node. | +| 1 | Overview, Routes, Raft Groups, Adapters pages. `LiveSummary` added. No sampler. | All read-only pages match `grpcurl` ground truth. | +| 2 | Key Visualizer MVP: in-memory sampler with adaptive sub-sampling, reads/writes series, static matrix API. | Benchmark gate green; heatmap shows synthetic hotspot within 2 s of load; capture rate ≥95% at target QPS. | +| 3 | Bytes series, drill-down, split/merge continuity, persistence of compacted columns in the default Raft group. | Heatmap remains continuous across a live `SplitRange`; restart preserves last 7 days. | +| 4 (deferred) | Mutating admin operations (`SplitRange` from UI), authentication. Out of scope for this design; a follow-up design will cover it. | — | + +Phases 0–2 are the minimum operationally useful product; Phase 3 is the "ship-quality" target. + +## 13. Open Questions + +1. Is the 24 h × 60 s retention default right, or should it scale with route count? A node with 100 k routes would exceed the ~80 MiB compacted footprint — consider a configurable `--keyvizRetention` on the node binary in Phase 2. +2. Do we want to expose follower-local read traffic separately from leader traffic in Phase 2, or defer that split to Phase 3? +3. 
Should `GetKeyVizMatrix` support requesting data from multiple nodes at once (fan-out in the admin binary) to reduce operator confusion when the leader moves, or is "always point at the current leader" simpler? From 07b6ad0819aa4d4165c463beb95a6ae8ce40053f Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Mon, 20 Apr 2026 02:24:52 +0900 Subject: [PATCH 02/30] docs(admin-ui): address review feedback on sampler concurrency and scaling - Replace lock-free claim with concrete shard-striped map + atomic pointer-swap flush; child counters are installed before RouteID is published so Observe never races with splits/merges (H1). - Distribute KeyViz persistence across the user Raft groups that own each route, with per-group hourly batches, instead of centralising writes on the default group (H2). - Cap tracked routes via --keyvizMaxTrackedRoutes (default 10000) with activity-weighted coarsening of adjacent routes; memory footprint is now bounded independent of total route count (H3). - Redefine the accuracy SLO as plus-or-minus 5 percent relative error at 95 percent CI on bucket totals, since raw capture rate is incompatible with Horvitz-Thompson estimation under stepwise sub-sampling (M1). - Replace the hot-path t-digest in LiveSummary with a fixed-bucket log-linear histogram so every observation is a single atomic add (M2). - Make cluster-wide fan-out the default mode of cmd/elastickv-admin; GetKeyVizMatrix now only serves this node leader-owned routes and the admin binary merges and reports per-node partial-status (M3). --- docs/admin_ui_key_visualizer_design.md | 114 +++++++++++++++++-------- 1 file changed, 80 insertions(+), 34 deletions(-) diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index 3536cf62..bac28240 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -21,7 +21,7 @@ This document proposes a built-in admin Web UI, shipped as a separate binary `cm 2. 
Provide a single UI that covers cluster overview, routes, Raft groups, adapter throughput, and the key visualizer. 3. Produce a time × key-space heatmap with at least four switchable series: read count, write count, read bytes, write bytes. 4. Follow hotspot shards across `SplitRange` / merge events so the heatmap stays continuous. -5. Keep the sampler's hot-path overhead within the measurement noise floor; define success as "≥95% of operations captured with no detectable regression in coordinator benchmarks." +5. Keep the sampler's hot-path overhead within the measurement noise floor of `BenchmarkCoordinatorDispatch`. Accuracy is expressed as a bound on the **estimator's relative error**, not a raw capture rate (see §5.2). 6. Stay off the Prometheus client library in Phases 0–3. Traffic counters used by the UI are maintained by the in-process sampler and a small adapter-side aggregator that already exists on the hot path. 7. Make the admin binary easy to deploy: a single Go binary with the SPA embedded via `go:embed`, producing one artifact per platform in CI. @@ -89,8 +89,8 @@ Two layers: | `ListRoutes` | Existing `Distribution.ListRoutes` (reused, not duplicated) | | `GetRaftGroups` | Per-group state (leader, term, commit/applied, last contact) | | `GetAdapterSummary` | Per-adapter QPS and latency quantiles from the in-process aggregator | -| `GetKeyVizMatrix` | Heatmap matrix (see §5.4) | -| `GetRouteDetail` | Time series for one route (drill-down) | +| `GetKeyVizMatrix` | Heatmap matrix for **this node's leader-owned routes only** (see §5.4). The admin binary fans out and merges. | +| `GetRouteDetail` | Time series for one route (drill-down). Served only by the current leader of that route. | | `StreamEvents` | Server-stream of route-state transitions and fresh matrix columns | **Layer B — HTTP/JSON, browser → admin binary.** Thin pass-through wrappers over the gRPC calls, plus static asset serving. 
@@ -129,37 +129,59 @@ A single call site is added at the dispatch entry of `kv.ShardedCoordinator` (se sampler.Observe(routeID, op, keyLen, valueLen) ``` -`sampler` is an interface; the default implementation is lock-free and nil-safe, so a nil sampler compiles to a predictable branch and no allocation. The hook runs *before* Raft proposal so it measures offered load, not applied load. +`sampler` is an interface; the default implementation is nil-safe (a nil sampler compiles to one branch and no allocation). The hook runs *before* Raft proposal so it measures offered load, not applied load. -Reads and writes are both sampled on the leader. Followers do not sample, because follower-local reads flow through the same coordinator path on the follower. +Reads and writes are both sampled on the leader. Followers do not sample, because follower-local reads flow through the same coordinator path on the follower. Because only the current leader samples, a cluster-wide heatmap requires the admin binary to fan out and merge across nodes (§9.1) — pointing at a single node would produce a partial view. -### 5.2 Adaptive sub-sampling and the 95% SLO +The hot path is not literally lock-free; "low-overhead concurrent" is a more honest description. The data structures used are: -Observing every call is cheap but not free. To guarantee "no detectable regression," the sampler uses **adaptive 1-in-N sampling per route**: +- **Current-window counters**: `routes` is a **fixed-size 16-way shard-striped map**, each shard holding `map[RouteID]*routeCounters` behind its own `sync.RWMutex`. `Observe` takes the shard's **read lock** for the lookup and uses `atomic.AddUint64` on the counter fields. Adding a new `RouteID` takes the shard's write lock exactly once per route (amortised to zero after warm-up). No `Observe` call ever runs against a plain Go map undergoing concurrent writes. 
+- **Flush**: instead of holding a long write lock, the flush goroutine **atomically swaps** the `*routeCounters` pointer for each key using `atomic.Pointer[routeCounters]`, then reads the old pointer's frozen counters to build the new matrix column. `Observe` that loaded the old pointer before the swap completes its increments against the (now-retired) old counters, which the next flush will harvest. No counts are lost; at most one step-boundary's worth of counts land in the next column instead of the current one. +- **Split/merge** (§5.4): the route-watch callback creates the new child shards' counters *before* the `distribution.Engine` exposes the new `RouteID` to the coordinator, so by the time `Observe` sees the new `RouteID` the counter already exists and the callback does not race with the hot path. -- Each route maintains an atomic `sampleRate` counter. -- Under low load (below a threshold QPS per route), `sampleRate = 1` (every op counted). -- As per-route QPS rises, `sampleRate` doubles stepwise, and the increment applied is multiplied by `sampleRate` so counters remain unbiased estimators. -- The controller targets **≥95% capture rate** in steady state: `sampleRate` is only raised when the write contention on the atomic increment itself crosses a noise-floor CPU threshold, measured at flush time. -- Worst-case error on per-bucket totals is bounded by `1/sqrt(observedSamples)`, so buckets with fewer samples are tagged in the response for the UI to hatch them. +### 5.2 Adaptive sub-sampling and the accuracy SLO -Benchmark gate in CI: run `BenchmarkCoordinatorDispatch` with the sampler disabled and with it enabled; the delta must be within the benchmark's own run-to-run variance (noise floor), not a fixed percentage. If a future change inflates variance, the gate fails until the noise floor is reduced or the sampler is made cheaper. +Observing every call is cheap but not free. 
To stay under the benchmark noise floor at very high per-route QPS, the sampler may sub-sample via **adaptive 1-in-N per route**. Counters remain unbiased estimators because each accepted sample increments by `sampleRate`. + +The capture rate itself is not the SLO — at `sampleRate = 8` the raw capture rate is 12.5%, but the estimator is still unbiased. What the UI cares about is the **relative error of the bucket total** shown in the heatmap. The SLO is therefore: + +> For every bucket displayed in the response, the estimated total is within **±5% of the true value with 95% confidence**, over the bucket's full step window (default 60 s). + +For Poisson-ish traffic, the relative error of the Horvitz–Thompson estimator is approximately `1 / sqrt(acceptedSamples)` — the `sampleRate` factor cancels because each accepted sample is already scaled up by `sampleRate`. Setting this ≤0.05 at 95% CI gives a required `acceptedSamples ≥ (1.96 / 0.05)² ≈ 1537` per bucket, independent of `sampleRate`. The adaptive controller enforces this by never raising `sampleRate` past the point where the most recent window's `acceptedSamples` falls below that bound; if a burst violates the bound the affected buckets are flagged in the response and the UI renders them hatched so the operator knows the estimate is soft. + +`sampleRate` only rises at all when the previous flush window's CPU attributed to `Observe` crosses a measured threshold. In steady state with moderate per-route QPS, `sampleRate` stays at 1 and every op is counted. + +Benchmark gate in CI: `BenchmarkCoordinatorDispatch` with sampler off vs on; the delta must stay within run-to-run variance. Separately, a correctness test drives a known synthetic workload through a sub-sampling sampler and asserts the ±5% / 95%-CI bound holds across 1000 trials.
+ +### 5.3 In-memory representation and the route budget ``` Sampler - ├─ routes map[RouteID]*routeCounters // current 1s window (reads,writes,readBytes,writeBytes, plus sampleRate) + ├─ routes [16]shard // shard-striped; each shard holds map[RouteID]*routeCounters + RWMutex + │ each routeCounters has (reads, writes, readBytes, writeBytes, sampleRate) └─ history *ringBuffer[matrixColumn] // one column per stepSeconds (default 60s) ``` -Every `stepSeconds` a flush goroutine swaps the map into a new column of the ring buffer. +Every `stepSeconds` a flush goroutine swaps each route's counter pointer (§5.1) and drops a new column into the ring buffer. + +**Route budget and memory cap.** Naïve sizing (`columns × routes × series × 8B`) does not scale: 1 M routes × 1440 columns × 4 series × 8 B = ~46 GiB. Unbounded growth is unacceptable. The sampler enforces a hard budget on tracked routes: + +- A new flag `--keyvizMaxTrackedRoutes` (default **10 000** per node) caps the size of `routes`. +- When `ListRoutes` exceeds the cap, the sampler **coarsens adjacent routes into virtual tracking buckets** sized to fit the budget. This is a purely internal aggregation; the admin binary still sees real `RouteID`s in `ListRoutes`, but their `Observe` calls land in the shared bucket, and the heatmap row simply labels the range `[start-of-first, end-of-last)`. +- Coarsening is greedy on sorted `start` with merge priority given to **lowest recent activity**, so hot routes stay 1:1 until the budget is exhausted. +- Compacted storage: columns older than 1 hour are re-bucketed into 5-minute aggregates, and columns older than 6 hours into 1-hour aggregates. The resulting steady-state footprint is: -The ring buffer default is **24 hours of 60 s columns = 1440 columns**. Memory estimate: `1440 × routes × 4 × 8B`. For 10 k routes: ~460 MiB. The flush goroutine compacts columns beyond 1 hour into 5-minute aggregates, bringing the steady-state cost to under 80 MiB for the same route count. 
+| Tracked routes | Ring-buffer retention | Footprint (4 series × 8 B) | +|---|---|---| +| 10 000 (default cap) | 24 h (1440 × 60 s) | ~460 MiB raw, **~120 MiB** after tiered compaction | +| 10 000 | 1 h only | **~18 MiB** | +| 1 000 | 24 h compacted | ~12 MiB | + +If an operator needs higher fidelity across more routes than the cap allows, they raise `--keyvizMaxTrackedRoutes` knowingly; the log emits an `INFO` at startup stating the selected cap and projected memory. If the cap is hit at runtime, an `INFO` fires once per hour naming which adjacent routes were coalesced. ### 5.4 Keeping up with splits and merges -`distribution.Engine` already emits a watch stream on route-state transitions. The sampler subscribes and, on a split, copies the parent route's historical column values into both children so the heatmap stays visually continuous across the event. On a merge, child columns are summed into the surviving parent. The current `routeCounters` map is updated atomically under the watch callback; in-flight `Observe` calls that raced with the transition are attributed to whichever route is visible at observe time — acceptable because the loss is bounded by a single step window. +`distribution.Engine` already emits a watch stream on route-state transitions. The sampler subscribes and, on a split, copies the parent route's historical column values into both children so the heatmap stays visually continuous across the event. On a merge, child columns are summed into the surviving parent. Current-window updates use the shard-striped, pointer-swap scheme from §5.1: child shards' `routeCounters` are installed **before** the `distribution.Engine` publishes the new `RouteID` to the coordinator, so `Observe` never dereferences a missing route. Counts that raced a transition are attributed to whichever `RouteID` the coordinator resolved — acceptable because the loss is bounded by a single step window.
### 5.5 Bucketing for the response @@ -169,7 +191,15 @@ The API's `rows` parameter is a *target*, not a guarantee. The server walks the Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptable for an MVP and keeps the Raft critical path untouched. -Phase 3 writes compacted columns to the default Raft group under reserved keys `!admin|keyviz||...`. Writes are batched once per flush and are not part of user transactions, so they cannot stall the data plane. A TTL of 7 days is applied via the existing HLC-based expiry (`store/lsm_store.go:24`). +Phase 3 persists compacted columns **distributed across the user Raft groups themselves, not the default group**. Concentrating KeyViz writes on the default group would centralise I/O and Raft-log growth onto a single group, creating exactly the kind of hotspot this feature is built to surface. Instead: + +- Each compacted KeyViz column for a route is written to the **Raft group that owns that route**, under a reserved key `!admin|keyviz||`. +- Writes are batched hourly per group (not per flush) and dispatched as a single low-priority proposal per group, keeping the write amplification proportional to the group's own traffic. +- A TTL of 7 days is applied via the existing HLC-based expiry (`store/lsm_store.go:24`). +- The admin binary, on a history query, fans out to all groups' leaders (§9.1) and merges the returned slices. +- For coarsened virtual buckets (§5.3), the column is written to the group owning the bucket's **first** constituent route, with a small index entry under `!admin|keyviz|index|` on the same group so the fan-out reader can discover it. The index entry is the only per-hour write that is shared — but its size is bounded by the route-budget cap, not by total traffic. + +This keeps the data-plane Raft-log overhead bounded by per-group load and fails independently when a single group is unavailable. ### 5.7 Key preview labels @@ -183,8 +213,9 @@ Raw keys are binary. 
The UI needs a printable hint per bucket. Strategy: The existing `monitoring.Registry` observers record into Prometheus counters/histograms — useful for Grafana, but not readable back without pulling in the Prometheus client library. To keep the admin binary and node binary free of that dependency during Phases 0–3: -- A small sibling struct `monitoring.LiveSummary` is added alongside each observer. It maintains, in parallel with the existing Prometheus writes, an in-process rolling window (10-second buckets, 5-minute history) of request count and latency reservoir samples per adapter and per operation. -- `LiveSummary` is read-only from the outside and lock-free on the write path (atomic counters + a tiny t-digest per op). +- A small sibling struct `monitoring.LiveSummary` is added alongside each observer. It maintains, in parallel with the existing Prometheus writes, an in-process rolling window (10-second buckets, 5-minute history) of request count and latency per adapter and per operation. +- Latency is tracked with a **fixed-bucket log-linear histogram** (256 pre-sized buckets covering 1 µs – 10 s, similar to the Prometheus default schema but owned in-process). Each observation is a single `atomic.AddUint64` on the bucket's counter — no sort, no merge, no locks, predictable nanosecond cost. Quantiles (p50/p95/p99) are interpolated at read time by `GetAdapterSummary`. A t-digest was considered but rejected because its centroid merge cost is not bounded on the hot path and is hard to make concurrent without a lock. +- Count, in-flight, and byte totals are plain `atomic.Uint64`. - `GetAdapterSummary` reads directly from `LiveSummary`. The Prometheus exposition remains unchanged and untouched. This adds roughly a dozen integer fields per tracked operation and avoids both the Prometheus dependency and the need to scrape `/metrics` from within the admin binary. 
@@ -221,25 +252,39 @@ No changes to Raft, FSM, MVCC, or any protocol adapter beyond the single sampler ## 9. Deployment and Operation - The admin binary is not intended to be exposed on the public network in its initial form. It has no auth. Default bind is `127.0.0.1:8080`. -- Typical operator workflow: `ssh -L 8080:localhost:8080 operator@host` then `elastickv-admin --node=localhost:50051`, or run the binary on a laptop and point it at a reachable node. +- Typical operator workflow: `ssh -L 8080:localhost:8080 operator@host` then `elastickv-admin --nodes=host1:50051,host2:50051,host3:50051`, or run the binary on a laptop and point it at any reachable subset of nodes. - The admin binary is stateless; it can be killed and restarted without coordination. - CI produces release artifacts for `linux/amd64`, `linux/arm64`, `darwin/arm64`, and `windows/amd64`. +### 9.1 Cluster-wide fan-out + +Because only the current leader of a Raft group records samples for that group's routes (§5.1), pointing the admin binary at a single node produces a **partial heatmap** covering only the routes that node happens to lead. To give operators a complete view by default, the admin binary runs in **fan-out mode**: + +- `--nodes` accepts a comma-separated list of seed addresses. The admin binary calls `GetClusterOverview` on any reachable seed to discover the current full membership (node → gRPC endpoint, plus per-group leader identity). +- For each query (`GetKeyVizMatrix`, `GetRouteDetail`, `GetAdapterSummary`), the admin binary issues parallel gRPC calls to every known node and merges results server-side before sending one combined JSON payload to the browser. +- Merging rule for the heatmap: each route appears in exactly one node's response (the leader's), so the merge is a concatenation with deduplication on `RouteID`. 
If two nodes report the same `RouteID` (a leadership change during the query window), the response with the **later** last-sampled timestamp wins, and the other is discarded. +- Degraded mode: if any node is unreachable, the admin binary returns a partial result with a per-node `{node, ok, error}` status array so the UI can surface "3 of 4 nodes responded" instead of silently hiding ranges. The heatmap hatches rows whose owning node failed. +- A single-node mode (`--nodes=one:50051 --no-fanout`) is retained for operators who explicitly want the partial view. + ## 10. Performance Considerations -- Sampler fast path: one atomic read of `sampleRate`, a modulo check, and four atomic increments on a hit. No allocation per call. +- Sampler fast path on a hit: shard-select (a bitmask on `RouteID`), `RLock` on that shard, map lookup, `atomic.Pointer[routeCounters].Load`, then `atomic.AddUint64` on the four counters. No allocation per call, no global lock. - The coordinator already holds the `RouteID` at the hook site, so the sampler does not re-resolve. -- The flush goroutine takes a write lock on the route-counters map exactly once per `stepSeconds`. A CAS-based double-buffer is the first optimisation if profiling shows contention. +- The flush goroutine performs atomic pointer swaps per tracked route; there is no global write lock covering `Observe` calls. Splits and merges install child counters before publishing the new `RouteID` (§5.4), so the callback does not race with the hot path. - API endpoints cap `to − from` at 7 days and `rows` at 1024 to bound server work. -- `LiveSummary` adds a second atomic increment alongside each existing Prometheus `Inc()`; its cost is on the order of a nanosecond and well below the noise floor in §5.2. +- `LiveSummary` adds a second atomic increment alongside each existing Prometheus `Inc()`, plus one atomic increment on a fixed-bucket histogram counter. Cost is on the order of a nanosecond and well below the noise floor in §5.2. 
+- Fan-out cost (§9.1) is N parallel gRPC calls; each node already serves `GetKeyVizMatrix` only for its own leader-owned routes, so the response size is distributed and the aggregate wall-clock is bounded by the slowest node, not the sum. ## 11. Testing -1. Unit tests for `keyviz.Sampler` (concurrent Observe, adaptive sub-sampling correctness, flush, split/merge reshaping, 95% capture SLO under synthetic load). -2. Integration test in `kv/` that drives synthetic traffic through the coordinator and asserts the matrix reflects the skew. -3. gRPC handler tests with a fake engine and fake Raft status reader. -4. Benchmark gate: `BenchmarkCoordinatorDispatch` with sampler off vs on. CI fails if the difference exceeds the benchmark's own run-to-run variance. -5. Playwright smoke test against the embedded SPA to catch build-time regressions. +1. Unit tests for `keyviz.Sampler`: concurrent `Observe` under the `-race` detector across all shards, flush correctness via the pointer-swap protocol, split/merge reshaping, and the **accuracy SLO** (1000 trials of synthetic workload must satisfy ±5% relative error at 95% CI per §5.2). +2. Route-budget test: generate more than `--keyvizMaxTrackedRoutes` routes and assert that coarsening preserves total observed traffic and keeps hot routes un-merged. +3. Integration test in `kv/` that drives synthetic traffic through the coordinator and asserts the matrix reflects the skew. +4. gRPC handler tests with a fake engine and fake Raft status reader. +5. Fan-out test: admin binary against a 3-node fake cluster, including one unreachable node; the merged response must include the partial-status array. +6. Persistence test: write compacted columns to per-route groups, take a leadership transfer, and verify the reader sees the complete history across groups. +7. Benchmark gate: `BenchmarkCoordinatorDispatch` with sampler off vs on. CI fails if the difference exceeds the benchmark's own run-to-run variance. +8. 
Playwright smoke test against the embedded SPA to catch build-time regressions. ## 12. Phased Delivery @@ -247,14 +292,15 @@ No changes to Raft, FSM, MVCC, or any protocol adapter beyond the single sampler |---|---|---| | 0 | `cmd/elastickv-admin` skeleton, `Admin` gRPC service stub, empty SPA shell, CI wiring. | Binary builds, `/api/cluster/overview` returns live data from a real node. | | 1 | Overview, Routes, Raft Groups, Adapters pages. `LiveSummary` added. No sampler. | All read-only pages match `grpcurl` ground truth. | -| 2 | Key Visualizer MVP: in-memory sampler with adaptive sub-sampling, reads/writes series, static matrix API. | Benchmark gate green; heatmap shows synthetic hotspot within 2 s of load; capture rate ≥95% at target QPS. | -| 3 | Bytes series, drill-down, split/merge continuity, persistence of compacted columns in the default Raft group. | Heatmap remains continuous across a live `SplitRange`; restart preserves last 7 days. | +| 2 | Key Visualizer MVP: in-memory sampler with adaptive sub-sampling, reads/writes series, fan-out across nodes, static matrix API. | Benchmark gate green; heatmap shows synthetic hotspot within 2 s of load; ±5% / 95%-CI accuracy SLO holds under synthetic bursts; fan-out returns complete view with 1 node down. | +| 3 | Bytes series, drill-down, split/merge continuity, persistence of compacted columns distributed **per owning Raft group**. | Heatmap remains continuous across a live `SplitRange`; restart preserves last 7 days; no single Raft group sees more than its share of KeyViz writes. | | 4 (deferred) | Mutating admin operations (`SplitRange` from UI), authentication. Out of scope for this design; a follow-up design will cover it. | — | Phases 0–2 are the minimum operationally useful product; Phase 3 is the "ship-quality" target. ## 13. Open Questions -1. Is the 24 h × 60 s retention default right, or should it scale with route count? 
A node with 100 k routes would exceed the ~80 MiB compacted footprint — consider a configurable `--keyvizRetention` on the node binary in Phase 2. +1. Default value of `--keyvizMaxTrackedRoutes`. 10 000 is conservative; operators with very large clusters may prefer a higher default paired with shorter retention. Settle during Phase 2 benchmarking. 2. Do we want to expose follower-local read traffic separately from leader traffic in Phase 2, or defer that split to Phase 3? -3. Should `GetKeyVizMatrix` support requesting data from multiple nodes at once (fan-out in the admin binary) to reduce operator confusion when the leader moves, or is "always point at the current leader" simpler? +3. In fan-out (§9.1), should the admin binary **pin** to the seed list or dynamically refresh membership from `GetClusterOverview` on every request? Dynamic is more correct during scale events; pinned is simpler and avoids stampedes on the seed. +4. For the Phase 3 persistence schema, should KeyViz writes share a transaction with other per-group low-priority maintenance (compaction metadata, etc.) to amortise Raft cost, or remain a dedicated batch for easier rollback? From 7a8dbae65944617eac45014d98537b799ab3d36c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Tue, 21 Apr 2026 23:02:47 +0900 Subject: [PATCH 03/30] docs: address keyviz review feedback --- docs/admin_ui_key_visualizer_design.md | 31 +++++++++++++------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index bac28240..3dd60e0a 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -133,11 +133,11 @@ sampler.Observe(routeID, op, keyLen, valueLen) Reads and writes are both sampled on the leader. Followers do not sample, because follower-local reads flow through the same coordinator path on the follower. 
Because only the current leader samples, a cluster-wide heatmap requires the admin binary to fan out and merge across nodes (§9.1) — pointing at a single node would produce a partial view. -The hot path is not literally lock-free; "low-overhead concurrent" is a more honest description. The data structures used are: +The hot path uses lock-free reads for route lookup and counter increments. The data structures used are: -- **Current-window counters**: `routes` is a **fixed-size 16-way shard-striped map**, each shard holding `map[RouteID]*routeCounters` behind its own `sync.RWMutex`. `Observe` takes the shard's **read lock** for the lookup and uses `atomic.AddUint64` on the counter fields. Adding a new `RouteID` takes the shard's write lock exactly once per route (amortised to zero after warm-up). No `Observe` call ever runs against a plain Go map undergoing concurrent writes. +- **Current-window counters**: `routes` is an immutable `routeTable` published through `atomic.Pointer[routeTable]`. `routeTable` owns `map[RouteID]*routeSlot`; each `routeSlot` owns an `atomic.Pointer[routeCounters]`. `Observe` loads the current table, performs a plain map lookup against that immutable snapshot, loads the slot's counter pointer, and uses `atomic.AddUint64` on the counter fields. Adding a new `RouteID` or replacing split/merge mappings performs a copy-on-write table update under a non-hot-path `routesMu`, then publishes the new table with one atomic store. No `Observe` call ever runs against a Go map that can be mutated concurrently. - **Flush**: instead of holding a long write lock, the flush goroutine **atomically swaps** the `*routeCounters` pointer for each key using `atomic.Pointer[routeCounters]`, then reads the old pointer's frozen counters to build the new matrix column. `Observe` that loaded the old pointer before the swap completes its increments against the (now-retired) old counters, which the next flush will harvest. 
No counts are lost; at most one step-boundary's worth of counts land in the next column instead of the current one. -- **Split/merge** (§5.4): the route-watch callback creates the new child shards' counters *before* the `distribution.Engine` exposes the new `RouteID` to the coordinator, so by the time `Observe` sees the new `RouteID` the counter already exists and the callback does not race with the hot path. +- **Split/merge** (§5.4): the route-watch callback creates the new child slots and publishes a new immutable `routeTable` *before* the `distribution.Engine` exposes the new `RouteID` to the coordinator, so by the time `Observe` sees the new `RouteID` the counter already exists and the callback does not race with the hot path. ### 5.2 Adaptive sub-sampling and the accuracy SLO @@ -147,7 +147,7 @@ The capture rate itself is not the SLO — at `sampleRate = 8` the raw capture r > For every bucket displayed in the response, the estimated total is within **±5% of the true value with 95% confidence**, over the bucket's full step window (default 60 s). -For Poisson-ish traffic, the relative error of the Horvitz–Thompson estimator is approximately `sampleRate / sqrt(acceptedSamples)`. Setting this ≤0.05 at 95% CI gives a required `acceptedSamples ≥ (1.96 · sampleRate / 0.05)²`. The adaptive controller enforces this by never raising `sampleRate` past the point where the most recent window's `acceptedSamples` falls below that bound; if a burst violates the bound the affected buckets are flagged in the response and the UI renders them hatched so the operator knows the estimate is soft. +For Poisson-ish traffic, the relative error of the Horvitz–Thompson estimator is approximately `1 / sqrt(acceptedSamples)` for 1-in-N sub-sampling where N > 1. Setting this ≤0.05 at 95% CI gives a required `acceptedSamples ≥ (1.96 / 0.05)² ≈ 1537`, independent of the current 1-in-N rate. Buckets sampled at `sampleRate = 1` are exact and do not need the bound. 
The adaptive controller enforces this by never raising `sampleRate` past the point where the most recent window's `acceptedSamples` falls below that bound; if a burst violates the bound the affected buckets are flagged in the response and the UI renders them hatched so the operator knows the estimate is soft. `sampleRate` only rises at all when the previous flush window's CPU attributed to `Observe` crosses a measured threshold. In steady state with moderate per-route QPS, `sampleRate` stays at 1 and every op is counted. @@ -157,8 +157,8 @@ Benchmark gate in CI: `BenchmarkCoordinatorDispatch` with sampler off vs on; the ``` Sampler - ├─ routes [16]shard // shard-striped; each shard holds map[RouteID]*routeCounters + RWMutex - │ each routeCounters has (reads, writes, readBytes, writeBytes, sampleRate) + ├─ routes atomic.Pointer[routeTable] // immutable map[RouteID]*routeSlot, COW-updated off the hot path + │ each routeSlot points to (reads, writes, readBytes, writeBytes, sampleRate) └─ history *ringBuffer[matrixColumn] // one column per stepSeconds (default 60s) ``` @@ -181,7 +181,7 @@ If an operator needs higher fidelity across more routes than the cap allows, the ### 5.4 Keeping up with splits and merges -`distribution.Engine` already emits a watch stream on route-state transitions. The sampler subscribes and, on a split, copies the parent route's historical column values into both children so the heatmap stays visually continuous across the event. On a merge, child columns are summed into the surviving parent. Current-window updates use the shard-striped, pointer-swap scheme from §5.1: child shards' `routeCounters` are installed **before** the `distribution.Engine` publishes the new `RouteID` to the coordinator, so `Observe` never dereferences a missing route. Counts that raced a transition are attributed to whichever `RouteID` the coordinator resolved — acceptable because the loss is bounded by a single step window. 
+`distribution.Engine` already emits a watch stream on route-state transitions. The sampler subscribes and, on a split, copies the parent route's historical column values into both children so the heatmap stays visually continuous across the event. On a merge, child columns are summed into the surviving parent. Current-window updates use the immutable-table, pointer-swap scheme from §5.1: child `routeSlot`s and `routeCounters` are installed in a freshly copied `routeTable` **before** the `distribution.Engine` publishes the new `RouteID` to the coordinator, so `Observe` never dereferences a missing route. Counts that raced a transition are attributed to whichever `RouteID` the coordinator resolved — acceptable because the loss is bounded by a single step window. ### 5.5 Bucketing for the response @@ -193,10 +193,11 @@ Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptab Phase 3 persists compacted columns **distributed across the user Raft groups themselves, not the default group**. Concentrating KeyViz writes on the default group would centralise I/O and Raft-log growth onto a single group, creating exactly the kind of hotspot this feature is built to surface. Instead: -- Each compacted KeyViz column for a route is written to the **Raft group that owns that route**, under a reserved key `!admin|keyviz||`. +- Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local reserved key `!admin|keyviz|range||`; the prefix is not routed through the global user keyspace or default group. `lineageID` is a stable KeyViz identifier stored with `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}` metadata; `RouteID` is recorded only as the current routing hint, never as the primary history key. +- Split and merge events append small group-local lineage records under `!admin|keyviz|lineage|`. 
On split, both children point back to the parent lineage and inherit the parent's compacted history for continuity. On merge, the survivor records both child lineage IDs and the reader sums overlapping intervals. If a node sees historical rows without a lineage record during an upgrade, the admin reader falls back to overlap on the persisted `[start, end)` range before using `RouteID`. - Writes are batched hourly per group (not per flush) and dispatched as a single low-priority proposal per group, keeping the write amplification proportional to the group's own traffic. - A TTL of 7 days is applied via the existing HLC-based expiry (`store/lsm_store.go:24`). -- The admin binary, on a history query, fans out to all groups' leaders (§9.1) and merges the returned slices. +- The admin binary, on a history query, fans out to all groups' leaders (§9.1), reconstructs the range timeline from lineage metadata, and merges returned slices by time × key-range overlap. This keeps a hotspot visually continuous even when its serving `RouteID` changed across a `SplitRange` or merge. - For coarsened virtual buckets (§5.3), the column is written to the group owning the bucket's **first** constituent route, with a small index entry under `!admin|keyviz|index|` on the same group so the fan-out reader can discover it. The index entry is the only per-hour write that is shared — but its size is bounded by the route-budget cap, not by total traffic. This keeps the data-plane Raft-log overhead bounded by per-group load and fails independently when a single group is unavailable. @@ -230,7 +231,7 @@ This adds roughly a dozen integer fields per tracked operation and avoids both t - Series switcher (reads / writes / readBytes / writeBytes). - Range selection opens a drawer with the underlying route list, current leader, size, and a link to the Raft group page. - Live mode: a WebSocket push appends a new column every `stepSeconds` without refetching history. 
- - Buckets below the 95%-capture threshold are hatched to signal estimation uncertainty. + - Buckets that miss the ±5% / 95%-CI estimator bound are hatched to signal estimation uncertainty. - **Build**: `web/` at repo root, `pnpm build` output copied to `cmd/elastickv-admin/dist/`, embedded with `//go:embed dist`. - **Dev flow**: Vite dev server on `:5173` proxies `/api` and `/stream` to a locally running `cmd/elastickv-admin`. @@ -244,7 +245,7 @@ This adds roughly a dozen integer fields per tracked operation and avoids both t | `kv/sharded_coordinator.go` | One-line `sampler.Observe(...)` at dispatch entry; `sampler` is `keyviz.Sampler` injected via constructor, nil-safe. | | `keyviz/` (new) | `Sampler`, adaptive sub-sampler, ring buffer, route-watch subscriber, preview logic, tests. | | `monitoring/live_summary.go` (new) | Rolling-window adapter counters, hooked into existing observers. | -| `main.go` | Register `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers. No new flags on the node binary. | +| `main.go` | Register `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers; add `--keyvizMaxTrackedRoutes`. | | `web/` (new) | Svelte SPA source. | No changes to Raft, FSM, MVCC, or any protocol adapter beyond the single sampler call site and the `LiveSummary` hook that sits next to the existing Prometheus writes. @@ -268,21 +269,21 @@ Because only the current leader of a Raft group records samples for that group's ## 10. Performance Considerations -- Sampler fast path on a hit: shard-select (a bitmask on `RouteID`), `RLock` on that shard, map lookup, `atomic.Pointer[routeCounters].Load`, then `atomic.AddUint64` on the four counters. No allocation per call, no global lock. +- Sampler fast path on a hit: `atomic.Pointer[routeTable].Load`, immutable map lookup by `RouteID`, `atomic.Pointer[routeCounters].Load`, then `atomic.AddUint64` on the four counters. 
No allocation per call, no mutex acquisition, no global lock. - The coordinator already holds the `RouteID` at the hook site, so the sampler does not re-resolve. -- The flush goroutine performs atomic pointer swaps per tracked route; there is no global write lock covering `Observe` calls. Splits and merges install child counters before publishing the new `RouteID` (§5.4), so the callback does not race with the hot path. +- The flush goroutine performs atomic pointer swaps per tracked route; there is no write lock covering `Observe` calls. Splits and merges publish a copied immutable route table with child counters before publishing the new `RouteID` (§5.4), so the callback does not race with the hot path. - API endpoints cap `to − from` at 7 days and `rows` at 1024 to bound server work. - `LiveSummary` adds a second atomic increment alongside each existing Prometheus `Inc()`, plus one atomic increment on a fixed-bucket histogram counter. Cost is on the order of a nanosecond and well below the noise floor in §5.2. - Fan-out cost (§9.1) is N parallel gRPC calls; each node already serves `GetKeyVizMatrix` only for its own leader-owned routes, so the response size is distributed and the aggregate wall-clock is bounded by the slowest node, not the sum. ## 11. Testing -1. Unit tests for `keyviz.Sampler`: concurrent `Observe` under the `-race` detector across all shards, flush correctness via the pointer-swap protocol, split/merge reshaping, and the **accuracy SLO** (1000 trials of synthetic workload must satisfy ±5% relative error at 95% CI per §5.2). +1. Unit tests for `keyviz.Sampler`: concurrent `Observe` under the `-race` detector while copy-on-write route-table updates run, flush correctness via the pointer-swap protocol, split/merge reshaping, and the **accuracy SLO** (1000 trials of synthetic workload must satisfy ±5% relative error at 95% CI per §5.2). 2. 
Route-budget test: generate more than `--keyvizMaxTrackedRoutes` routes and assert that coarsening preserves total observed traffic and keeps hot routes un-merged. 3. Integration test in `kv/` that drives synthetic traffic through the coordinator and asserts the matrix reflects the skew. 4. gRPC handler tests with a fake engine and fake Raft status reader. 5. Fan-out test: admin binary against a 3-node fake cluster, including one unreachable node; the merged response must include the partial-status array. -6. Persistence test: write compacted columns to per-route groups, take a leadership transfer, and verify the reader sees the complete history across groups. +6. Persistence test: write compacted columns to per-range groups, perform split and merge transitions, take a leadership transfer, and verify the lineage reader reconstructs complete history across groups without relying on stable `RouteID`s. 7. Benchmark gate: `BenchmarkCoordinatorDispatch` with sampler off vs on. CI fails if the difference exceeds the benchmark's own run-to-run variance. 8. Playwright smoke test against the embedded SPA to catch build-time regressions. From d6ba24a91882aa84336098d55e5728ca03f57fb0 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 00:26:39 +0900 Subject: [PATCH 04/30] docs: address keyviz follow-up review --- docs/admin_ui_key_visualizer_design.md | 84 +++++++++++++++----------- 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index 3dd60e0a..c6875910 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -24,12 +24,13 @@ This document proposes a built-in admin Web UI, shipped as a separate binary `cm 5. Keep the sampler's hot-path overhead within the measurement noise floor of `BenchmarkCoordinatorDispatch`. 
Accuracy is expressed as a bound on the **estimator's relative error**, not a raw capture rate (see §5.2). 6. Stay off the Prometheus client library in Phases 0–3. Traffic counters used by the UI are maintained by the in-process sampler and a small adapter-side aggregator that already exists on the hot path. 7. Make the admin binary easy to deploy: a single Go binary with the SPA embedded via `go:embed`, producing one artifact per platform in CI. +8. Protect the node-side `Admin` gRPC service from Phase 0. The UI may bind to localhost, but the nodes expose metadata on their data-plane gRPC port, so read-only admin RPCs require an operator token by default. ### 2.2 Non-goals 1. Replacement of the existing Grafana dashboards. The admin UI focuses on cluster state and the keyspace view; long-horizon trend analysis remains a Prometheus/Grafana concern. 2. Per-individual-key statistics. The visualizer operates on route-level buckets, not on a `GET` / `PUT` trace. -3. Authentication or authorization in the initial milestones. The admin binary binds to localhost by default and expects operators to layer their own access control (SSH tunnel, reverse proxy, network ACL). Authentication is out of scope for Phases 0–4. +3. Full multi-user RBAC, identity federation, or browser login flows. Phase 0 only requires a shared read-only admin token for the node-side gRPC service; richer auth remains deferred. 4. Query console (SQL/Redis/DynamoDB REPL) inside the UI. Deferred. 5. Multi-cluster federation. Scope is a single cluster; the admin binary may target any single node. @@ -68,20 +69,20 @@ flowchart LR AdminSvc --> Raft ``` -The admin binary holds no authoritative state. All data is fetched on demand from nodes via a new `Admin` gRPC service. The sampler's ring buffer lives inside each node's process, rebuildable after restart (see §5.5). +The admin binary holds no authoritative state. All data is fetched on demand from nodes via a new `Admin` gRPC service. 
The sampler's ring buffer lives inside each node's process, rebuildable after restart once Phase 3 persistence is enabled (see §5.6). ### 3.1 Why a separate binary - Release cadence for the UI is decoupled from the data plane. - The admin binary can be placed on an operator workstation or a sidecar pod, so a compromised UI does not imply a compromised data node. - Node binaries remain free of the Prometheus client (goal §2.1-6) and of any SPA assets. -- `cmd/elastickv-admin --node=host:50051` is the full invocation; no config files are required for the default use case. +- `cmd/elastickv-admin --nodes=host:50051 --nodeTokenFile=/etc/elastickv/admin.token` is the full invocation; no multi-file config bundle is required for the default use case. ## 4. API Surface Two layers: -**Layer A — gRPC, node → admin binary.** A new `Admin` service on each node, registered on the same gRPC port as `RawKV` (`--address`, default `:50051`). All methods are read-only in Phases 0–3. +**Layer A — gRPC, node → admin binary.** A new `Admin` service on each node, registered on the same gRPC port as `RawKV` (`--address`, default `:50051`). All methods are read-only in Phases 0–3 and require `authorization: Bearer ` metadata. Nodes load the token from `--adminTokenFile`; the admin binary sends it from `--nodeTokenFile`. An explicit `--adminInsecureNoAuth` flag exists only for local development and logs a warning at startup. | RPC | Purpose | |---|---| @@ -89,8 +90,8 @@ Two layers: | `ListRoutes` | Existing `Distribution.ListRoutes` (reused, not duplicated) | | `GetRaftGroups` | Per-group state (leader, term, commit/applied, last contact) | | `GetAdapterSummary` | Per-adapter QPS and latency quantiles from the in-process aggregator | -| `GetKeyVizMatrix` | Heatmap matrix for **this node's leader-owned routes only** (see §5.4). The admin binary fans out and merges. | -| `GetRouteDetail` | Time series for one route (drill-down). Served only by the current leader of that route. 
| +| `GetKeyVizMatrix` | Heatmap matrix for **this node's locally observed samples**: leader writes plus reads served locally, including follower-local reads (see §5.1). The admin binary fans out and merges. | +| `GetRouteDetail` | Time series for one route or virtual bucket (drill-down). The admin binary fans out because reads may be observed by followers. | | `StreamEvents` | Server-stream of route-state transitions and fresh matrix columns | **Layer B — HTTP/JSON, browser → admin binary.** Thin pass-through wrappers over the gRPC calls, plus static asset serving. @@ -103,7 +104,7 @@ Two layers: | GET | `/api/raft/groups` | Wraps `GetRaftGroups` | | GET | `/api/adapters/summary` | Wraps `GetAdapterSummary` | | GET | `/api/keyviz/matrix` | Wraps `GetKeyVizMatrix` | -| GET | `/api/keyviz/routes/{routeID}` | Wraps `GetRouteDetail` | +| GET | `/api/keyviz/buckets/{bucketID}` | Wraps `GetRouteDetail` for a real route bucket or coarsened virtual bucket | | WS | `/api/stream` | Multiplexes `StreamEvents` from all targeted nodes | HTTP errors use a minimal `{code, message}` envelope. No caching headers on read endpoints. @@ -117,7 +118,15 @@ HTTP errors use a minimal `{code, message}` envelope. No caching headers on read | `to` | timestamp | now | Exclusive | | `rows` | int | 256 | Target Y-axis resolution (server may return fewer) | -Response matrix format: `matrix[i][j]` is the value for bucket `i` at time column `j`. Keys in `start`/`end` are raw bytes; the server supplies `label` as a printable preview (§5.6). +Response matrix format: `matrix[i][j]` is the value for bucket `i` at time column `j`. Keys in `start`/`end` are raw bytes; the server supplies `label` as a printable preview (§5.6). Each row also carries bucket metadata: + +| Field | Meaning | +|---|---| +| `bucketID` | Stable UI identifier, either `route:<routeID>` or `virtual:<virtualBucketID>`. | +| `aggregate` | `true` when multiple routes were coarsened into this row. 
| +| `routeIDs` / `routeCount` | Exact route IDs for small aggregates, plus total count. Large aggregates may truncate `routeIDs` and set `routeIDsTruncated=true`. | +| `sampleRoles` | Which roles contributed: `leaderWrite`, `leaderRead`, `followerRead`. | +| `lineageID` | Present for persisted Phase 3 rows so the UI can track continuity across split/merge events. | ## 5. Key Visualizer @@ -131,7 +140,7 @@ sampler.Observe(routeID, op, keyLen, valueLen) `sampler` is an interface; the default implementation is nil-safe (a nil sampler compiles to one branch and no allocation). The hook runs *before* Raft proposal so it measures offered load, not applied load. -Reads and writes are both sampled on the leader. Followers do not sample, because follower-local reads flow through the same coordinator path on the follower. Because only the current leader samples, a cluster-wide heatmap requires the admin binary to fan out and merge across nodes (§9.1) — pointing at a single node would produce a partial view. +Writes are sampled exactly once by the current Raft leader before proposal. Reads are sampled by the node that actually serves the read: leader reads are marked `leaderRead`, and lease/follower-local reads are marked `followerRead`. Requests forwarded between nodes carry an internal "already sampled" marker so a logical operation is not counted twice. Because read load can be spread across followers, a cluster-wide heatmap requires the admin binary to fan out and merge across nodes (§9.1) — pointing at a single node would produce a partial view. The hot path uses lock-free reads for route lookup and counter increments. The data structures used are: @@ -167,7 +176,7 @@ Every `stepSeconds` a flush goroutine swaps each route's counter pointer (§5.1) **Route budget and memory cap.** Naïve sizing (`columns × routes × series × 8B`) does not scale: 1 M routes × 1440 columns × 4 series × 8 B = ~46 GiB. Unbounded growth is unacceptable. 
The sampler enforces a hard budget on tracked routes: - A new flag `--keyvizMaxTrackedRoutes` (default **10 000** per node) caps the size of `routes`. -- When `ListRoutes` exceeds the cap, the sampler **coarsens adjacent routes into virtual tracking buckets** sized to fit the budget. This is a purely internal aggregation; the admin binary still sees real `RouteID`s in `ListRoutes`, but their `Observe` calls land in the shared bucket, and the heatmap row simply labels the range `[start-of-first, end-of-last)`. +- When `ListRoutes` exceeds the cap, the sampler **coarsens adjacent routes into virtual tracking buckets** sized to fit the budget. The admin binary still sees real `RouteID`s in `ListRoutes`, but their `Observe` calls land in the shared bucket. The matrix response never pretends that such a row is a single route: it sets `aggregate=true`, returns a `virtual:*` `bucketID`, includes `routeCount` and the constituent `routeIDs` when small enough, and labels the range `[start-of-first, end-of-last)`. - Coarsening is greedy on sorted `start` with merge priority given to **lowest recent activity**, so hot routes stay 1:1 until the budget is exhausted. - Compacted storage: columns older than 1 hour are re-bucketed into 5-minute aggregates, and columns older than 6 hours into 1-hour aggregates. The resulting steady-state footprint is: @@ -193,10 +202,13 @@ Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptab Phase 3 persists compacted columns **distributed across the user Raft groups themselves, not the default group**. Concentrating KeyViz writes on the default group would centralise I/O and Raft-log growth onto a single group, creating exactly the kind of hotspot this feature is built to surface. Instead: -- Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local reserved key `!admin|keyviz|range||`; the prefix is not routed through the global user keyspace or default group. 
`lineageID` is a stable KeyViz identifier stored with `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}` metadata; `RouteID` is recorded only as the current routing hint, never as the primary history key. +- Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local admin namespace `!admin|keyviz|range||`; the prefix is not routed through the default group. Phase 3 also adds an explicit system-namespace filter so `pebbleStore.ScanAt`, `ReverseScanAt`, and `ShardedCoordinator.maxLatestCommitTS` ignore `!admin|*` records for user-plane requests. This prevents internal metadata from leaking through scans or advancing user transaction timestamps. +- `lineageID` is generated as a UUIDv7 using the route transition HLC plus crypto-random entropy, making it cluster-wide unique without coordinating through the default group. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}`; `RouteID` is recorded only as the current routing hint, never as the primary history key. - Split and merge events append small group-local lineage records under `!admin|keyviz|lineage|`. On split, both children point back to the parent lineage and inherit the parent's compacted history for continuity. On merge, the survivor records both child lineage IDs and the reader sums overlapping intervals. If a node sees historical rows without a lineage record during an upgrade, the admin reader falls back to overlap on the persisted `[start, end)` range before using `RouteID`. +- On startup, the sampler rebuilds its in-memory `RouteID → lineageID` map by scanning the group-local lineage index for routes currently owned by the node's groups and matching active `[start, end)` ranges from `ListRoutes`. If a route exists without a matching lineage record, the node creates a new lineage record with a parent pointer to the best overlapping retained range. 
This makes rolling restarts and upgrades preserve historical continuity. - Writes are batched hourly per group (not per flush) and dispatched as a single low-priority proposal per group, keeping the write amplification proportional to the group's own traffic. -- A TTL of 7 days is applied via the existing HLC-based expiry (`store/lsm_store.go:24`). +- Retention is enforced by a KeyViz-specific GC pass, not by assuming ordinary HLC expiry will delete the latest MVCC version. Phase 3 includes either updating `pebbleStore.Compact` to collect latest versions whose `ExpireAt` is past the retention horizon or adding a KeyViz maintenance delete that tombstones expired column and lineage records before compaction. Persistence refuses to enable if this GC capability is absent, avoiding unbounded growth. +- Lineage records are retained while any column in the 7-day retention window references them. The same GC pass prunes closed lineage branches whose `validToHLC` and descendants are older than retention, so frequent split/merge clusters do not accumulate an unbounded lineage tree. - The admin binary, on a history query, fans out to all groups' leaders (§9.1), reconstructs the range timeline from lineage metadata, and merges returned slices by time × key-range overlap. This keeps a hotspot visually continuous even when its serving `RouteID` changed across a `SplitRange` or merge. - For coarsened virtual buckets (§5.3), the column is written to the group owning the bucket's **first** constituent route, with a small index entry under `!admin|keyviz|index|` on the same group so the fan-out reader can discover it. The index entry is the only per-hour write that is shared — but its size is bounded by the route-budget cap, not by total traffic. @@ -229,7 +241,7 @@ This adds roughly a dozen integer fields per tracked operation and avoids both t - **Key Visualizer page**: - X-axis time, Y-axis route buckets, brush-to-zoom on both axes. 
- Series switcher (reads / writes / readBytes / writeBytes). - - Range selection opens a drawer with the underlying route list, current leader, size, and a link to the Raft group page. + - Range selection opens a drawer with the underlying route list, current leader(s), size, and a link to the Raft group page. For `aggregate=true` rows, the drawer explicitly says the row is a coarsened virtual bucket and lists the constituent routes or the truncated route count. - Live mode: a WebSocket push appends a new column every `stepSeconds` without refetching history. - Buckets that miss the ±5% / 95%-CI estimator bound are hatched to signal estimation uncertainty. - **Build**: `web/` at repo root, `pnpm build` output copied to `cmd/elastickv-admin/dist/`, embedded with `//go:embed dist`. @@ -242,29 +254,30 @@ This adds roughly a dozen integer fields per tracked operation and avoids both t | `cmd/elastickv-admin/` (new) | Main, HTTP server, gRPC clients, embedded SPA. | | `adapter/admin_grpc.go` (new) | Server-side implementation of the `Admin` gRPC service, registered in `main.go`. | | `proto/admin.proto` (new) | Service definition for `Admin`. | -| `kv/sharded_coordinator.go` | One-line `sampler.Observe(...)` at dispatch entry; `sampler` is `keyviz.Sampler` injected via constructor, nil-safe. | +| `kv/sharded_coordinator.go` | One-line `sampler.Observe(...)` at dispatch entry; `sampler` is `keyviz.Sampler` injected via constructor, nil-safe. Phase 3 also filters `!admin|*` from `maxLatestCommitTS`. | | `keyviz/` (new) | `Sampler`, adaptive sub-sampler, ring buffer, route-watch subscriber, preview logic, tests. | | `monitoring/live_summary.go` (new) | Rolling-window adapter counters, hooked into existing observers. | -| `main.go` | Register `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers; add `--keyvizMaxTrackedRoutes`. 
| +| `store/lsm_store.go` | Phase 3 system-namespace scan filtering and retention GC support for expired latest versions or KeyViz maintenance tombstones. | +| `main.go` | Register token-protected `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers; add `--adminTokenFile`, `--adminInsecureNoAuth`, and `--keyvizMaxTrackedRoutes`. | | `web/` (new) | Svelte SPA source. | -No changes to Raft, FSM, MVCC, or any protocol adapter beyond the single sampler call site and the `LiveSummary` hook that sits next to the existing Prometheus writes. +No changes to Raft or FSM are required. Data-plane protocol adapters only receive the sampler call site and the `LiveSummary` hook that sits next to existing Prometheus writes. Phase 3 intentionally touches the store/coordinator read paths to keep `!admin|keyviz|*` metadata out of user scans and timestamp selection. ## 9. Deployment and Operation -- The admin binary is not intended to be exposed on the public network in its initial form. It has no auth. Default bind is `127.0.0.1:8080`. -- Typical operator workflow: `ssh -L 8080:localhost:8080 operator@host` then `elastickv-admin --nodes=host1:50051,host2:50051,host3:50051`, or run the binary on a laptop and point it at any reachable subset of nodes. +- The admin binary is not intended to be exposed on the public network in its initial form. Default bind is `127.0.0.1:8080`; browser login and RBAC are deferred, but node-side `Admin` gRPC calls require the shared read-only token from §4. +- Typical operator workflow: `ssh -L 8080:localhost:8080 operator@host` then `elastickv-admin --nodes=host1:50051,host2:50051,host3:50051 --nodeTokenFile=/etc/elastickv/admin.token`, or run the binary on a laptop and point it at any reachable subset of nodes. - The admin binary is stateless; it can be killed and restarted without coordination. - CI produces release artifacts for `linux/amd64`, `linux/arm64`, `darwin/arm64`, and `windows/amd64`. 
### 9.1 Cluster-wide fan-out -Because only the current leader of a Raft group records samples for that group's routes (§5.1), pointing the admin binary at a single node produces a **partial heatmap** covering only the routes that node happens to lead. To give operators a complete view by default, the admin binary runs in **fan-out mode**: +Because writes are recorded by Raft leaders and follower-local reads are recorded by the followers that serve them (§5.1), pointing the admin binary at a single node produces a **partial heatmap**. To give operators a complete view by default, the admin binary runs in **fan-out mode**: - `--nodes` accepts a comma-separated list of seed addresses. The admin binary calls `GetClusterOverview` on any reachable seed to discover the current full membership (node → gRPC endpoint, plus per-group leader identity). - For each query (`GetKeyVizMatrix`, `GetRouteDetail`, `GetAdapterSummary`), the admin binary issues parallel gRPC calls to every known node and merges results server-side before sending one combined JSON payload to the browser. -- Merging rule for the heatmap: each route appears in exactly one node's response (the leader's), so the merge is a concatenation with deduplication on `RouteID`. If two nodes report the same `RouteID` (a leadership change during the query window), the response with the **later** last-sampled timestamp wins, and the other is discarded. -- Degraded mode: if any node is unreachable, the admin binary returns a partial result with a per-node `{node, ok, error}` status array so the UI can surface "3 of 4 nodes responded" instead of silently hiding ranges. The heatmap hatches rows whose owning node failed. +- Merging rule for the heatmap: rows are grouped by `bucketID`/`lineageID` and time step. Read samples from multiple nodes are **summed**, because they represent distinct locally served reads. 
Write samples are grouped by `(bucketID, raftGroupID, leaderTerm, sourceNode, windowStart)` and summed across distinct leader terms during leadership transitions; exact duplicate source keys are deduplicated. The admin binary never uses "later timestamp wins" to overwrite a previous leader's complete window with a new leader's partial window. If two leaders claim overlapping terms for the same group, the cell is returned with `conflict=true` and rendered hatched rather than silently dropping data. +- Degraded mode: if any node is unreachable, the admin binary returns a partial result with a per-node `{node, ok, error}` status array so the UI can surface "3 of 4 nodes responded" instead of silently hiding ranges. The heatmap hatches rows or time windows whose expected source node failed. - A single-node mode (`--nodes=one:50051 --no-fanout`) is retained for operators who explicitly want the partial view. ## 10. Performance Considerations @@ -274,34 +287,35 @@ Because only the current leader of a Raft group records samples for that group's - The flush goroutine performs atomic pointer swaps per tracked route; there is no write lock covering `Observe` calls. Splits and merges publish a copied immutable route table with child counters before publishing the new `RouteID` (§5.4), so the callback does not race with the hot path. - API endpoints cap `to − from` at 7 days and `rows` at 1024 to bound server work. - `LiveSummary` adds a second atomic increment alongside each existing Prometheus `Inc()`, plus one atomic increment on a fixed-bucket histogram counter. Cost is on the order of a nanosecond and well below the noise floor in §5.2. -- Fan-out cost (§9.1) is N parallel gRPC calls; each node already serves `GetKeyVizMatrix` only for its own leader-owned routes, so the response size is distributed and the aggregate wall-clock is bounded by the slowest node, not the sum. 
+- Fan-out cost (§9.1) is N parallel gRPC calls; each node serves only its locally observed samples, so the response size is distributed and the aggregate wall-clock is bounded by the slowest node, not the sum. ## 11. Testing -1. Unit tests for `keyviz.Sampler`: concurrent `Observe` under the `-race` detector while copy-on-write route-table updates run, flush correctness via the pointer-swap protocol, split/merge reshaping, and the **accuracy SLO** (1000 trials of synthetic workload must satisfy ±5% relative error at 95% CI per §5.2). -2. Route-budget test: generate more than `--keyvizMaxTrackedRoutes` routes and assert that coarsening preserves total observed traffic and keeps hot routes un-merged. +1. Unit tests for `keyviz.Sampler`: concurrent `Observe` under the `-race` detector while copy-on-write route-table updates run, flush correctness via the pointer-swap protocol, split/merge reshaping, forwarded-read "already sampled" deduplication, and the **accuracy SLO** (1000 trials of synthetic workload must satisfy ±5% relative error at 95% CI per §5.2). +2. Route-budget test: generate more than `--keyvizMaxTrackedRoutes` routes and assert that coarsening preserves total observed traffic, keeps hot routes un-merged, and returns `aggregate`, `bucketID`, `routeCount`, and constituent route metadata correctly. 3. Integration test in `kv/` that drives synthetic traffic through the coordinator and asserts the matrix reflects the skew. 4. gRPC handler tests with a fake engine and fake Raft status reader. -5. Fan-out test: admin binary against a 3-node fake cluster, including one unreachable node; the merged response must include the partial-status array. -6. Persistence test: write compacted columns to per-range groups, perform split and merge transitions, take a leadership transfer, and verify the lineage reader reconstructs complete history across groups without relying on stable `RouteID`s. -7. Benchmark gate: `BenchmarkCoordinatorDispatch` with sampler off vs on. 
CI fails if the difference exceeds the benchmark's own run-to-run variance. -8. Playwright smoke test against the embedded SPA to catch build-time regressions. +5. Fan-out test: admin binary against a 3-node fake cluster, including follower-local reads, one unreachable node, and a leadership transfer in the middle of a step window; the merged response must sum non-duplicate samples, preserve the partial-status array, and flag ambiguous overlap. +6. Persistence test: write compacted columns to per-range groups, perform split and merge transitions, restart a node, take a leadership transfer, run KeyViz GC, and verify the lineage reader reconstructs complete history across groups without relying on stable `RouteID`s. +7. Namespace isolation test: user `ScanAt`, `ReverseScanAt`, and `maxLatestCommitTS` must ignore `!admin|keyviz|*` records. +8. Auth test: `Admin` gRPC methods reject missing or wrong tokens and accept the configured read-only token. +9. Benchmark gate: `BenchmarkCoordinatorDispatch` with sampler off vs on. CI fails if the difference exceeds the benchmark's own run-to-run variance. +10. Playwright smoke test against the embedded SPA to catch build-time regressions. ## 12. Phased Delivery | Phase | Scope | Exit criteria | |---|---|---| -| 0 | `cmd/elastickv-admin` skeleton, `Admin` gRPC service stub, empty SPA shell, CI wiring. | Binary builds, `/api/cluster/overview` returns live data from a real node. | +| 0 | `cmd/elastickv-admin` skeleton, token-protected `Admin` gRPC service stub, empty SPA shell, CI wiring. | Binary builds, `/api/cluster/overview` returns live data from a real node only when the configured admin token is supplied. | | 1 | Overview, Routes, Raft Groups, Adapters pages. `LiveSummary` added. No sampler. | All read-only pages match `grpcurl` ground truth. | -| 2 | Key Visualizer MVP: in-memory sampler with adaptive sub-sampling, reads/writes series, fan-out across nodes, static matrix API. 
| Benchmark gate green; heatmap shows synthetic hotspot within 2 s of load; ±5% / 95%-CI accuracy SLO holds under synthetic bursts; fan-out returns complete view with 1 node down. | -| 3 | Bytes series, drill-down, split/merge continuity, persistence of compacted columns distributed **per owning Raft group**. | Heatmap remains continuous across a live `SplitRange`; restart preserves last 7 days; no single Raft group sees more than its share of KeyViz writes. | -| 4 (deferred) | Mutating admin operations (`SplitRange` from UI), authentication. Out of scope for this design; a follow-up design will cover it. | — | +| 2 | Key Visualizer MVP: in-memory sampler with adaptive sub-sampling, leader writes, leader/follower reads, fan-out across nodes, static matrix API with virtual-bucket metadata. | Benchmark gate green; heatmap shows synthetic hotspot within 2 s of load; ±5% / 95%-CI accuracy SLO holds under synthetic bursts; fan-out returns complete view with 1 node down. | +| 3 | Bytes series, drill-down, split/merge continuity, namespace-isolated persistence of compacted columns distributed **per owning Raft group**, lineage recovery, and retention GC. | Heatmap remains continuous across a live `SplitRange`; restart preserves last 7 days; expired data and stale lineage records are collected; no single Raft group sees more than its share of KeyViz writes. | +| 4 (deferred) | Mutating admin operations (`SplitRange` from UI), browser login, RBAC, and identity-provider integration. Out of scope for this design; a follow-up design will cover it. | — | Phases 0–2 are the minimum operationally useful product; Phase 3 is the "ship-quality" target. ## 13. Open Questions 1. Default value of `--keyvizMaxTrackedRoutes`. 10 000 is conservative; operators with very large clusters may prefer a higher default paired with shorter retention. Settle during Phase 2 benchmarking. -2. 
Do we want to expose follower-local read traffic separately from leader traffic in Phase 2, or defer that split to Phase 3? -3. In fan-out (§9.1), should the admin binary **pin** to the seed list or dynamically refresh membership from `GetClusterOverview` on every request? Dynamic is more correct during scale events; pinned is simpler and avoids stampedes on the seed. -4. For the Phase 3 persistence schema, should KeyViz writes share a transaction with other per-group low-priority maintenance (compaction metadata, etc.) to amortise Raft cost, or remain a dedicated batch for easier rollback? +2. In fan-out (§9.1), should the admin binary **pin** to the seed list or dynamically refresh membership from `GetClusterOverview` on every request? Dynamic is more correct during scale events; pinned is simpler and avoids stampedes on the seed. +3. For the Phase 3 persistence schema, should KeyViz writes share a transaction with other per-group low-priority maintenance (compaction metadata, etc.) to amortise Raft cost, or remain a dedicated batch for easier rollback? From 11af0646eb7dc5b148106203b03f7dd0e70b0a2f Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 00:27:42 +0900 Subject: [PATCH 05/30] docs: clarify keyviz lineage retention --- docs/admin_ui_key_visualizer_design.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index c6875910..0cb83ceb 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -198,13 +198,13 @@ The API's `rows` parameter is a *target*, not a guarantee. The server walks the ### 5.6 Persistence -Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptable for an MVP and keeps the Raft critical path untouched. +Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptable for an MVP and keeps the Raft critical path untouched. 
Phase 3 changes that contract: persisted lineage records are the source of truth and the sampler rebuilds `RouteID → lineageID` state from them on restart. Phase 3 persists compacted columns **distributed across the user Raft groups themselves, not the default group**. Concentrating KeyViz writes on the default group would centralise I/O and Raft-log growth onto a single group, creating exactly the kind of hotspot this feature is built to surface. Instead: - Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local admin namespace `!admin|keyviz|range||`; the prefix is not routed through the default group. Phase 3 also adds an explicit system-namespace filter so `pebbleStore.ScanAt`, `ReverseScanAt`, and `ShardedCoordinator.maxLatestCommitTS` ignore `!admin|*` records for user-plane requests. This prevents internal metadata from leaking through scans or advancing user transaction timestamps. - `lineageID` is generated as a UUIDv7 using the route transition HLC plus crypto-random entropy, making it cluster-wide unique without coordinating through the default group. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}`; `RouteID` is recorded only as the current routing hint, never as the primary history key. -- Split and merge events append small group-local lineage records under `!admin|keyviz|lineage|`. On split, both children point back to the parent lineage and inherit the parent's compacted history for continuity. On merge, the survivor records both child lineage IDs and the reader sums overlapping intervals. If a node sees historical rows without a lineage record during an upgrade, the admin reader falls back to overlap on the persisted `[start, end)` range before using `RouteID`. +- Split and merge events append small group-local lineage records under `!admin|keyviz|lineage|` and mark closed branches with `validToHLC` so retention GC can later prune them. 
On split, both children point back to the parent lineage and inherit the parent's compacted history for continuity. On merge, the survivor records both child lineage IDs and the reader sums overlapping intervals. If a node sees historical rows without a lineage record during an upgrade, the admin reader falls back to overlap on the persisted `[start, end)` range before using `RouteID`. - On startup, the sampler rebuilds its in-memory `RouteID → lineageID` map by scanning the group-local lineage index for routes currently owned by the node's groups and matching active `[start, end)` ranges from `ListRoutes`. If a route exists without a matching lineage record, the node creates a new lineage record with a parent pointer to the best overlapping retained range. This makes rolling restarts and upgrades preserve historical continuity. - Writes are batched hourly per group (not per flush) and dispatched as a single low-priority proposal per group, keeping the write amplification proportional to the group's own traffic. - Retention is enforced by a KeyViz-specific GC pass, not by assuming ordinary HLC expiry will delete the latest MVCC version. Phase 3 includes either updating `pebbleStore.Compact` to collect latest versions whose `ExpireAt` is past the retention horizon or adding a KeyViz maintenance delete that tombstones expired column and lineage records before compaction. Persistence refuses to enable if this GC capability is absent, avoiding unbounded growth. From 6a88d26336678e9042ec4dc4963e6a9ea984f1a6 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 04:05:55 +0900 Subject: [PATCH 06/30] docs: address keyviz review comments on persistence and isolation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Detail UUIDv7 HLC mapping (physical→ts, logical→rand_a, crypto rand_b). - Use Pebble CompactionFilter as preferred GC path; bounded sweep as fallback. 
- Replace hourly batch with 5 min default + per-node WAL for crash recovery. - Reject user-plane writes to !admin|* in coordinator; widen isPebbleMetaKey to prefix match. --- docs/admin_ui_key_visualizer_design.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index 0cb83ceb..9bab3df4 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -202,12 +202,12 @@ Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptab Phase 3 persists compacted columns **distributed across the user Raft groups themselves, not the default group**. Concentrating KeyViz writes on the default group would centralise I/O and Raft-log growth onto a single group, creating exactly the kind of hotspot this feature is built to surface. Instead: -- Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local admin namespace `!admin|keyviz|range||`; the prefix is not routed through the default group. Phase 3 also adds an explicit system-namespace filter so `pebbleStore.ScanAt`, `ReverseScanAt`, and `ShardedCoordinator.maxLatestCommitTS` ignore `!admin|*` records for user-plane requests. This prevents internal metadata from leaking through scans or advancing user transaction timestamps. -- `lineageID` is generated as a UUIDv7 using the route transition HLC plus crypto-random entropy, making it cluster-wide unique without coordinating through the default group. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}`; `RouteID` is recorded only as the current routing hint, never as the primary history key. +- Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local admin namespace `!admin|keyviz|range||`; the prefix is not routed through the default group. 
Phase 3 also adds an explicit system-namespace filter so `pebbleStore.ScanAt`, `ReverseScanAt`, and `ShardedCoordinator.maxLatestCommitTS` ignore `!admin|*` records for user-plane requests. The current `isPebbleMetaKey` exact-match check (`store/lsm_store.go:299`) is widened to a prefix check on `!admin|`, and the same check is applied in `nextScannableUserKey` / `prevScannableUserKey` so internal KeyViz records are skipped during user-plane scans. To prevent the inverse leak, every data-plane adapter (gRPC `RawKV`/`TransactionalKV`, Redis, DynamoDB, S3) rejects user-plane writes — `Put`, `Delete`, transactional mutations, and Redis equivalents — whose key starts with `!admin|`. The check is centralised in `kv.ShardedCoordinator` so adapters cannot forget it; a write attempting an `!admin|*` key returns `InvalidArgument` and is recorded in the audit metric. +- `lineageID` is generated as a UUIDv7 derived from the route transition HLC, making it cluster-wide unique without coordinating through the default group. To avoid losing precision, the 64-bit HLC is mapped explicitly: the **physical** part (millisecond-resolution wall clock) populates the 48-bit `unix_ts_ms` timestamp field, the low 12 bits of the HLC **logical** counter populate the `rand_a` sequence field (preserving per-millisecond ordering for transitions in the same physical tick), and the remaining 62 bits of `rand_b` come from `crypto/rand`. This avoids the collision and lost-ordering risk of naïvely truncating the HLC into the timestamp field. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}`; `RouteID` is recorded only as the current routing hint, never as the primary history key. - Split and merge events append small group-local lineage records under `!admin|keyviz|lineage|` and mark closed branches with `validToHLC` so retention GC can later prune them. 
On split, both children point back to the parent lineage and inherit the parent's compacted history for continuity. On merge, the survivor records both child lineage IDs and the reader sums overlapping intervals. If a node sees historical rows without a lineage record during an upgrade, the admin reader falls back to overlap on the persisted `[start, end)` range before using `RouteID`. - On startup, the sampler rebuilds its in-memory `RouteID → lineageID` map by scanning the group-local lineage index for routes currently owned by the node's groups and matching active `[start, end)` ranges from `ListRoutes`. If a route exists without a matching lineage record, the node creates a new lineage record with a parent pointer to the best overlapping retained range. This makes rolling restarts and upgrades preserve historical continuity. -- Writes are batched hourly per group (not per flush) and dispatched as a single low-priority proposal per group, keeping the write amplification proportional to the group's own traffic. -- Retention is enforced by a KeyViz-specific GC pass, not by assuming ordinary HLC expiry will delete the latest MVCC version. Phase 3 includes either updating `pebbleStore.Compact` to collect latest versions whose `ExpireAt` is past the retention horizon or adding a KeyViz maintenance delete that tombstones expired column and lineage records before compaction. Persistence refuses to enable if this GC capability is absent, avoiding unbounded growth. +- Writes are batched per group on a configurable interval (`--keyvizPersistInterval`, **default 5 min**, max 1 h) and dispatched as a single low-priority Raft proposal per group, keeping the write amplification proportional to the group's own traffic. Hourly was rejected as the default because a node crash between flushes would lose up to one hour of heatmap; 5 min bounds worst-case loss while still amortising Raft cost. 
As a defence-in-depth against single-point loss, each node also keeps the most recent unflushed window in a small **append-only WAL file** (`/keyviz/wal-.log`) under the same retention contract; on restart the sampler replays the WAL into the in-memory ring buffer, then truncates entries that have since been included in a persisted batch. Operators that want stricter durability set `--keyvizPersistInterval=30s`. +- Retention is enforced by a KeyViz-specific GC pass, not by assuming ordinary HLC expiry will delete the latest MVCC version. Phase 3 prefers a **Pebble `CompactionFilter`** that drops expired `!admin|keyviz|*` versions during normal background compactions — this avoids the I/O and CPU cost of an out-of-band scan-and-delete sweep, since the work happens during compactions that would run anyway. As a fallback for store flavours where a CompactionFilter is unavailable, an opt-in maintenance pass tombstones expired column and lineage records using a bounded, time-budgeted scan (default ≤5% of disk read bandwidth). Persistence refuses to enable if neither path is available, avoiding unbounded growth. - Lineage records are retained while any column in the 7-day retention window references them. The same GC pass prunes closed lineage branches whose `validToHLC` and descendants are older than retention, so frequent split/merge clusters do not accumulate an unbounded lineage tree. - The admin binary, on a history query, fans out to all groups' leaders (§9.1), reconstructs the range timeline from lineage metadata, and merges returned slices by time × key-range overlap. This keeps a hotspot visually continuous even when its serving `RouteID` changed across a `SplitRange` or merge. - For coarsened virtual buckets (§5.3), the column is written to the group owning the bucket's **first** constituent route, with a small index entry under `!admin|keyviz|index|` on the same group so the fan-out reader can discover it. 
The index entry is the only per-hour write that is shared — but its size is bounded by the route-budget cap, not by total traffic. @@ -254,11 +254,11 @@ This adds roughly a dozen integer fields per tracked operation and avoids both t | `cmd/elastickv-admin/` (new) | Main, HTTP server, gRPC clients, embedded SPA. | | `adapter/admin_grpc.go` (new) | Server-side implementation of the `Admin` gRPC service, registered in `main.go`. | | `proto/admin.proto` (new) | Service definition for `Admin`. | -| `kv/sharded_coordinator.go` | One-line `sampler.Observe(...)` at dispatch entry; `sampler` is `keyviz.Sampler` injected via constructor, nil-safe. Phase 3 also filters `!admin|*` from `maxLatestCommitTS`. | -| `keyviz/` (new) | `Sampler`, adaptive sub-sampler, ring buffer, route-watch subscriber, preview logic, tests. | +| `kv/sharded_coordinator.go` | One-line `sampler.Observe(...)` at dispatch entry; `sampler` is `keyviz.Sampler` injected via constructor, nil-safe. Phase 3 also filters `!admin|*` from `maxLatestCommitTS` and rejects user-plane writes (`Put`/`Delete`/transactional mutations) targeting `!admin|*` with `InvalidArgument`, so adapters (gRPC, Redis, DynamoDB, S3) cannot bypass the isolation. | +| `keyviz/` (new) | `Sampler`, adaptive sub-sampler, ring buffer, route-watch subscriber, WAL replay, preview logic, tests. | | `monitoring/live_summary.go` (new) | Rolling-window adapter counters, hooked into existing observers. | -| `store/lsm_store.go` | Phase 3 system-namespace scan filtering and retention GC support for expired latest versions or KeyViz maintenance tombstones. | -| `main.go` | Register token-protected `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers; add `--adminTokenFile`, `--adminInsecureNoAuth`, and `--keyvizMaxTrackedRoutes`. 
| +| `store/lsm_store.go` | Phase 3 widens `isPebbleMetaKey` from exact-match to a prefix check on `!admin|` so `nextScannableUserKey` / `prevScannableUserKey` skip all internal KeyViz records during user-plane scans; adds retention GC (Pebble `CompactionFilter` preferred, time-budgeted maintenance sweep fallback) for expired `!admin|keyviz|*` columns and lineage records. | +| `main.go` | Register token-protected `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers; add `--adminTokenFile`, `--adminInsecureNoAuth`, `--keyvizMaxTrackedRoutes`, and `--keyvizPersistInterval`. | | `web/` (new) | Svelte SPA source. | No changes to Raft or FSM are required. Data-plane protocol adapters only receive the sampler call site and the `LiveSummary` hook that sits next to existing Prometheus writes. Phase 3 intentionally touches the store/coordinator read paths to keep `!admin|keyviz|*` metadata out of user scans and timestamp selection. @@ -297,7 +297,7 @@ Because writes are recorded by Raft leaders and follower-local reads are recorde 4. gRPC handler tests with a fake engine and fake Raft status reader. 5. Fan-out test: admin binary against a 3-node fake cluster, including follower-local reads, one unreachable node, and a leadership transfer in the middle of a step window; the merged response must sum non-duplicate samples, preserve the partial-status array, and flag ambiguous overlap. 6. Persistence test: write compacted columns to per-range groups, perform split and merge transitions, restart a node, take a leadership transfer, run KeyViz GC, and verify the lineage reader reconstructs complete history across groups without relying on stable `RouteID`s. -7.
Namespace isolation test: user `ScanAt`, `ReverseScanAt`, and `maxLatestCommitTS` must ignore `!admin|keyviz|*` records, and user-plane `Put` / `Delete` / transactional writes to any `!admin|*` key must be rejected with `InvalidArgument` by every adapter (gRPC `RawKV`/`TransactionalKV`, Redis, DynamoDB, S3). 8. Auth test: `Admin` gRPC methods reject missing or wrong tokens and accept the configured read-only token. 9. Benchmark gate: `BenchmarkCoordinatorDispatch` with sampler off vs on. CI fails if the difference exceeds the benchmark's own run-to-run variance. 10. Playwright smoke test against the embedded SPA to catch build-time regressions. From cdabf3844e515007a045dcbab8977c085d51d82c Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 04:12:23 +0900 Subject: [PATCH 07/30] docs: address second-round keyviz review - Bound WAL replay: size cap + per-interval checkpoint + shed-on-overflow so rolling restarts stay responsive. - Extend namespace isolation to GetAt/ExistsAt so point reads cannot probe !admin|*. - Use full 16-bit HLC logical when packing UUIDv7 (rand_a + top nibble of rand_b) to keep monotonic ordering. - Resolve fan-out membership open question: dynamic refresh with short-lived cache + lazy invalidation. --- docs/admin_ui_key_visualizer_design.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index 9bab3df4..e0e3a662 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -202,11 +202,11 @@ Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptab Phase 3 persists compacted columns **distributed across the user Raft groups themselves, not the default group**. Concentrating KeyViz writes on the default group would centralise I/O and Raft-log growth onto a single group, creating exactly the kind of hotspot this feature is built to surface. 
Instead: -- Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local admin namespace `!admin|keyviz|range||`; the prefix is not routed through the default group. Phase 3 also adds an explicit system-namespace filter so `pebbleStore.ScanAt`, `ReverseScanAt`, and `ShardedCoordinator.maxLatestCommitTS` ignore `!admin|*` records for user-plane requests. The current `isPebbleMetaKey` exact-match check (`store/lsm_store.go:299`) is widened to a prefix check on `!admin|`, and the same check is applied in `nextScannableUserKey` / `prevScannableUserKey` so internal KeyViz records are skipped during user-plane scans. To prevent the inverse leak, every data-plane adapter (gRPC `RawKV`/`TransactionalKV`, Redis, DynamoDB, S3) rejects user-plane writes — `Put`, `Delete`, transactional mutations, and Redis equivalents — whose key starts with `!admin|`. The check is centralised in `kv.ShardedCoordinator` so adapters cannot forget it; a write attempting an `!admin|*` key returns `InvalidArgument` and is recorded in the audit metric. -- `lineageID` is generated as a UUIDv7 derived from the route transition HLC, making it cluster-wide unique without coordinating through the default group. To avoid losing precision, the 64-bit HLC is mapped explicitly: the **physical** part (millisecond-resolution wall clock) populates the 48-bit `unix_ts_ms` timestamp field, the low 12 bits of the HLC **logical** counter populate the `rand_a` sequence field (preserving per-millisecond ordering for transitions in the same physical tick), and the remaining 62 bits of `rand_b` come from `crypto/rand`. This avoids the collision and lost-ordering risk of naïvely truncating the HLC into the timestamp field. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}`; `RouteID` is recorded only as the current routing hint, never as the primary history key. 
+- Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local admin namespace `!admin|keyviz|range||`; the prefix is not routed through the default group. Phase 3 also adds an explicit system-namespace filter so every user-plane read and timestamp-selection path — `pebbleStore.ScanAt`, `ReverseScanAt`, `GetAt`, `ExistsAt`, and `ShardedCoordinator.maxLatestCommitTS` — ignores `!admin|*` records; point reads that target an `!admin|*` key return `NotFound` as if the key did not exist, so an attacker cannot distinguish "hidden" from "missing". The current `isPebbleMetaKey` exact-match check (`store/lsm_store.go:299`) is widened to a prefix check on `!admin|`, and the same check is applied in `nextScannableUserKey` / `prevScannableUserKey` so internal KeyViz records are skipped during user-plane scans. To prevent the inverse leak, every data-plane adapter (gRPC `RawKV`/`TransactionalKV`, Redis, DynamoDB, S3) rejects user-plane writes — `Put`, `Delete`, transactional mutations, and Redis equivalents — whose key starts with `!admin|`. The check is centralised in `kv.ShardedCoordinator` so adapters cannot forget it; a write attempting an `!admin|*` key returns `InvalidArgument` and is recorded in the audit metric. +- `lineageID` is generated as a UUIDv7 derived from the route transition HLC, making it cluster-wide unique without coordinating through the default group. To avoid losing precision, the 64-bit HLC is mapped explicitly: the **physical** part (millisecond-resolution wall clock) populates the 48-bit `unix_ts_ms` timestamp field, and the full HLC **logical** counter (Elastickv's HLC uses a 16-bit logical) populates `rand_a` (12 bits) **concatenated** with the top 4 bits of `rand_b`'s per-UUID random payload — i.e. logical bits `[15:4]` → `rand_a`, logical bits `[3:0]` → the top nibble of `rand_b`, so the full 16-bit logical is preserved inside the UUID body. The remaining 58 bits of `rand_b` come from `crypto/rand`. 
This preserves the HLC monotonic ordering guarantee within a single millisecond (up to 65 535 transitions per ms per node, well beyond the observed split/merge rate) and still gives ~2^58 random bits to keep collision probability negligible. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}` with `validFromHLC` carrying the full HLC so the reader can re-sort authoritatively; `RouteID` is recorded only as the current routing hint, never as the primary history key. - Split and merge events append small group-local lineage records under `!admin|keyviz|lineage|` and mark closed branches with `validToHLC` so retention GC can later prune them. On split, both children point back to the parent lineage and inherit the parent's compacted history for continuity. On merge, the survivor records both child lineage IDs and the reader sums overlapping intervals. If a node sees historical rows without a lineage record during an upgrade, the admin reader falls back to overlap on the persisted `[start, end)` range before using `RouteID`. - On startup, the sampler rebuilds its in-memory `RouteID → lineageID` map by scanning the group-local lineage index for routes currently owned by the node's groups and matching active `[start, end)` ranges from `ListRoutes`. If a route exists without a matching lineage record, the node creates a new lineage record with a parent pointer to the best overlapping retained range. This makes rolling restarts and upgrades preserve historical continuity. -- Writes are batched per group on a configurable interval (`--keyvizPersistInterval`, **default 5 min**, max 1 h) and dispatched as a single low-priority Raft proposal per group, keeping the write amplification proportional to the group's own traffic. Hourly was rejected as the default because a node crash between flushes would lose up to one hour of heatmap; 5 min bounds worst-case loss while still amortising Raft cost. 
As a defence-in-depth against single-point loss, each node also keeps the most recent unflushed window in a small **append-only WAL file** (`/keyviz/wal-.log`) under the same retention contract; on restart the sampler replays the WAL into the in-memory ring buffer, then truncates entries that have since been included in a persisted batch. Operators that want stricter durability set `--keyvizPersistInterval=30s`. +- Writes are batched per group on a configurable interval (`--keyvizPersistInterval`, **default 5 min**, max 1 h) and dispatched as a single low-priority Raft proposal per group, keeping the write amplification proportional to the group's own traffic. Hourly was rejected as the default because a node crash between flushes would lose up to one hour of heatmap; 5 min bounds worst-case loss while still amortising Raft cost. As a defence-in-depth against single-point loss, each node also keeps the most recent unflushed window in a small **append-only WAL file** (`/keyviz/wal-.log`) under the same retention contract, with two hard bounds to keep restart fast: the WAL is **size-capped at `--keyvizWALMaxBytes` (default 64 MiB)** and **checkpointed every `--keyvizPersistInterval`** — when a batch is persisted to Raft, the corresponding WAL prefix is truncated. This caps worst-case replay at one interval's worth of data (at the default, tens of MiB at most), and a target recovery budget of **≤1 s replay time at 1 M ops/s**. If the WAL exceeds its size cap before the next flush — indicating the node is behind on persistence — the sampler drops the oldest records and records a `keyviz_wal_shed_total` metric instead of blocking the hot path. On startup the sampler fast-loads the WAL without running the adaptive controller, then resumes normal operation; readiness is gated on WAL replay completion so rolling upgrades do not route traffic to a node that is still rebuilding state. 
Operators that want stricter durability set `--keyvizPersistInterval=30s`; those that want faster restart at the cost of more write amplification set a smaller `--keyvizWALMaxBytes`. - Retention is enforced by a KeyViz-specific GC pass, not by assuming ordinary HLC expiry will delete the latest MVCC version. Phase 3 prefers a **Pebble `CompactionFilter`** that drops expired `!admin|keyviz|*` versions during normal background compactions — this avoids the I/O and CPU cost of an out-of-band scan-and-delete sweep, since the work happens during compactions that would run anyway. As a fallback for store flavours where a CompactionFilter is unavailable, an opt-in maintenance pass tombstones expired column and lineage records using a bounded, time-budgeted scan (default ≤5% of disk read bandwidth). Persistence refuses to enable if neither path is available, avoiding unbounded growth. - Lineage records are retained while any column in the 7-day retention window references them. The same GC pass prunes closed lineage branches whose `validToHLC` and descendants are older than retention, so frequent split/merge clusters do not accumulate an unbounded lineage tree. - The admin binary, on a history query, fans out to all groups' leaders (§9.1), reconstructs the range timeline from lineage metadata, and merges returned slices by time × key-range overlap. This keeps a hotspot visually continuous even when its serving `RouteID` changed across a `SplitRange` or merge. @@ -258,7 +258,7 @@ This adds roughly a dozen integer fields per tracked operation and avoids both t | `keyviz/` (new) | `Sampler`, adaptive sub-sampler, ring buffer, route-watch subscriber, WAL replay, preview logic, tests. | | `monitoring/live_summary.go` (new) | Rolling-window adapter counters, hooked into existing observers. 
| | `store/lsm_store.go` | Phase 3 widens `isPebbleMetaKey` from exact-match to a prefix check on `!admin|` so `nextScannableUserKey` / `prevScannableUserKey` skip all internal KeyViz records during user-plane scans; adds retention GC (Pebble `CompactionFilter` preferred, time-budgeted maintenance sweep fallback) for expired `!admin|keyviz|*` columns and lineage records. | -| `main.go` | Register token-protected `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers; add `--adminTokenFile`, `--adminInsecureNoAuth`, `--keyvizMaxTrackedRoutes`, and `--keyvizPersistInterval`. | +| `main.go` | Register token-protected `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers; add `--adminTokenFile`, `--adminInsecureNoAuth`, `--keyvizMaxTrackedRoutes`, `--keyvizPersistInterval`, and `--keyvizWALMaxBytes`. | | `web/` (new) | Svelte SPA source. | No changes to Raft or FSM are required. Data-plane protocol adapters only receive the sampler call site and the `LiveSummary` hook that sits next to existing Prometheus writes. Phase 3 intentionally touches the store/coordinator read paths to keep `!admin|keyviz|*` metadata out of user scans and timestamp selection. @@ -274,7 +274,7 @@ No changes to Raft or FSM are required. Data-plane protocol adapters only receiv Because writes are recorded by Raft leaders and follower-local reads are recorded by the followers that serve them (§5.1), pointing the admin binary at a single node produces a **partial heatmap**. To give operators a complete view by default, the admin binary runs in **fan-out mode**: -- `--nodes` accepts a comma-separated list of seed addresses. The admin binary calls `GetClusterOverview` on any reachable seed to discover the current full membership (node → gRPC endpoint, plus per-group leader identity). +- `--nodes` accepts a comma-separated list of seed addresses. 
The admin binary calls `GetClusterOverview` on any reachable seed to discover the current full membership (node → gRPC endpoint, plus per-group leader identity). Membership is cached for `--nodesRefreshInterval` (**default 15 s**) so a stampede of concurrent browser requests hits at most one `GetClusterOverview` per interval per seed, while scale-out events are still reflected within seconds. The cache is refreshed lazily on the first request after expiry and invalidated immediately on any per-node `Unavailable` error, so removed or replaced nodes are dropped on the next request instead of waiting for the next tick. - For each query (`GetKeyVizMatrix`, `GetRouteDetail`, `GetAdapterSummary`), the admin binary issues parallel gRPC calls to every known node and merges results server-side before sending one combined JSON payload to the browser. - Merging rule for the heatmap: rows are grouped by `bucketID`/`lineageID` and time step. Read samples from multiple nodes are **summed**, because they represent distinct locally served reads. Write samples are grouped by `(bucketID, raftGroupID, leaderTerm, sourceNode, windowStart)` and summed across distinct leader terms during leadership transitions; exact duplicate source keys are deduplicated. The admin binary never uses "later timestamp wins" to overwrite a previous leader's complete window with a new leader's partial window. If two leaders claim overlapping terms for the same group, the cell is returned with `conflict=true` and rendered hatched rather than silently dropping data. - Degraded mode: if any node is unreachable, the admin binary returns a partial result with a per-node `{node, ok, error}` status array so the UI can surface "3 of 4 nodes responded" instead of silently hiding ranges. The heatmap hatches rows or time windows whose expected source node failed. @@ -317,5 +317,4 @@ Phases 0–2 are the minimum operationally useful product; Phase 3 is the "ship- ## 13. Open Questions 1. 
Default value of `--keyvizMaxTrackedRoutes`. 10 000 is conservative; operators with very large clusters may prefer a higher default paired with shorter retention. Settle during Phase 2 benchmarking. -2. In fan-out (§9.1), should the admin binary **pin** to the seed list or dynamically refresh membership from `GetClusterOverview` on every request? Dynamic is more correct during scale events; pinned is simpler and avoids stampedes on the seed. -3. For the Phase 3 persistence schema, should KeyViz writes share a transaction with other per-group low-priority maintenance (compaction metadata, etc.) to amortise Raft cost, or remain a dedicated batch for easier rollback? +2. For the Phase 3 persistence schema, should KeyViz writes share a transaction with other per-group low-priority maintenance (compaction metadata, etc.) to amortise Raft cost, or remain a dedicated batch for easier rollback? From ddd5039a29d10d5770f6433dfb6346e987271349 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 04:18:52 +0900 Subject: [PATCH 08/30] docs: address coderabbit comments on leader HLC, lineageID, leadership loss MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Leadership loss: mark in-window leaderWrite samples staleLeader=true; fan-out keys writes by (group, term, window) so stale + fresh samples never double-count. - CPU attribution: use synthetic costPerObserveNs × observeCount rather than runtime profiling to keep hot path clean. - lineageID is generated once by the Raft leader as part of the split/merge proposal (leader-issued HLC), deterministically derived from HLC + log index + BLAKE2b so every replica computes the same UUIDv7. 
--- docs/admin_ui_key_visualizer_design.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index e0e3a662..d6e9dfa2 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -142,6 +142,8 @@ sampler.Observe(routeID, op, keyLen, valueLen) Writes are sampled exactly once by the current Raft leader before proposal. Reads are sampled by the node that actually serves the read: leader reads are marked `leaderRead`, and lease/follower-local reads are marked `followerRead`. Requests forwarded between nodes carry an internal "already sampled" marker so a logical operation is not counted twice. Because read load can be spread across followers, a cluster-wide heatmap requires the admin binary to fan out and merge across nodes (§9.1) — pointing at a single node would produce a partial view. +**Leadership loss.** Each sample carries the `(raftGroupID, leaderTerm)` under which it was recorded. When the node's lease-loss callback fires for a group, the sampler stamps all `leaderWrite` samples for that group in the current and previous step window with `staleLeader=true` rather than deleting them — keeping them visible on the heatmap helps operators diagnose rapid leadership churn, and they remain authoritative for the window in which this node was in fact the leader. The admin fan-out (§9.1) merges writes by `(bucketID, raftGroupID, leaderTerm, windowStart)`, so the stale samples from an old leader and the fresh samples from a new leader never double-count: distinct terms are summed (each term's leader only saw its own term's writes), and within a single term the one leader's samples are authoritative. If fan-out receives `staleLeader=true` samples that conflict with a concurrent newer-term sample for the same window, the cell is flagged `conflict=true` and rendered hatched. 
+ The hot path uses lock-free reads for route lookup and counter increments. The data structures used are: - **Current-window counters**: `routes` is an immutable `routeTable` published through `atomic.Pointer[routeTable]`. `routeTable` owns `map[RouteID]*routeSlot`; each `routeSlot` owns an `atomic.Pointer[routeCounters]`. `Observe` loads the current table, performs a plain map lookup against that immutable snapshot, loads the slot's counter pointer, and uses `atomic.AddUint64` on the counter fields. Adding a new `RouteID` or replacing split/merge mappings performs a copy-on-write table update under a non-hot-path `routesMu`, then publishes the new table with one atomic store. No `Observe` call ever runs against a Go map that can be mutated concurrently. @@ -158,7 +160,7 @@ The capture rate itself is not the SLO — at `sampleRate = 8` the raw capture r For Poisson-ish traffic, the relative error of the Horvitz–Thompson estimator is approximately `1 / sqrt(acceptedSamples)` for 1-in-N sub-sampling where N > 1. Setting this ≤0.05 at 95% CI gives a required `acceptedSamples ≥ (1.96 / 0.05)² ≈ 1537`, independent of the current 1-in-N rate. Buckets sampled at `sampleRate = 1` are exact and do not need the bound. The adaptive controller enforces this by never raising `sampleRate` past the point where the most recent window's `acceptedSamples` falls below that bound; if a burst violates the bound the affected buckets are flagged in the response and the UI renders them hatched so the operator knows the estimate is soft. -`sampleRate` only rises at all when the previous flush window's CPU attributed to `Observe` crosses a measured threshold. In steady state with moderate per-route QPS, `sampleRate` stays at 1 and every op is counted. +`sampleRate` only rises at all when the previous flush window's estimated `Observe` cost crosses a measured threshold. 
To avoid adding profiling overhead to the hot path, the cost is estimated with a **synthetic model** (no runtime profiler involved): at startup `BenchmarkCoordinatorDispatch` with the sampler enabled records `costPerObserveNs` once, and each flush window computes `estimatedObserveCPU = Σ_routes(observeCount × costPerObserveNs)` directly from the counters already being harvested. This is exact up to the benchmarked cost constant and zero-overhead at runtime. In steady state with moderate per-route QPS, `sampleRate` stays at 1 and every op is counted. Benchmark gate in CI: `BenchmarkCoordinatorDispatch` with sampler off vs on; the delta must stay within run-to-run variance. Separately, a correctness test drives a known synthetic workload through a sub-sampling sampler and asserts the ±5% / 95%-CI bound holds across 1000 trials. @@ -203,7 +205,8 @@ Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptab Phase 3 persists compacted columns **distributed across the user Raft groups themselves, not the default group**. Concentrating KeyViz writes on the default group would centralise I/O and Raft-log growth onto a single group, creating exactly the kind of hotspot this feature is built to surface. Instead: - Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local admin namespace `!admin|keyviz|range||`; the prefix is not routed through the default group. Phase 3 also adds an explicit system-namespace filter so every user-plane read and timestamp-selection path — `pebbleStore.ScanAt`, `ReverseScanAt`, `GetAt`, `ExistsAt`, and `ShardedCoordinator.maxLatestCommitTS` — ignores `!admin|*` records; point reads that target an `!admin|*` key return `NotFound` as if the key did not exist, so an attacker cannot distinguish "hidden" from "missing". 
The current `isPebbleMetaKey` exact-match check (`store/lsm_store.go:299`) is widened to a prefix check on `!admin|`, and the same check is applied in `nextScannableUserKey` / `prevScannableUserKey` so internal KeyViz records are skipped during user-plane scans. To prevent the inverse leak, every data-plane adapter (gRPC `RawKV`/`TransactionalKV`, Redis, DynamoDB, S3) rejects user-plane writes — `Put`, `Delete`, transactional mutations, and Redis equivalents — whose key starts with `!admin|`. The check is centralised in `kv.ShardedCoordinator` so adapters cannot forget it; a write attempting an `!admin|*` key returns `InvalidArgument` and is recorded in the audit metric. -- `lineageID` is generated as a UUIDv7 derived from the route transition HLC, making it cluster-wide unique without coordinating through the default group. To avoid losing precision, the 64-bit HLC is mapped explicitly: the **physical** part (millisecond-resolution wall clock) populates the 48-bit `unix_ts_ms` timestamp field, and the full HLC **logical** counter (Elastickv's HLC uses a 16-bit logical) populates `rand_a` (12 bits) **concatenated** with the top 4 bits of `rand_b`'s per-UUID random payload — i.e. logical bits `[15:4]` → `rand_a`, logical bits `[3:0]` → the top nibble of `rand_b`, so the full 16-bit logical is preserved inside the UUID body. The remaining 58 bits of `rand_b` come from `crypto/rand`. This preserves the HLC monotonic ordering guarantee within a single millisecond (up to 65 535 transitions per ms per node, well beyond the observed split/merge rate) and still gives ~2^58 random bits to keep collision probability negligible. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs}` with `validFromHLC` carrying the full HLC so the reader can re-sort authoritatively; `RouteID` is recorded only as the current routing hint, never as the primary history key. 
+- `lineageID` is generated **exactly once, by the Raft leader proposing the split/merge**, as part of the route-transition command itself, and then stored in the Raft log — so every replica reads the same value instead of regenerating it. This avoids violating the repository invariant that persistence timestamps must originate from the Raft leader, not from a node-local clock. The transition HLC used is the **leader-issued HLC stamped onto the `SplitRange`/`MergeRange` Raft proposal** (same HLC that backs OCC decisions), never a node-local snapshot; followers observe the lineageID by replaying the committed command. +- The UUIDv7 is derived deterministically from that leader-issued HLC plus the proposal's Raft log index so the same transition yields the same lineageID on every replica and on re-proposal: the 48-bit `unix_ts_ms` field gets the HLC physical part (ms resolution), and the full 16-bit HLC logical counter is packed across `rand_a` (12 bits) and the top nibble of `rand_b` — logical bits `[15:4]` into `rand_a`, logical bits `[3:0]` into the top 4 bits of `rand_b`, so no logical bits are dropped. The remaining 58 bits of `rand_b` are filled from `BLAKE2b-256(raftGroupID || raftLogIndex || proposalBytes)` truncated to 58 bits — deterministic across replicas, collision-resistant across transitions, and no runtime RNG dependency. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs, raftLogIndex}` with `validFromHLC` carrying the full HLC so the reader can re-sort authoritatively; `RouteID` is recorded only as the current routing hint, never as the primary history key. - Split and merge events append small group-local lineage records under `!admin|keyviz|lineage|` and mark closed branches with `validToHLC` so retention GC can later prune them. On split, both children point back to the parent lineage and inherit the parent's compacted history for continuity. 
On merge, the survivor records both child lineage IDs and the reader sums overlapping intervals. If a node sees historical rows without a lineage record during an upgrade, the admin reader falls back to overlap on the persisted `[start, end)` range before using `RouteID`.
- On startup, the sampler rebuilds its in-memory `RouteID → lineageID` map by scanning the group-local lineage index for routes currently owned by the node's groups and matching active `[start, end)` ranges from `ListRoutes`. If a route exists without a matching lineage record, the node creates a new lineage record with a parent pointer to the best overlapping retained range. This makes rolling restarts and upgrades preserve historical continuity.
- Writes are batched per group on a configurable interval (`--keyvizPersistInterval`, **default 5 min**, max 1 h) and dispatched as a single low-priority Raft proposal per group, keeping the write amplification proportional to the group's own traffic. Hourly was rejected as the default because a node crash between flushes would lose up to one hour of heatmap; 5 min bounds worst-case loss while still amortising Raft cost. As a defence-in-depth against single-point loss, each node also keeps the most recent unflushed window in a small **append-only WAL file** (`<dataDir>/keyviz/wal-<timestamp>.log`) under the same retention contract, with two hard bounds to keep restart fast: the WAL is **size-capped at `--keyvizWALMaxBytes` (default 64 MiB)** and **checkpointed every `--keyvizPersistInterval`** — when a batch is persisted to Raft, the corresponding WAL prefix is truncated. This caps worst-case replay at one interval's worth of data (at the default, tens of MiB at most), and sets a target recovery budget of **≤1 s replay time at 1 M ops/s**. If the WAL exceeds its size cap before the next flush — indicating the node is behind on persistence — the sampler drops the oldest records and records a `keyviz_wal_shed_total` metric instead of blocking the hot path.
On startup the sampler fast-loads the WAL without running the adaptive controller, then resumes normal operation; readiness is gated on WAL replay completion so rolling upgrades do not route traffic to a node that is still rebuilding state. Operators that want stricter durability set `--keyvizPersistInterval=30s`, accepting the extra Raft write amplification of more frequent flushes; those that want a faster worst-case restart set a smaller `--keyvizWALMaxBytes`, accepting that a node that falls behind on persistence starts shedding unflushed records sooner.

From 763fd3ac2132c061f21419fbbb99ff63281899a1 Mon Sep 17 00:00:00 2001
From: "Yoshiaki Ueda (bootjp)"
Date: Wed, 22 Apr 2026 04:30:37 +0900
Subject: [PATCH 09/30] feat(admin): phase 0 skeleton for elastickv-admin and
 Admin gRPC service
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- proto/admin.proto: full Admin service surface per design §4; Phase 0
  implements GetClusterOverview and GetRaftGroups, the rest fall through
  Unimplemented.
- adapter/admin_grpc.go: server + bearer-token interceptor (constant-time
  compare, scoped to /Admin/*).
- cmd/elastickv-admin: binary with
  --nodes/--nodeTokenFile/--adminInsecureNoAuth/--nodesRefreshInterval,
  HTTP /api/cluster/overview fanning out over gRPC.
- Tests cover token gating edge cases and overview fan-out shape.
--- adapter/admin_grpc.go | 192 ++++ adapter/admin_grpc_test.go | 141 +++ cmd/elastickv-admin/main.go | 257 +++++ cmd/elastickv-admin/main_test.go | 77 ++ proto/Makefile | 3 + proto/admin.pb.go | 1529 ++++++++++++++++++++++++++++++ proto/admin.proto | 149 +++ proto/admin_grpc.pb.go | 325 +++++++ 8 files changed, 2673 insertions(+) create mode 100644 adapter/admin_grpc.go create mode 100644 adapter/admin_grpc_test.go create mode 100644 cmd/elastickv-admin/main.go create mode 100644 cmd/elastickv-admin/main_test.go create mode 100644 proto/admin.pb.go create mode 100644 proto/admin.proto create mode 100644 proto/admin_grpc.pb.go diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go new file mode 100644 index 00000000..be43baca --- /dev/null +++ b/adapter/admin_grpc.go @@ -0,0 +1,192 @@ +package adapter + +import ( + "context" + "crypto/subtle" + "strings" + "sync" + + "github.com/bootjp/elastickv/internal/raftengine" + pb "github.com/bootjp/elastickv/proto" + "github.com/cockroachdb/errors" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/metadata" + "google.golang.org/grpc/status" +) + +// AdminGroup exposes per-Raft-group state to the Admin service. It is a narrow +// subset of raftengine.Engine so tests can supply an in-memory fake without +// standing up a real Raft cluster. +type AdminGroup interface { + Status() raftengine.Status +} + +// NodeIdentity is the value form of the protobuf NodeIdentity message used for +// AdminServer configuration. It avoids copying pb.NodeIdentity, which embeds a +// protoimpl.MessageState (and a mutex). +type NodeIdentity struct { + NodeID string + GRPCAddress string +} + +func (n NodeIdentity) toProto() *pb.NodeIdentity { + return &pb.NodeIdentity{NodeId: n.NodeID, GrpcAddress: n.GRPCAddress} +} + +// AdminServer implements the node-side Admin gRPC service described in +// docs/admin_ui_key_visualizer_design.md §4 (Layer A). 
Phase 0 only implements +// GetClusterOverview and GetRaftGroups; remaining RPCs return Unimplemented so +// the generated client can still compile against older nodes during rollout. +type AdminServer struct { + self NodeIdentity + members []NodeIdentity + + groupsMu sync.RWMutex + groups map[uint64]AdminGroup + + pb.UnimplementedAdminServer +} + +// NewAdminServer constructs an AdminServer. `self` identifies the local node +// for responses that return node identity. `members` is the static membership +// snapshot shipped to the admin binary; callers that already have a membership +// source may pass nil and let the admin binary's fan-out layer discover peers +// by other means. +func NewAdminServer(self NodeIdentity, members []NodeIdentity) *AdminServer { + cloned := append([]NodeIdentity(nil), members...) + return &AdminServer{ + self: self, + members: cloned, + groups: make(map[uint64]AdminGroup), + } +} + +// RegisterGroup binds a Raft group ID to its engine so the Admin service can +// report leader and log state for that group. +func (s *AdminServer) RegisterGroup(groupID uint64, g AdminGroup) { + if g == nil { + return + } + s.groupsMu.Lock() + s.groups[groupID] = g + s.groupsMu.Unlock() +} + +// GetClusterOverview returns the local node identity, the configured member +// list, and per-group leader identity collected from the engines registered +// via RegisterGroup. +func (s *AdminServer) GetClusterOverview( + _ context.Context, + _ *pb.GetClusterOverviewRequest, +) (*pb.GetClusterOverviewResponse, error) { + leaders := s.snapshotLeaders() + members := make([]*pb.NodeIdentity, 0, len(s.members)) + for _, m := range s.members { + members = append(members, m.toProto()) + } + return &pb.GetClusterOverviewResponse{ + Self: s.self.toProto(), + Members: members, + GroupLeaders: leaders, + }, nil +} + +// GetRaftGroups returns per-group state snapshots. Phase 0 wires commit/applied +// indices only; per-follower contact and term history land in later phases. 
+func (s *AdminServer) GetRaftGroups( + _ context.Context, + _ *pb.GetRaftGroupsRequest, +) (*pb.GetRaftGroupsResponse, error) { + s.groupsMu.RLock() + out := make([]*pb.RaftGroupState, 0, len(s.groups)) + for id, g := range s.groups { + st := g.Status() + out = append(out, &pb.RaftGroupState{ + RaftGroupId: id, + LeaderNodeId: st.Leader.ID, + LeaderTerm: st.Term, + CommitIndex: st.CommitIndex, + AppliedIndex: st.AppliedIndex, + }) + } + s.groupsMu.RUnlock() + return &pb.GetRaftGroupsResponse{Groups: out}, nil +} + +func (s *AdminServer) snapshotLeaders() []*pb.GroupLeader { + s.groupsMu.RLock() + defer s.groupsMu.RUnlock() + out := make([]*pb.GroupLeader, 0, len(s.groups)) + for id, g := range s.groups { + st := g.Status() + out = append(out, &pb.GroupLeader{ + RaftGroupId: id, + LeaderNodeId: st.Leader.ID, + LeaderTerm: st.Term, + }) + } + return out +} + +// AdminTokenAuth builds a gRPC unary+stream interceptor pair enforcing +// "authorization: Bearer " metadata against the supplied token. An +// empty token disables enforcement; callers should pair that mode with a +// --adminInsecureNoAuth flag so operators knowingly opt in. 
+func AdminTokenAuth(token string) (grpc.UnaryServerInterceptor, grpc.StreamServerInterceptor) { + if token == "" { + return nil, nil + } + expected := []byte(token) + check := func(ctx context.Context) error { + md, ok := metadata.FromIncomingContext(ctx) + if !ok { + return status.Error(codes.Unauthenticated, "missing authorization metadata") + } + values := md.Get("authorization") + if len(values) == 0 { + return status.Error(codes.Unauthenticated, "missing authorization header") + } + got, ok := strings.CutPrefix(values[0], "Bearer ") + if !ok { + return status.Error(codes.Unauthenticated, "authorization is not a bearer token") + } + if subtle.ConstantTimeCompare([]byte(got), expected) != 1 { + return status.Error(codes.Unauthenticated, "invalid admin token") + } + return nil + } + unary := func( + ctx context.Context, + req any, + info *grpc.UnaryServerInfo, + handler grpc.UnaryHandler, + ) (any, error) { + if !strings.HasPrefix(info.FullMethod, "/Admin/") { + return handler(ctx, req) + } + if err := check(ctx); err != nil { + return nil, err + } + return handler(ctx, req) + } + stream := func( + srv any, + ss grpc.ServerStream, + info *grpc.StreamServerInfo, + handler grpc.StreamHandler, + ) error { + if !strings.HasPrefix(info.FullMethod, "/Admin/") { + return handler(srv, ss) + } + if err := check(ss.Context()); err != nil { + return err + } + return handler(srv, ss) + } + return unary, stream +} + +// ErrAdminTokenRequired is returned by NewAdminServer helpers when the operator +// failed to supply a token and also did not opt into insecure mode. 
+var ErrAdminTokenRequired = errors.New("admin token file required; pass --adminInsecureNoAuth to run without") diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go new file mode 100644 index 00000000..3001ebec --- /dev/null +++ b/adapter/admin_grpc_test.go @@ -0,0 +1,141 @@ +package adapter + +import ( + "context" + "testing" + + "github.com/bootjp/elastickv/internal/raftengine" + pb "github.com/bootjp/elastickv/proto" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/metadata" + "google.golang.org/grpc/status" +) + +type fakeGroup struct { + leaderID string + term uint64 + commit uint64 + applied uint64 +} + +func (f fakeGroup) Status() raftengine.Status { + return raftengine.Status{ + Leader: raftengine.LeaderInfo{ID: f.leaderID}, + Term: f.term, + CommitIndex: f.commit, + AppliedIndex: f.applied, + } +} + +func TestGetClusterOverviewReturnsSelfAndLeaders(t *testing.T) { + t.Parallel() + srv := NewAdminServer( + NodeIdentity{NodeID: "node-a", GRPCAddress: "127.0.0.1:50051"}, + []NodeIdentity{{NodeID: "node-b", GRPCAddress: "127.0.0.1:50052"}}, + ) + srv.RegisterGroup(1, fakeGroup{leaderID: "node-a", term: 7}) + srv.RegisterGroup(2, fakeGroup{leaderID: "node-b", term: 3}) + + resp, err := srv.GetClusterOverview(context.Background(), &pb.GetClusterOverviewRequest{}) + if err != nil { + t.Fatalf("GetClusterOverview: %v", err) + } + if resp.Self.NodeId != "node-a" { + t.Fatalf("self = %q, want node-a", resp.Self.NodeId) + } + if len(resp.Members) != 1 || resp.Members[0].NodeId != "node-b" { + t.Fatalf("members = %v, want [node-b]", resp.Members) + } + if len(resp.GroupLeaders) != 2 { + t.Fatalf("group_leaders count = %d, want 2", len(resp.GroupLeaders)) + } +} + +func TestGetRaftGroupsExposesCommitApplied(t *testing.T) { + t.Parallel() + srv := NewAdminServer(NodeIdentity{NodeID: "n1"}, nil) + srv.RegisterGroup(1, fakeGroup{leaderID: "n1", term: 2, commit: 99, applied: 97}) + + resp, err := 
srv.GetRaftGroups(context.Background(), &pb.GetRaftGroupsRequest{}) + if err != nil { + t.Fatalf("GetRaftGroups: %v", err) + } + if len(resp.Groups) != 1 { + t.Fatalf("groups = %d, want 1", len(resp.Groups)) + } + g := resp.Groups[0] + if g.CommitIndex != 99 || g.AppliedIndex != 97 || g.LeaderTerm != 2 { + t.Fatalf("unexpected state %+v", g) + } +} + +func TestAdminTokenAuth(t *testing.T) { + t.Parallel() + unary, _ := AdminTokenAuth("s3cret") + if unary == nil { + t.Fatal("interceptor should be non-nil for configured token") + } + + info := &grpc.UnaryServerInfo{FullMethod: "/Admin/GetClusterOverview"} + handler := func(_ context.Context, _ any) (any, error) { return "ok", nil } + + cases := []struct { + name string + md metadata.MD + code codes.Code + call bool + }{ + {"missing metadata", nil, codes.Unauthenticated, false}, + {"missing header", metadata.Pairs(), codes.Unauthenticated, false}, + {"wrong scheme", metadata.Pairs("authorization", "Basic zzz"), codes.Unauthenticated, false}, + {"wrong token", metadata.Pairs("authorization", "Bearer nope"), codes.Unauthenticated, false}, + {"correct", metadata.Pairs("authorization", "Bearer s3cret"), codes.OK, true}, + } + for _, tc := range cases { + + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + ctx := context.Background() + if tc.md != nil { + ctx = metadata.NewIncomingContext(ctx, tc.md) + } + resp, err := unary(ctx, nil, info, handler) + if tc.code == codes.OK { + if err != nil { + t.Fatalf("want OK, got %v", err) + } + if resp != "ok" { + t.Fatalf("handler not called: resp=%v", resp) + } + return + } + if status.Code(err) != tc.code { + t.Fatalf("code = %v, want %v (err=%v)", status.Code(err), tc.code, err) + } + }) + } +} + +func TestAdminTokenAuthSkipsOtherServices(t *testing.T) { + t.Parallel() + unary, _ := AdminTokenAuth("s3cret") + info := &grpc.UnaryServerInfo{FullMethod: "/RawKV/Get"} + handler := func(_ context.Context, _ any) (any, error) { return "ok", nil } + + resp, err := 
unary(context.Background(), nil, info, handler) + if err != nil { + t.Fatalf("non-admin method should not be gated: %v", err) + } + if resp != "ok" { + t.Fatalf("handler not called: resp=%v", resp) + } +} + +func TestAdminTokenAuthEmptyTokenDisabled(t *testing.T) { + t.Parallel() + unary, stream := AdminTokenAuth("") + if unary != nil || stream != nil { + t.Fatal("empty token should disable interceptors") + } +} diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go new file mode 100644 index 00000000..4c9a2c1c --- /dev/null +++ b/cmd/elastickv-admin/main.go @@ -0,0 +1,257 @@ +// Command elastickv-admin serves the Elastickv admin Web UI described in +// docs/admin_ui_key_visualizer_design.md. Phase 0: token-protected passthrough +// of Admin.GetClusterOverview at /api/cluster/overview, no SPA yet. +package main + +import ( + "context" + "encoding/json" + "errors" + "flag" + "fmt" + "log" + "net/http" + "os" + "os/signal" + "path/filepath" + "strings" + "sync" + "syscall" + "time" + + pb "github.com/bootjp/elastickv/proto" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/metadata" +) + +const ( + defaultBindAddr = "127.0.0.1:8080" + defaultNodesRefreshInterval = 15 * time.Second + defaultGRPCRequestTimeout = 10 * time.Second + readHeaderTimeout = 5 * time.Second + shutdownTimeout = 5 * time.Second +) + +var ( + bindAddr = flag.String("bindAddr", defaultBindAddr, "HTTP bind address for the admin UI") + nodes = flag.String("nodes", "", "Comma-separated list of elastickv node gRPC addresses") + nodeTokenFile = flag.String("nodeTokenFile", "", "File containing the bearer token sent to nodes' Admin service") + nodesRefreshInterval = flag.Duration("nodesRefreshInterval", defaultNodesRefreshInterval, "Duration to cache cluster membership before re-fetching") + insecureNoAuth = flag.Bool("adminInsecureNoAuth", false, "Skip bearer token authentication; development only") +) + +func main() { + flag.Parse() + 
if err := run(); err != nil { + log.Fatal(err) + } +} + +func run() error { + seeds := splitNodes(*nodes) + if len(seeds) == 0 { + return errors.New("--nodes is required (comma-separated gRPC addresses)") + } + + token, err := loadToken(*nodeTokenFile, *insecureNoAuth) + if err != nil { + return err + } + + fan := newFanout(seeds, token, *nodesRefreshInterval) + defer fan.Close() + + mux := http.NewServeMux() + mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + }) + mux.HandleFunc("/api/cluster/overview", fan.handleOverview) + mux.HandleFunc("/api/", func(w http.ResponseWriter, _ *http.Request) { + writeJSONError(w, http.StatusServiceUnavailable, "endpoint not implemented in phase 0") + }) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + _, _ = w.Write([]byte("elastickv-admin: phase 0 — SPA not yet embedded\n")) + }) + + srv := &http.Server{ + Addr: *bindAddr, + Handler: mux, + ReadHeaderTimeout: readHeaderTimeout, + } + + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + errCh := make(chan error, 1) + go func() { + log.Printf("elastickv-admin listening on %s (seeds=%v)", *bindAddr, seeds) + if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + errCh <- err + return + } + errCh <- nil + }() + + select { + case <-ctx.Done(): + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout) + defer shutdownCancel() + if err := srv.Shutdown(shutdownCtx); err != nil { + return fmt.Errorf("shutdown: %w", err) + } + return nil + case err := <-errCh: + return err + } +} + +func splitNodes(raw string) []string { + parts := strings.Split(raw, ",") + out := make([]string, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + 
out = append(out, p) + } + } + return out +} + +func loadToken(path string, insecureMode bool) (string, error) { + if path == "" { + if insecureMode { + return "", nil + } + return "", errors.New("--nodeTokenFile is required; pass --adminInsecureNoAuth for insecure dev mode") + } + if insecureMode { + return "", errors.New("--adminInsecureNoAuth and --nodeTokenFile are mutually exclusive") + } + abs, err := filepath.Abs(path) + if err != nil { + return "", fmt.Errorf("resolve token path: %w", err) + } + b, err := os.ReadFile(abs) + if err != nil { + return "", fmt.Errorf("read token file: %w", err) + } + token := strings.TrimSpace(string(b)) + if token == "" { + return "", errors.New("token file is empty") + } + return token, nil +} + +type nodeClient struct { + addr string + conn *grpc.ClientConn + client pb.AdminClient +} + +type fanout struct { + seeds []string + token string + refreshInterval time.Duration + + mu sync.Mutex + clients map[string]*nodeClient +} + +func newFanout(seeds []string, token string, refreshInterval time.Duration) *fanout { + if refreshInterval <= 0 { + refreshInterval = defaultNodesRefreshInterval + } + return &fanout{ + seeds: seeds, + token: token, + refreshInterval: refreshInterval, + clients: make(map[string]*nodeClient), + } +} + +func (f *fanout) Close() { + f.mu.Lock() + defer f.mu.Unlock() + for _, c := range f.clients { + _ = c.conn.Close() + } + f.clients = nil +} + +func (f *fanout) clientFor(addr string) (*nodeClient, error) { + f.mu.Lock() + defer f.mu.Unlock() + if c, ok := f.clients[addr]; ok { + return c, nil + } + conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + return nil, fmt.Errorf("dial %s: %w", addr, err) + } + c := &nodeClient{addr: addr, conn: conn, client: pb.NewAdminClient(conn)} + f.clients[addr] = c + return c, nil +} + +func (f *fanout) outgoingCtx(parent context.Context) context.Context { + if f.token == "" { + return parent + } + return 
metadata.AppendToOutgoingContext(parent, "authorization", "Bearer "+f.token) +} + +func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { + ctx, cancel := context.WithTimeout(r.Context(), defaultGRPCRequestTimeout) + defer cancel() + + type perNode struct { + Node string `json:"node"` + OK bool `json:"ok"` + Error string `json:"error,omitempty"` + Data *pb.GetClusterOverviewResponse `json:"data,omitempty"` + } + + results := make([]perNode, len(f.seeds)) + var wg sync.WaitGroup + for i, addr := range f.seeds { + wg.Add(1) + go func(i int, addr string) { + defer wg.Done() + entry := perNode{Node: addr} + cli, err := f.clientFor(addr) + if err != nil { + entry.Error = err.Error() + results[i] = entry + return + } + resp, err := cli.client.GetClusterOverview(f.outgoingCtx(ctx), &pb.GetClusterOverviewRequest{}) + if err != nil { + entry.Error = err.Error() + results[i] = entry + return + } + entry.OK = true + entry.Data = resp + results[i] = entry + }(i, addr) + } + wg.Wait() + + writeJSON(w, http.StatusOK, map[string]any{"nodes": results}) +} + +func writeJSON(w http.ResponseWriter, code int, body any) { + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.WriteHeader(code) + _ = json.NewEncoder(w).Encode(body) +} + +func writeJSONError(w http.ResponseWriter, code int, msg string) { + writeJSON(w, code, map[string]any{"code": code, "message": msg}) +} diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go new file mode 100644 index 00000000..e33d167c --- /dev/null +++ b/cmd/elastickv-admin/main_test.go @@ -0,0 +1,77 @@ +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestSplitNodesTrimsAndDrops(t *testing.T) { + t.Parallel() + got := splitNodes(" host-a:50051 ,,host-b:50051 ,") + want := []string{"host-a:50051", "host-b:50051"} + if len(got) != len(want) { + t.Fatalf("len = %d, want %d (%v)", len(got), len(want), got) + } + for i, w := range want { + if got[i] != w { + 
t.Fatalf("[%d] = %q, want %q", i, got[i], w) + } + } +} + +func TestLoadTokenRequiresFileOrInsecure(t *testing.T) { + t.Parallel() + if _, err := loadToken("", false); err == nil { + t.Fatal("expected error when neither token nor insecure mode supplied") + } + tok, err := loadToken("", true) + if err != nil { + t.Fatalf("insecure-mode empty path should succeed: %v", err) + } + if tok != "" { + t.Fatalf("insecure-mode token = %q, want empty", tok) + } +} + +func TestLoadTokenReadsAndTrims(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "token") + if err := os.WriteFile(path, []byte("\n s3cret \n"), 0o600); err != nil { + t.Fatal(err) + } + tok, err := loadToken(path, false) + if err != nil { + t.Fatalf("loadToken: %v", err) + } + if tok != "s3cret" { + t.Fatalf("tok = %q, want s3cret", tok) + } +} + +func TestLoadTokenRejectsEmptyFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "empty") + if err := os.WriteFile(path, []byte(" \n"), 0o600); err != nil { + t.Fatal(err) + } + _, err := loadToken(path, false) + if err == nil || !strings.Contains(err.Error(), "empty") { + t.Fatalf("expected empty-file error, got %v", err) + } +} + +func TestLoadTokenRejectsInsecureWithFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "tok") + if err := os.WriteFile(path, []byte("x"), 0o600); err != nil { + t.Fatal(err) + } + if _, err := loadToken(path, true); err == nil { + t.Fatal("expected mutual-exclusion error when both supplied") + } +} diff --git a/proto/Makefile b/proto/Makefile index c329a70b..8f811e88 100644 --- a/proto/Makefile +++ b/proto/Makefile @@ -30,6 +30,9 @@ gen: check-tools protoc --go_out=. --go_opt=paths=source_relative \ --go-grpc_out=. --go-grpc_opt=paths=source_relative \ distribution.proto + protoc --go_out=. --go_opt=paths=source_relative \ + --go-grpc_out=. --go-grpc_opt=paths=source_relative \ + admin.proto protoc --go_out=. 
--go_opt=paths=source_relative \ dynamodb_internal.proto protoc --go_out=. --go_opt=paths=source_relative \ diff --git a/proto/admin.pb.go b/proto/admin.pb.go new file mode 100644 index 00000000..5845642e --- /dev/null +++ b/proto/admin.pb.go @@ -0,0 +1,1529 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.11 +// protoc v7.34.0 +// source: admin.proto + +package proto + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type KeyVizSeries int32 + +const ( + KeyVizSeries_KEYVIZ_SERIES_UNSPECIFIED KeyVizSeries = 0 + KeyVizSeries_KEYVIZ_SERIES_READS KeyVizSeries = 1 + KeyVizSeries_KEYVIZ_SERIES_WRITES KeyVizSeries = 2 + KeyVizSeries_KEYVIZ_SERIES_READ_BYTES KeyVizSeries = 3 + KeyVizSeries_KEYVIZ_SERIES_WRITE_BYTES KeyVizSeries = 4 +) + +// Enum value maps for KeyVizSeries. 
+var ( + KeyVizSeries_name = map[int32]string{ + 0: "KEYVIZ_SERIES_UNSPECIFIED", + 1: "KEYVIZ_SERIES_READS", + 2: "KEYVIZ_SERIES_WRITES", + 3: "KEYVIZ_SERIES_READ_BYTES", + 4: "KEYVIZ_SERIES_WRITE_BYTES", + } + KeyVizSeries_value = map[string]int32{ + "KEYVIZ_SERIES_UNSPECIFIED": 0, + "KEYVIZ_SERIES_READS": 1, + "KEYVIZ_SERIES_WRITES": 2, + "KEYVIZ_SERIES_READ_BYTES": 3, + "KEYVIZ_SERIES_WRITE_BYTES": 4, + } +) + +func (x KeyVizSeries) Enum() *KeyVizSeries { + p := new(KeyVizSeries) + *p = x + return p +} + +func (x KeyVizSeries) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (KeyVizSeries) Descriptor() protoreflect.EnumDescriptor { + return file_admin_proto_enumTypes[0].Descriptor() +} + +func (KeyVizSeries) Type() protoreflect.EnumType { + return &file_admin_proto_enumTypes[0] +} + +func (x KeyVizSeries) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use KeyVizSeries.Descriptor instead. +func (KeyVizSeries) EnumDescriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{0} +} + +type SampleRole int32 + +const ( + SampleRole_SAMPLE_ROLE_UNSPECIFIED SampleRole = 0 + SampleRole_SAMPLE_ROLE_LEADER_WRITE SampleRole = 1 + SampleRole_SAMPLE_ROLE_LEADER_READ SampleRole = 2 + SampleRole_SAMPLE_ROLE_FOLLOWER_READ SampleRole = 3 +) + +// Enum value maps for SampleRole. 
+var ( + SampleRole_name = map[int32]string{ + 0: "SAMPLE_ROLE_UNSPECIFIED", + 1: "SAMPLE_ROLE_LEADER_WRITE", + 2: "SAMPLE_ROLE_LEADER_READ", + 3: "SAMPLE_ROLE_FOLLOWER_READ", + } + SampleRole_value = map[string]int32{ + "SAMPLE_ROLE_UNSPECIFIED": 0, + "SAMPLE_ROLE_LEADER_WRITE": 1, + "SAMPLE_ROLE_LEADER_READ": 2, + "SAMPLE_ROLE_FOLLOWER_READ": 3, + } +) + +func (x SampleRole) Enum() *SampleRole { + p := new(SampleRole) + *p = x + return p +} + +func (x SampleRole) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (SampleRole) Descriptor() protoreflect.EnumDescriptor { + return file_admin_proto_enumTypes[1].Descriptor() +} + +func (SampleRole) Type() protoreflect.EnumType { + return &file_admin_proto_enumTypes[1] +} + +func (x SampleRole) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use SampleRole.Descriptor instead. +func (SampleRole) EnumDescriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{1} +} + +type NodeIdentity struct { + state protoimpl.MessageState `protogen:"open.v1"` + NodeId string `protobuf:"bytes,1,opt,name=node_id,json=nodeId,proto3" json:"node_id,omitempty"` + GrpcAddress string `protobuf:"bytes,2,opt,name=grpc_address,json=grpcAddress,proto3" json:"grpc_address,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *NodeIdentity) Reset() { + *x = NodeIdentity{} + mi := &file_admin_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *NodeIdentity) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*NodeIdentity) ProtoMessage() {} + +func (x *NodeIdentity) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return 
mi.MessageOf(x) +} + +// Deprecated: Use NodeIdentity.ProtoReflect.Descriptor instead. +func (*NodeIdentity) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{0} +} + +func (x *NodeIdentity) GetNodeId() string { + if x != nil { + return x.NodeId + } + return "" +} + +func (x *NodeIdentity) GetGrpcAddress() string { + if x != nil { + return x.GrpcAddress + } + return "" +} + +type GroupLeader struct { + state protoimpl.MessageState `protogen:"open.v1"` + RaftGroupId uint64 `protobuf:"varint,1,opt,name=raft_group_id,json=raftGroupId,proto3" json:"raft_group_id,omitempty"` + LeaderNodeId string `protobuf:"bytes,2,opt,name=leader_node_id,json=leaderNodeId,proto3" json:"leader_node_id,omitempty"` + LeaderTerm uint64 `protobuf:"varint,3,opt,name=leader_term,json=leaderTerm,proto3" json:"leader_term,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GroupLeader) Reset() { + *x = GroupLeader{} + mi := &file_admin_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GroupLeader) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GroupLeader) ProtoMessage() {} + +func (x *GroupLeader) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GroupLeader.ProtoReflect.Descriptor instead. 
+func (*GroupLeader) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{1} +} + +func (x *GroupLeader) GetRaftGroupId() uint64 { + if x != nil { + return x.RaftGroupId + } + return 0 +} + +func (x *GroupLeader) GetLeaderNodeId() string { + if x != nil { + return x.LeaderNodeId + } + return "" +} + +func (x *GroupLeader) GetLeaderTerm() uint64 { + if x != nil { + return x.LeaderTerm + } + return 0 +} + +type GetClusterOverviewRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetClusterOverviewRequest) Reset() { + *x = GetClusterOverviewRequest{} + mi := &file_admin_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetClusterOverviewRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetClusterOverviewRequest) ProtoMessage() {} + +func (x *GetClusterOverviewRequest) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetClusterOverviewRequest.ProtoReflect.Descriptor instead. 
+func (*GetClusterOverviewRequest) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{2} +} + +type GetClusterOverviewResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Self *NodeIdentity `protobuf:"bytes,1,opt,name=self,proto3" json:"self,omitempty"` + Members []*NodeIdentity `protobuf:"bytes,2,rep,name=members,proto3" json:"members,omitempty"` + GroupLeaders []*GroupLeader `protobuf:"bytes,3,rep,name=group_leaders,json=groupLeaders,proto3" json:"group_leaders,omitempty"` + AggregateQps uint64 `protobuf:"varint,4,opt,name=aggregate_qps,json=aggregateQps,proto3" json:"aggregate_qps,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetClusterOverviewResponse) Reset() { + *x = GetClusterOverviewResponse{} + mi := &file_admin_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetClusterOverviewResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetClusterOverviewResponse) ProtoMessage() {} + +func (x *GetClusterOverviewResponse) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetClusterOverviewResponse.ProtoReflect.Descriptor instead. 
+func (*GetClusterOverviewResponse) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{3} +} + +func (x *GetClusterOverviewResponse) GetSelf() *NodeIdentity { + if x != nil { + return x.Self + } + return nil +} + +func (x *GetClusterOverviewResponse) GetMembers() []*NodeIdentity { + if x != nil { + return x.Members + } + return nil +} + +func (x *GetClusterOverviewResponse) GetGroupLeaders() []*GroupLeader { + if x != nil { + return x.GroupLeaders + } + return nil +} + +func (x *GetClusterOverviewResponse) GetAggregateQps() uint64 { + if x != nil { + return x.AggregateQps + } + return 0 +} + +type RaftGroupState struct { + state protoimpl.MessageState `protogen:"open.v1"` + RaftGroupId uint64 `protobuf:"varint,1,opt,name=raft_group_id,json=raftGroupId,proto3" json:"raft_group_id,omitempty"` + LeaderNodeId string `protobuf:"bytes,2,opt,name=leader_node_id,json=leaderNodeId,proto3" json:"leader_node_id,omitempty"` + LeaderTerm uint64 `protobuf:"varint,3,opt,name=leader_term,json=leaderTerm,proto3" json:"leader_term,omitempty"` + CommitIndex uint64 `protobuf:"varint,4,opt,name=commit_index,json=commitIndex,proto3" json:"commit_index,omitempty"` + AppliedIndex uint64 `protobuf:"varint,5,opt,name=applied_index,json=appliedIndex,proto3" json:"applied_index,omitempty"` + LastContactUnixMs int64 `protobuf:"varint,6,opt,name=last_contact_unix_ms,json=lastContactUnixMs,proto3" json:"last_contact_unix_ms,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *RaftGroupState) Reset() { + *x = RaftGroupState{} + mi := &file_admin_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *RaftGroupState) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RaftGroupState) ProtoMessage() {} + +func (x *RaftGroupState) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[4] + if x != nil { + ms := 
protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RaftGroupState.ProtoReflect.Descriptor instead. +func (*RaftGroupState) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{4} +} + +func (x *RaftGroupState) GetRaftGroupId() uint64 { + if x != nil { + return x.RaftGroupId + } + return 0 +} + +func (x *RaftGroupState) GetLeaderNodeId() string { + if x != nil { + return x.LeaderNodeId + } + return "" +} + +func (x *RaftGroupState) GetLeaderTerm() uint64 { + if x != nil { + return x.LeaderTerm + } + return 0 +} + +func (x *RaftGroupState) GetCommitIndex() uint64 { + if x != nil { + return x.CommitIndex + } + return 0 +} + +func (x *RaftGroupState) GetAppliedIndex() uint64 { + if x != nil { + return x.AppliedIndex + } + return 0 +} + +func (x *RaftGroupState) GetLastContactUnixMs() int64 { + if x != nil { + return x.LastContactUnixMs + } + return 0 +} + +type GetRaftGroupsRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetRaftGroupsRequest) Reset() { + *x = GetRaftGroupsRequest{} + mi := &file_admin_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetRaftGroupsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetRaftGroupsRequest) ProtoMessage() {} + +func (x *GetRaftGroupsRequest) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[5] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetRaftGroupsRequest.ProtoReflect.Descriptor instead. 
+func (*GetRaftGroupsRequest) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{5} +} + +type GetRaftGroupsResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Groups []*RaftGroupState `protobuf:"bytes,1,rep,name=groups,proto3" json:"groups,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetRaftGroupsResponse) Reset() { + *x = GetRaftGroupsResponse{} + mi := &file_admin_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetRaftGroupsResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetRaftGroupsResponse) ProtoMessage() {} + +func (x *GetRaftGroupsResponse) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[6] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetRaftGroupsResponse.ProtoReflect.Descriptor instead. 
+func (*GetRaftGroupsResponse) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{6} +} + +func (x *GetRaftGroupsResponse) GetGroups() []*RaftGroupState { + if x != nil { + return x.Groups + } + return nil +} + +type AdapterSummary struct { + state protoimpl.MessageState `protogen:"open.v1"` + Adapter string `protobuf:"bytes,1,opt,name=adapter,proto3" json:"adapter,omitempty"` + Operation string `protobuf:"bytes,2,opt,name=operation,proto3" json:"operation,omitempty"` + Requests uint64 `protobuf:"varint,3,opt,name=requests,proto3" json:"requests,omitempty"` + InFlight uint64 `protobuf:"varint,4,opt,name=in_flight,json=inFlight,proto3" json:"in_flight,omitempty"` + BytesIn uint64 `protobuf:"varint,5,opt,name=bytes_in,json=bytesIn,proto3" json:"bytes_in,omitempty"` + BytesOut uint64 `protobuf:"varint,6,opt,name=bytes_out,json=bytesOut,proto3" json:"bytes_out,omitempty"` + P50Ns float64 `protobuf:"fixed64,7,opt,name=p50_ns,json=p50Ns,proto3" json:"p50_ns,omitempty"` + P95Ns float64 `protobuf:"fixed64,8,opt,name=p95_ns,json=p95Ns,proto3" json:"p95_ns,omitempty"` + P99Ns float64 `protobuf:"fixed64,9,opt,name=p99_ns,json=p99Ns,proto3" json:"p99_ns,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *AdapterSummary) Reset() { + *x = AdapterSummary{} + mi := &file_admin_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *AdapterSummary) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*AdapterSummary) ProtoMessage() {} + +func (x *AdapterSummary) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[7] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use AdapterSummary.ProtoReflect.Descriptor instead. 
+func (*AdapterSummary) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{7} +} + +func (x *AdapterSummary) GetAdapter() string { + if x != nil { + return x.Adapter + } + return "" +} + +func (x *AdapterSummary) GetOperation() string { + if x != nil { + return x.Operation + } + return "" +} + +func (x *AdapterSummary) GetRequests() uint64 { + if x != nil { + return x.Requests + } + return 0 +} + +func (x *AdapterSummary) GetInFlight() uint64 { + if x != nil { + return x.InFlight + } + return 0 +} + +func (x *AdapterSummary) GetBytesIn() uint64 { + if x != nil { + return x.BytesIn + } + return 0 +} + +func (x *AdapterSummary) GetBytesOut() uint64 { + if x != nil { + return x.BytesOut + } + return 0 +} + +func (x *AdapterSummary) GetP50Ns() float64 { + if x != nil { + return x.P50Ns + } + return 0 +} + +func (x *AdapterSummary) GetP95Ns() float64 { + if x != nil { + return x.P95Ns + } + return 0 +} + +func (x *AdapterSummary) GetP99Ns() float64 { + if x != nil { + return x.P99Ns + } + return 0 +} + +type GetAdapterSummaryRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetAdapterSummaryRequest) Reset() { + *x = GetAdapterSummaryRequest{} + mi := &file_admin_proto_msgTypes[8] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetAdapterSummaryRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetAdapterSummaryRequest) ProtoMessage() {} + +func (x *GetAdapterSummaryRequest) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[8] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetAdapterSummaryRequest.ProtoReflect.Descriptor instead. 
+func (*GetAdapterSummaryRequest) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{8} +} + +type GetAdapterSummaryResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Summaries []*AdapterSummary `protobuf:"bytes,1,rep,name=summaries,proto3" json:"summaries,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetAdapterSummaryResponse) Reset() { + *x = GetAdapterSummaryResponse{} + mi := &file_admin_proto_msgTypes[9] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetAdapterSummaryResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetAdapterSummaryResponse) ProtoMessage() {} + +func (x *GetAdapterSummaryResponse) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[9] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetAdapterSummaryResponse.ProtoReflect.Descriptor instead. +func (*GetAdapterSummaryResponse) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{9} +} + +func (x *GetAdapterSummaryResponse) GetSummaries() []*AdapterSummary { + if x != nil { + return x.Summaries + } + return nil +} + +type KeyVizRow struct { + state protoimpl.MessageState `protogen:"open.v1"` + // bucket_id is either "route:" or "virtual:". 
+ BucketId string `protobuf:"bytes,1,opt,name=bucket_id,json=bucketId,proto3" json:"bucket_id,omitempty"` + Start []byte `protobuf:"bytes,2,opt,name=start,proto3" json:"start,omitempty"` + End []byte `protobuf:"bytes,3,opt,name=end,proto3" json:"end,omitempty"` + Label string `protobuf:"bytes,4,opt,name=label,proto3" json:"label,omitempty"` + Aggregate bool `protobuf:"varint,5,opt,name=aggregate,proto3" json:"aggregate,omitempty"` + RouteIds []uint64 `protobuf:"varint,6,rep,packed,name=route_ids,json=routeIds,proto3" json:"route_ids,omitempty"` + RouteIdsTruncated bool `protobuf:"varint,7,opt,name=route_ids_truncated,json=routeIdsTruncated,proto3" json:"route_ids_truncated,omitempty"` + RouteCount uint64 `protobuf:"varint,8,opt,name=route_count,json=routeCount,proto3" json:"route_count,omitempty"` + SampleRoles []SampleRole `protobuf:"varint,9,rep,packed,name=sample_roles,json=sampleRoles,proto3,enum=SampleRole" json:"sample_roles,omitempty"` + LineageId string `protobuf:"bytes,10,opt,name=lineage_id,json=lineageId,proto3" json:"lineage_id,omitempty"` + // values[j] is the series value at time column j. + Values []uint64 `protobuf:"varint,11,rep,packed,name=values,proto3" json:"values,omitempty"` + // soft_columns[j] is true when the j-th column missed the estimator SLO. 
+ SoftColumns []bool `protobuf:"varint,12,rep,packed,name=soft_columns,json=softColumns,proto3" json:"soft_columns,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *KeyVizRow) Reset() { + *x = KeyVizRow{} + mi := &file_admin_proto_msgTypes[10] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *KeyVizRow) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*KeyVizRow) ProtoMessage() {} + +func (x *KeyVizRow) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[10] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use KeyVizRow.ProtoReflect.Descriptor instead. +func (*KeyVizRow) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{10} +} + +func (x *KeyVizRow) GetBucketId() string { + if x != nil { + return x.BucketId + } + return "" +} + +func (x *KeyVizRow) GetStart() []byte { + if x != nil { + return x.Start + } + return nil +} + +func (x *KeyVizRow) GetEnd() []byte { + if x != nil { + return x.End + } + return nil +} + +func (x *KeyVizRow) GetLabel() string { + if x != nil { + return x.Label + } + return "" +} + +func (x *KeyVizRow) GetAggregate() bool { + if x != nil { + return x.Aggregate + } + return false +} + +func (x *KeyVizRow) GetRouteIds() []uint64 { + if x != nil { + return x.RouteIds + } + return nil +} + +func (x *KeyVizRow) GetRouteIdsTruncated() bool { + if x != nil { + return x.RouteIdsTruncated + } + return false +} + +func (x *KeyVizRow) GetRouteCount() uint64 { + if x != nil { + return x.RouteCount + } + return 0 +} + +func (x *KeyVizRow) GetSampleRoles() []SampleRole { + if x != nil { + return x.SampleRoles + } + return nil +} + +func (x *KeyVizRow) GetLineageId() string { + if x != nil { + return x.LineageId + } + return "" +} + +func (x 
*KeyVizRow) GetValues() []uint64 { + if x != nil { + return x.Values + } + return nil +} + +func (x *KeyVizRow) GetSoftColumns() []bool { + if x != nil { + return x.SoftColumns + } + return nil +} + +type GetKeyVizMatrixRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + Series KeyVizSeries `protobuf:"varint,1,opt,name=series,proto3,enum=KeyVizSeries" json:"series,omitempty"` + FromUnixMs int64 `protobuf:"varint,2,opt,name=from_unix_ms,json=fromUnixMs,proto3" json:"from_unix_ms,omitempty"` + ToUnixMs int64 `protobuf:"varint,3,opt,name=to_unix_ms,json=toUnixMs,proto3" json:"to_unix_ms,omitempty"` + Rows uint32 `protobuf:"varint,4,opt,name=rows,proto3" json:"rows,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetKeyVizMatrixRequest) Reset() { + *x = GetKeyVizMatrixRequest{} + mi := &file_admin_proto_msgTypes[11] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetKeyVizMatrixRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetKeyVizMatrixRequest) ProtoMessage() {} + +func (x *GetKeyVizMatrixRequest) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[11] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetKeyVizMatrixRequest.ProtoReflect.Descriptor instead. 
+func (*GetKeyVizMatrixRequest) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{11} +} + +func (x *GetKeyVizMatrixRequest) GetSeries() KeyVizSeries { + if x != nil { + return x.Series + } + return KeyVizSeries_KEYVIZ_SERIES_UNSPECIFIED +} + +func (x *GetKeyVizMatrixRequest) GetFromUnixMs() int64 { + if x != nil { + return x.FromUnixMs + } + return 0 +} + +func (x *GetKeyVizMatrixRequest) GetToUnixMs() int64 { + if x != nil { + return x.ToUnixMs + } + return 0 +} + +func (x *GetKeyVizMatrixRequest) GetRows() uint32 { + if x != nil { + return x.Rows + } + return 0 +} + +type GetKeyVizMatrixResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + ColumnUnixMs []int64 `protobuf:"varint,1,rep,packed,name=column_unix_ms,json=columnUnixMs,proto3" json:"column_unix_ms,omitempty"` + Rows []*KeyVizRow `protobuf:"bytes,2,rep,name=rows,proto3" json:"rows,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetKeyVizMatrixResponse) Reset() { + *x = GetKeyVizMatrixResponse{} + mi := &file_admin_proto_msgTypes[12] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetKeyVizMatrixResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetKeyVizMatrixResponse) ProtoMessage() {} + +func (x *GetKeyVizMatrixResponse) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[12] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetKeyVizMatrixResponse.ProtoReflect.Descriptor instead. 
+func (*GetKeyVizMatrixResponse) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{12} +} + +func (x *GetKeyVizMatrixResponse) GetColumnUnixMs() []int64 { + if x != nil { + return x.ColumnUnixMs + } + return nil +} + +func (x *GetKeyVizMatrixResponse) GetRows() []*KeyVizRow { + if x != nil { + return x.Rows + } + return nil +} + +type GetRouteDetailRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Either a concrete route: or a virtual: emitted in a previous + // GetKeyVizMatrix response. + BucketId string `protobuf:"bytes,1,opt,name=bucket_id,json=bucketId,proto3" json:"bucket_id,omitempty"` + FromUnixMs int64 `protobuf:"varint,2,opt,name=from_unix_ms,json=fromUnixMs,proto3" json:"from_unix_ms,omitempty"` + ToUnixMs int64 `protobuf:"varint,3,opt,name=to_unix_ms,json=toUnixMs,proto3" json:"to_unix_ms,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetRouteDetailRequest) Reset() { + *x = GetRouteDetailRequest{} + mi := &file_admin_proto_msgTypes[13] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetRouteDetailRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetRouteDetailRequest) ProtoMessage() {} + +func (x *GetRouteDetailRequest) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[13] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetRouteDetailRequest.ProtoReflect.Descriptor instead. 
+func (*GetRouteDetailRequest) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{13} +} + +func (x *GetRouteDetailRequest) GetBucketId() string { + if x != nil { + return x.BucketId + } + return "" +} + +func (x *GetRouteDetailRequest) GetFromUnixMs() int64 { + if x != nil { + return x.FromUnixMs + } + return 0 +} + +func (x *GetRouteDetailRequest) GetToUnixMs() int64 { + if x != nil { + return x.ToUnixMs + } + return 0 +} + +type GetRouteDetailResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + Row *KeyVizRow `protobuf:"bytes,1,opt,name=row,proto3" json:"row,omitempty"` + PerAdapter []*AdapterSummary `protobuf:"bytes,2,rep,name=per_adapter,json=perAdapter,proto3" json:"per_adapter,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *GetRouteDetailResponse) Reset() { + *x = GetRouteDetailResponse{} + mi := &file_admin_proto_msgTypes[14] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *GetRouteDetailResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetRouteDetailResponse) ProtoMessage() {} + +func (x *GetRouteDetailResponse) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[14] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetRouteDetailResponse.ProtoReflect.Descriptor instead. 
+func (*GetRouteDetailResponse) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{14} +} + +func (x *GetRouteDetailResponse) GetRow() *KeyVizRow { + if x != nil { + return x.Row + } + return nil +} + +func (x *GetRouteDetailResponse) GetPerAdapter() []*AdapterSummary { + if x != nil { + return x.PerAdapter + } + return nil +} + +type StreamEventsRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *StreamEventsRequest) Reset() { + *x = StreamEventsRequest{} + mi := &file_admin_proto_msgTypes[15] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *StreamEventsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*StreamEventsRequest) ProtoMessage() {} + +func (x *StreamEventsRequest) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[15] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use StreamEventsRequest.ProtoReflect.Descriptor instead. 
+func (*StreamEventsRequest) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{15} +} + +type StreamEventsEvent struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Types that are valid to be assigned to Event: + // + // *StreamEventsEvent_RouteTransition + // *StreamEventsEvent_KeyvizColumn + Event isStreamEventsEvent_Event `protobuf_oneof:"event"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *StreamEventsEvent) Reset() { + *x = StreamEventsEvent{} + mi := &file_admin_proto_msgTypes[16] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *StreamEventsEvent) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*StreamEventsEvent) ProtoMessage() {} + +func (x *StreamEventsEvent) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[16] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use StreamEventsEvent.ProtoReflect.Descriptor instead. 
+func (*StreamEventsEvent) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{16} +} + +func (x *StreamEventsEvent) GetEvent() isStreamEventsEvent_Event { + if x != nil { + return x.Event + } + return nil +} + +func (x *StreamEventsEvent) GetRouteTransition() *RouteTransition { + if x != nil { + if x, ok := x.Event.(*StreamEventsEvent_RouteTransition); ok { + return x.RouteTransition + } + } + return nil +} + +func (x *StreamEventsEvent) GetKeyvizColumn() *KeyVizColumn { + if x != nil { + if x, ok := x.Event.(*StreamEventsEvent_KeyvizColumn); ok { + return x.KeyvizColumn + } + } + return nil +} + +type isStreamEventsEvent_Event interface { + isStreamEventsEvent_Event() +} + +type StreamEventsEvent_RouteTransition struct { + RouteTransition *RouteTransition `protobuf:"bytes,1,opt,name=route_transition,json=routeTransition,proto3,oneof"` +} + +type StreamEventsEvent_KeyvizColumn struct { + KeyvizColumn *KeyVizColumn `protobuf:"bytes,2,opt,name=keyviz_column,json=keyvizColumn,proto3,oneof"` +} + +func (*StreamEventsEvent_RouteTransition) isStreamEventsEvent_Event() {} + +func (*StreamEventsEvent_KeyvizColumn) isStreamEventsEvent_Event() {} + +type RouteTransition struct { + state protoimpl.MessageState `protogen:"open.v1"` + ParentRouteId uint64 `protobuf:"varint,1,opt,name=parent_route_id,json=parentRouteId,proto3" json:"parent_route_id,omitempty"` + ChildRouteIds []uint64 `protobuf:"varint,2,rep,packed,name=child_route_ids,json=childRouteIds,proto3" json:"child_route_ids,omitempty"` + LineageId string `protobuf:"bytes,3,opt,name=lineage_id,json=lineageId,proto3" json:"lineage_id,omitempty"` + UnixMs int64 `protobuf:"varint,4,opt,name=unix_ms,json=unixMs,proto3" json:"unix_ms,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *RouteTransition) Reset() { + *x = RouteTransition{} + mi := &file_admin_proto_msgTypes[17] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + 
ms.StoreMessageInfo(mi) +} + +func (x *RouteTransition) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RouteTransition) ProtoMessage() {} + +func (x *RouteTransition) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[17] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RouteTransition.ProtoReflect.Descriptor instead. +func (*RouteTransition) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{17} +} + +func (x *RouteTransition) GetParentRouteId() uint64 { + if x != nil { + return x.ParentRouteId + } + return 0 +} + +func (x *RouteTransition) GetChildRouteIds() []uint64 { + if x != nil { + return x.ChildRouteIds + } + return nil +} + +func (x *RouteTransition) GetLineageId() string { + if x != nil { + return x.LineageId + } + return "" +} + +func (x *RouteTransition) GetUnixMs() int64 { + if x != nil { + return x.UnixMs + } + return 0 +} + +type KeyVizColumn struct { + state protoimpl.MessageState `protogen:"open.v1"` + ColumnUnixMs int64 `protobuf:"varint,1,opt,name=column_unix_ms,json=columnUnixMs,proto3" json:"column_unix_ms,omitempty"` + Series KeyVizSeries `protobuf:"varint,2,opt,name=series,proto3,enum=KeyVizSeries" json:"series,omitempty"` + Rows []*KeyVizRow `protobuf:"bytes,3,rep,name=rows,proto3" json:"rows,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *KeyVizColumn) Reset() { + *x = KeyVizColumn{} + mi := &file_admin_proto_msgTypes[18] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *KeyVizColumn) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*KeyVizColumn) ProtoMessage() {} + +func (x *KeyVizColumn) ProtoReflect() protoreflect.Message { + mi := &file_admin_proto_msgTypes[18] + if x != nil { + ms := 
protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use KeyVizColumn.ProtoReflect.Descriptor instead. +func (*KeyVizColumn) Descriptor() ([]byte, []int) { + return file_admin_proto_rawDescGZIP(), []int{18} +} + +func (x *KeyVizColumn) GetColumnUnixMs() int64 { + if x != nil { + return x.ColumnUnixMs + } + return 0 +} + +func (x *KeyVizColumn) GetSeries() KeyVizSeries { + if x != nil { + return x.Series + } + return KeyVizSeries_KEYVIZ_SERIES_UNSPECIFIED +} + +func (x *KeyVizColumn) GetRows() []*KeyVizRow { + if x != nil { + return x.Rows + } + return nil +} + +var File_admin_proto protoreflect.FileDescriptor + +const file_admin_proto_rawDesc = "" + + "\n" + + "\vadmin.proto\"J\n" + + "\fNodeIdentity\x12\x17\n" + + "\anode_id\x18\x01 \x01(\tR\x06nodeId\x12!\n" + + "\fgrpc_address\x18\x02 \x01(\tR\vgrpcAddress\"x\n" + + "\vGroupLeader\x12\"\n" + + "\rraft_group_id\x18\x01 \x01(\x04R\vraftGroupId\x12$\n" + + "\x0eleader_node_id\x18\x02 \x01(\tR\fleaderNodeId\x12\x1f\n" + + "\vleader_term\x18\x03 \x01(\x04R\n" + + "leaderTerm\"\x1b\n" + + "\x19GetClusterOverviewRequest\"\xc0\x01\n" + + "\x1aGetClusterOverviewResponse\x12!\n" + + "\x04self\x18\x01 \x01(\v2\r.NodeIdentityR\x04self\x12'\n" + + "\amembers\x18\x02 \x03(\v2\r.NodeIdentityR\amembers\x121\n" + + "\rgroup_leaders\x18\x03 \x03(\v2\f.GroupLeaderR\fgroupLeaders\x12#\n" + + "\raggregate_qps\x18\x04 \x01(\x04R\faggregateQps\"\xf4\x01\n" + + "\x0eRaftGroupState\x12\"\n" + + "\rraft_group_id\x18\x01 \x01(\x04R\vraftGroupId\x12$\n" + + "\x0eleader_node_id\x18\x02 \x01(\tR\fleaderNodeId\x12\x1f\n" + + "\vleader_term\x18\x03 \x01(\x04R\n" + + "leaderTerm\x12!\n" + + "\fcommit_index\x18\x04 \x01(\x04R\vcommitIndex\x12#\n" + + "\rapplied_index\x18\x05 \x01(\x04R\fappliedIndex\x12/\n" + + "\x14last_contact_unix_ms\x18\x06 \x01(\x03R\x11lastContactUnixMs\"\x16\n" + + "\x14GetRaftGroupsRequest\"@\n" + + 
"\x15GetRaftGroupsResponse\x12'\n" + + "\x06groups\x18\x01 \x03(\v2\x0f.RaftGroupStateR\x06groups\"\xfe\x01\n" + + "\x0eAdapterSummary\x12\x18\n" + + "\aadapter\x18\x01 \x01(\tR\aadapter\x12\x1c\n" + + "\toperation\x18\x02 \x01(\tR\toperation\x12\x1a\n" + + "\brequests\x18\x03 \x01(\x04R\brequests\x12\x1b\n" + + "\tin_flight\x18\x04 \x01(\x04R\binFlight\x12\x19\n" + + "\bbytes_in\x18\x05 \x01(\x04R\abytesIn\x12\x1b\n" + + "\tbytes_out\x18\x06 \x01(\x04R\bbytesOut\x12\x15\n" + + "\x06p50_ns\x18\a \x01(\x01R\x05p50Ns\x12\x15\n" + + "\x06p95_ns\x18\b \x01(\x01R\x05p95Ns\x12\x15\n" + + "\x06p99_ns\x18\t \x01(\x01R\x05p99Ns\"\x1a\n" + + "\x18GetAdapterSummaryRequest\"J\n" + + "\x19GetAdapterSummaryResponse\x12-\n" + + "\tsummaries\x18\x01 \x03(\v2\x0f.AdapterSummaryR\tsummaries\"\xfc\x02\n" + + "\tKeyVizRow\x12\x1b\n" + + "\tbucket_id\x18\x01 \x01(\tR\bbucketId\x12\x14\n" + + "\x05start\x18\x02 \x01(\fR\x05start\x12\x10\n" + + "\x03end\x18\x03 \x01(\fR\x03end\x12\x14\n" + + "\x05label\x18\x04 \x01(\tR\x05label\x12\x1c\n" + + "\taggregate\x18\x05 \x01(\bR\taggregate\x12\x1b\n" + + "\troute_ids\x18\x06 \x03(\x04R\brouteIds\x12.\n" + + "\x13route_ids_truncated\x18\a \x01(\bR\x11routeIdsTruncated\x12\x1f\n" + + "\vroute_count\x18\b \x01(\x04R\n" + + "routeCount\x12.\n" + + "\fsample_roles\x18\t \x03(\x0e2\v.SampleRoleR\vsampleRoles\x12\x1d\n" + + "\n" + + "lineage_id\x18\n" + + " \x01(\tR\tlineageId\x12\x16\n" + + "\x06values\x18\v \x03(\x04R\x06values\x12!\n" + + "\fsoft_columns\x18\f \x03(\bR\vsoftColumns\"\x93\x01\n" + + "\x16GetKeyVizMatrixRequest\x12%\n" + + "\x06series\x18\x01 \x01(\x0e2\r.KeyVizSeriesR\x06series\x12 \n" + + "\ffrom_unix_ms\x18\x02 \x01(\x03R\n" + + "fromUnixMs\x12\x1c\n" + + "\n" + + "to_unix_ms\x18\x03 \x01(\x03R\btoUnixMs\x12\x12\n" + + "\x04rows\x18\x04 \x01(\rR\x04rows\"_\n" + + "\x17GetKeyVizMatrixResponse\x12$\n" + + "\x0ecolumn_unix_ms\x18\x01 \x03(\x03R\fcolumnUnixMs\x12\x1e\n" + + "\x04rows\x18\x02 \x03(\v2\n" + + ".KeyVizRowR\x04rows\"t\n" 
+ + "\x15GetRouteDetailRequest\x12\x1b\n" + + "\tbucket_id\x18\x01 \x01(\tR\bbucketId\x12 \n" + + "\ffrom_unix_ms\x18\x02 \x01(\x03R\n" + + "fromUnixMs\x12\x1c\n" + + "\n" + + "to_unix_ms\x18\x03 \x01(\x03R\btoUnixMs\"h\n" + + "\x16GetRouteDetailResponse\x12\x1c\n" + + "\x03row\x18\x01 \x01(\v2\n" + + ".KeyVizRowR\x03row\x120\n" + + "\vper_adapter\x18\x02 \x03(\v2\x0f.AdapterSummaryR\n" + + "perAdapter\"\x15\n" + + "\x13StreamEventsRequest\"\x91\x01\n" + + "\x11StreamEventsEvent\x12=\n" + + "\x10route_transition\x18\x01 \x01(\v2\x10.RouteTransitionH\x00R\x0frouteTransition\x124\n" + + "\rkeyviz_column\x18\x02 \x01(\v2\r.KeyVizColumnH\x00R\fkeyvizColumnB\a\n" + + "\x05event\"\x99\x01\n" + + "\x0fRouteTransition\x12&\n" + + "\x0fparent_route_id\x18\x01 \x01(\x04R\rparentRouteId\x12&\n" + + "\x0fchild_route_ids\x18\x02 \x03(\x04R\rchildRouteIds\x12\x1d\n" + + "\n" + + "lineage_id\x18\x03 \x01(\tR\tlineageId\x12\x17\n" + + "\aunix_ms\x18\x04 \x01(\x03R\x06unixMs\"{\n" + + "\fKeyVizColumn\x12$\n" + + "\x0ecolumn_unix_ms\x18\x01 \x01(\x03R\fcolumnUnixMs\x12%\n" + + "\x06series\x18\x02 \x01(\x0e2\r.KeyVizSeriesR\x06series\x12\x1e\n" + + "\x04rows\x18\x03 \x03(\v2\n" + + ".KeyVizRowR\x04rows*\x9d\x01\n" + + "\fKeyVizSeries\x12\x1d\n" + + "\x19KEYVIZ_SERIES_UNSPECIFIED\x10\x00\x12\x17\n" + + "\x13KEYVIZ_SERIES_READS\x10\x01\x12\x18\n" + + "\x14KEYVIZ_SERIES_WRITES\x10\x02\x12\x1c\n" + + "\x18KEYVIZ_SERIES_READ_BYTES\x10\x03\x12\x1d\n" + + "\x19KEYVIZ_SERIES_WRITE_BYTES\x10\x04*\x83\x01\n" + + "\n" + + "SampleRole\x12\x1b\n" + + "\x17SAMPLE_ROLE_UNSPECIFIED\x10\x00\x12\x1c\n" + + "\x18SAMPLE_ROLE_LEADER_WRITE\x10\x01\x12\x1b\n" + + "\x17SAMPLE_ROLE_LEADER_READ\x10\x02\x12\x1d\n" + + "\x19SAMPLE_ROLE_FOLLOWER_READ\x10\x032\xb3\x03\n" + + "\x05Admin\x12O\n" + + "\x12GetClusterOverview\x12\x1a.GetClusterOverviewRequest\x1a\x1b.GetClusterOverviewResponse\"\x00\x12@\n" + + "\rGetRaftGroups\x12\x15.GetRaftGroupsRequest\x1a\x16.GetRaftGroupsResponse\"\x00\x12L\n" + + 
"\x11GetAdapterSummary\x12\x19.GetAdapterSummaryRequest\x1a\x1a.GetAdapterSummaryResponse\"\x00\x12F\n" + + "\x0fGetKeyVizMatrix\x12\x17.GetKeyVizMatrixRequest\x1a\x18.GetKeyVizMatrixResponse\"\x00\x12C\n" + + "\x0eGetRouteDetail\x12\x16.GetRouteDetailRequest\x1a\x17.GetRouteDetailResponse\"\x00\x12<\n" + + "\fStreamEvents\x12\x14.StreamEventsRequest\x1a\x12.StreamEventsEvent\"\x000\x01B#Z!github.com/bootjp/elastickv/protob\x06proto3" + +var ( + file_admin_proto_rawDescOnce sync.Once + file_admin_proto_rawDescData []byte +) + +func file_admin_proto_rawDescGZIP() []byte { + file_admin_proto_rawDescOnce.Do(func() { + file_admin_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_admin_proto_rawDesc), len(file_admin_proto_rawDesc))) + }) + return file_admin_proto_rawDescData +} + +var file_admin_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_admin_proto_msgTypes = make([]protoimpl.MessageInfo, 19) +var file_admin_proto_goTypes = []any{ + (KeyVizSeries)(0), // 0: KeyVizSeries + (SampleRole)(0), // 1: SampleRole + (*NodeIdentity)(nil), // 2: NodeIdentity + (*GroupLeader)(nil), // 3: GroupLeader + (*GetClusterOverviewRequest)(nil), // 4: GetClusterOverviewRequest + (*GetClusterOverviewResponse)(nil), // 5: GetClusterOverviewResponse + (*RaftGroupState)(nil), // 6: RaftGroupState + (*GetRaftGroupsRequest)(nil), // 7: GetRaftGroupsRequest + (*GetRaftGroupsResponse)(nil), // 8: GetRaftGroupsResponse + (*AdapterSummary)(nil), // 9: AdapterSummary + (*GetAdapterSummaryRequest)(nil), // 10: GetAdapterSummaryRequest + (*GetAdapterSummaryResponse)(nil), // 11: GetAdapterSummaryResponse + (*KeyVizRow)(nil), // 12: KeyVizRow + (*GetKeyVizMatrixRequest)(nil), // 13: GetKeyVizMatrixRequest + (*GetKeyVizMatrixResponse)(nil), // 14: GetKeyVizMatrixResponse + (*GetRouteDetailRequest)(nil), // 15: GetRouteDetailRequest + (*GetRouteDetailResponse)(nil), // 16: GetRouteDetailResponse + (*StreamEventsRequest)(nil), // 17: StreamEventsRequest + 
(*StreamEventsEvent)(nil), // 18: StreamEventsEvent + (*RouteTransition)(nil), // 19: RouteTransition + (*KeyVizColumn)(nil), // 20: KeyVizColumn +} +var file_admin_proto_depIdxs = []int32{ + 2, // 0: GetClusterOverviewResponse.self:type_name -> NodeIdentity + 2, // 1: GetClusterOverviewResponse.members:type_name -> NodeIdentity + 3, // 2: GetClusterOverviewResponse.group_leaders:type_name -> GroupLeader + 6, // 3: GetRaftGroupsResponse.groups:type_name -> RaftGroupState + 9, // 4: GetAdapterSummaryResponse.summaries:type_name -> AdapterSummary + 1, // 5: KeyVizRow.sample_roles:type_name -> SampleRole + 0, // 6: GetKeyVizMatrixRequest.series:type_name -> KeyVizSeries + 12, // 7: GetKeyVizMatrixResponse.rows:type_name -> KeyVizRow + 12, // 8: GetRouteDetailResponse.row:type_name -> KeyVizRow + 9, // 9: GetRouteDetailResponse.per_adapter:type_name -> AdapterSummary + 19, // 10: StreamEventsEvent.route_transition:type_name -> RouteTransition + 20, // 11: StreamEventsEvent.keyviz_column:type_name -> KeyVizColumn + 0, // 12: KeyVizColumn.series:type_name -> KeyVizSeries + 12, // 13: KeyVizColumn.rows:type_name -> KeyVizRow + 4, // 14: Admin.GetClusterOverview:input_type -> GetClusterOverviewRequest + 7, // 15: Admin.GetRaftGroups:input_type -> GetRaftGroupsRequest + 10, // 16: Admin.GetAdapterSummary:input_type -> GetAdapterSummaryRequest + 13, // 17: Admin.GetKeyVizMatrix:input_type -> GetKeyVizMatrixRequest + 15, // 18: Admin.GetRouteDetail:input_type -> GetRouteDetailRequest + 17, // 19: Admin.StreamEvents:input_type -> StreamEventsRequest + 5, // 20: Admin.GetClusterOverview:output_type -> GetClusterOverviewResponse + 8, // 21: Admin.GetRaftGroups:output_type -> GetRaftGroupsResponse + 11, // 22: Admin.GetAdapterSummary:output_type -> GetAdapterSummaryResponse + 14, // 23: Admin.GetKeyVizMatrix:output_type -> GetKeyVizMatrixResponse + 16, // 24: Admin.GetRouteDetail:output_type -> GetRouteDetailResponse + 18, // 25: Admin.StreamEvents:output_type -> 
StreamEventsEvent + 20, // [20:26] is the sub-list for method output_type + 14, // [14:20] is the sub-list for method input_type + 14, // [14:14] is the sub-list for extension type_name + 14, // [14:14] is the sub-list for extension extendee + 0, // [0:14] is the sub-list for field type_name +} + +func init() { file_admin_proto_init() } +func file_admin_proto_init() { + if File_admin_proto != nil { + return + } + file_admin_proto_msgTypes[16].OneofWrappers = []any{ + (*StreamEventsEvent_RouteTransition)(nil), + (*StreamEventsEvent_KeyvizColumn)(nil), + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_admin_proto_rawDesc), len(file_admin_proto_rawDesc)), + NumEnums: 2, + NumMessages: 19, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_admin_proto_goTypes, + DependencyIndexes: file_admin_proto_depIdxs, + EnumInfos: file_admin_proto_enumTypes, + MessageInfos: file_admin_proto_msgTypes, + }.Build() + File_admin_proto = out.File + file_admin_proto_goTypes = nil + file_admin_proto_depIdxs = nil +} diff --git a/proto/admin.proto b/proto/admin.proto new file mode 100644 index 00000000..3b476166 --- /dev/null +++ b/proto/admin.proto @@ -0,0 +1,149 @@ +syntax = "proto3"; + +option go_package = "github.com/bootjp/elastickv/proto"; + +// Admin is the node-side read-only admin gRPC service consumed by +// cmd/elastickv-admin. Every method requires "authorization: Bearer " +// metadata unless the node was started with --adminInsecureNoAuth. +// See docs/admin_ui_key_visualizer_design.md §4 (Layer A). 
+service Admin { + rpc GetClusterOverview (GetClusterOverviewRequest) returns (GetClusterOverviewResponse) {} + rpc GetRaftGroups (GetRaftGroupsRequest) returns (GetRaftGroupsResponse) {} + rpc GetAdapterSummary (GetAdapterSummaryRequest) returns (GetAdapterSummaryResponse) {} + rpc GetKeyVizMatrix (GetKeyVizMatrixRequest) returns (GetKeyVizMatrixResponse) {} + rpc GetRouteDetail (GetRouteDetailRequest) returns (GetRouteDetailResponse) {} + rpc StreamEvents (StreamEventsRequest) returns (stream StreamEventsEvent) {} +} + +message NodeIdentity { + string node_id = 1; + string grpc_address = 2; +} + +message GroupLeader { + uint64 raft_group_id = 1; + string leader_node_id = 2; + uint64 leader_term = 3; +} + +message GetClusterOverviewRequest {} + +message GetClusterOverviewResponse { + NodeIdentity self = 1; + repeated NodeIdentity members = 2; + repeated GroupLeader group_leaders = 3; + uint64 aggregate_qps = 4; +} + +message RaftGroupState { + uint64 raft_group_id = 1; + string leader_node_id = 2; + uint64 leader_term = 3; + uint64 commit_index = 4; + uint64 applied_index = 5; + int64 last_contact_unix_ms = 6; +} + +message GetRaftGroupsRequest {} + +message GetRaftGroupsResponse { + repeated RaftGroupState groups = 1; +} + +message AdapterSummary { + string adapter = 1; + string operation = 2; + uint64 requests = 3; + uint64 in_flight = 4; + uint64 bytes_in = 5; + uint64 bytes_out = 6; + double p50_ns = 7; + double p95_ns = 8; + double p99_ns = 9; +} + +message GetAdapterSummaryRequest {} + +message GetAdapterSummaryResponse { + repeated AdapterSummary summaries = 1; +} + +enum KeyVizSeries { + KEYVIZ_SERIES_UNSPECIFIED = 0; + KEYVIZ_SERIES_READS = 1; + KEYVIZ_SERIES_WRITES = 2; + KEYVIZ_SERIES_READ_BYTES = 3; + KEYVIZ_SERIES_WRITE_BYTES = 4; +} + +enum SampleRole { + SAMPLE_ROLE_UNSPECIFIED = 0; + SAMPLE_ROLE_LEADER_WRITE = 1; + SAMPLE_ROLE_LEADER_READ = 2; + SAMPLE_ROLE_FOLLOWER_READ = 3; +} + +message KeyVizRow { + // bucket_id is either "route:" or 
"virtual:". + string bucket_id = 1; + bytes start = 2; + bytes end = 3; + string label = 4; + bool aggregate = 5; + repeated uint64 route_ids = 6; + bool route_ids_truncated = 7; + uint64 route_count = 8; + repeated SampleRole sample_roles = 9; + string lineage_id = 10; + // values[j] is the series value at time column j. + repeated uint64 values = 11; + // soft_columns[j] is true when the j-th column missed the estimator SLO. + repeated bool soft_columns = 12; +} + +message GetKeyVizMatrixRequest { + KeyVizSeries series = 1; + int64 from_unix_ms = 2; + int64 to_unix_ms = 3; + uint32 rows = 4; +} + +message GetKeyVizMatrixResponse { + repeated int64 column_unix_ms = 1; + repeated KeyVizRow rows = 2; +} + +message GetRouteDetailRequest { + // Either a concrete route: or a virtual: emitted in a previous + // GetKeyVizMatrix response. + string bucket_id = 1; + int64 from_unix_ms = 2; + int64 to_unix_ms = 3; +} + +message GetRouteDetailResponse { + KeyVizRow row = 1; + repeated AdapterSummary per_adapter = 2; +} + +message StreamEventsRequest {} + +message StreamEventsEvent { + oneof event { + RouteTransition route_transition = 1; + KeyVizColumn keyviz_column = 2; + } +} + +message RouteTransition { + uint64 parent_route_id = 1; + repeated uint64 child_route_ids = 2; + string lineage_id = 3; + int64 unix_ms = 4; +} + +message KeyVizColumn { + int64 column_unix_ms = 1; + KeyVizSeries series = 2; + repeated KeyVizRow rows = 3; +} diff --git a/proto/admin_grpc.pb.go b/proto/admin_grpc.pb.go new file mode 100644 index 00000000..379805d3 --- /dev/null +++ b/proto/admin_grpc.pb.go @@ -0,0 +1,325 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. 
+// versions:
+// - protoc-gen-go-grpc v1.6.1
+// - protoc v7.34.0
+// source: admin.proto
+
+package proto
+
+import (
+	context "context"
+	grpc "google.golang.org/grpc"
+	codes "google.golang.org/grpc/codes"
+	status "google.golang.org/grpc/status"
+)
+
+// This is a compile-time assertion to ensure that this generated file
+// is compatible with the grpc package it is being compiled against.
+// Requires gRPC-Go v1.64.0 or later.
+const _ = grpc.SupportPackageIsVersion9
+
+const (
+	Admin_GetClusterOverview_FullMethodName = "/Admin/GetClusterOverview"
+	Admin_GetRaftGroups_FullMethodName      = "/Admin/GetRaftGroups"
+	Admin_GetAdapterSummary_FullMethodName  = "/Admin/GetAdapterSummary"
+	Admin_GetKeyVizMatrix_FullMethodName    = "/Admin/GetKeyVizMatrix"
+	Admin_GetRouteDetail_FullMethodName     = "/Admin/GetRouteDetail"
+	Admin_StreamEvents_FullMethodName       = "/Admin/StreamEvents"
+)
+
+// AdminClient is the client API for Admin service.
+//
+// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
+//
+// Admin is the node-side read-only admin gRPC service consumed by
+// cmd/elastickv-admin. Every method requires "authorization: Bearer <token>"
+// metadata unless the node was started with --adminInsecureNoAuth.
+// See docs/admin_ui_key_visualizer_design.md §4 (Layer A).
+type AdminClient interface { + GetClusterOverview(ctx context.Context, in *GetClusterOverviewRequest, opts ...grpc.CallOption) (*GetClusterOverviewResponse, error) + GetRaftGroups(ctx context.Context, in *GetRaftGroupsRequest, opts ...grpc.CallOption) (*GetRaftGroupsResponse, error) + GetAdapterSummary(ctx context.Context, in *GetAdapterSummaryRequest, opts ...grpc.CallOption) (*GetAdapterSummaryResponse, error) + GetKeyVizMatrix(ctx context.Context, in *GetKeyVizMatrixRequest, opts ...grpc.CallOption) (*GetKeyVizMatrixResponse, error) + GetRouteDetail(ctx context.Context, in *GetRouteDetailRequest, opts ...grpc.CallOption) (*GetRouteDetailResponse, error) + StreamEvents(ctx context.Context, in *StreamEventsRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[StreamEventsEvent], error) +} + +type adminClient struct { + cc grpc.ClientConnInterface +} + +func NewAdminClient(cc grpc.ClientConnInterface) AdminClient { + return &adminClient{cc} +} + +func (c *adminClient) GetClusterOverview(ctx context.Context, in *GetClusterOverviewRequest, opts ...grpc.CallOption) (*GetClusterOverviewResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(GetClusterOverviewResponse) + err := c.cc.Invoke(ctx, Admin_GetClusterOverview_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *adminClient) GetRaftGroups(ctx context.Context, in *GetRaftGroupsRequest, opts ...grpc.CallOption) (*GetRaftGroupsResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(GetRaftGroupsResponse) + err := c.cc.Invoke(ctx, Admin_GetRaftGroups_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *adminClient) GetAdapterSummary(ctx context.Context, in *GetAdapterSummaryRequest, opts ...grpc.CallOption) (*GetAdapterSummaryResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) 
+ out := new(GetAdapterSummaryResponse) + err := c.cc.Invoke(ctx, Admin_GetAdapterSummary_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *adminClient) GetKeyVizMatrix(ctx context.Context, in *GetKeyVizMatrixRequest, opts ...grpc.CallOption) (*GetKeyVizMatrixResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(GetKeyVizMatrixResponse) + err := c.cc.Invoke(ctx, Admin_GetKeyVizMatrix_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *adminClient) GetRouteDetail(ctx context.Context, in *GetRouteDetailRequest, opts ...grpc.CallOption) (*GetRouteDetailResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(GetRouteDetailResponse) + err := c.cc.Invoke(ctx, Admin_GetRouteDetail_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *adminClient) StreamEvents(ctx context.Context, in *StreamEventsRequest, opts ...grpc.CallOption) (grpc.ServerStreamingClient[StreamEventsEvent], error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + stream, err := c.cc.NewStream(ctx, &Admin_ServiceDesc.Streams[0], Admin_StreamEvents_FullMethodName, cOpts...) + if err != nil { + return nil, err + } + x := &grpc.GenericClientStream[StreamEventsRequest, StreamEventsEvent]{ClientStream: stream} + if err := x.ClientStream.SendMsg(in); err != nil { + return nil, err + } + if err := x.ClientStream.CloseSend(); err != nil { + return nil, err + } + return x, nil +} + +// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name. +type Admin_StreamEventsClient = grpc.ServerStreamingClient[StreamEventsEvent] + +// AdminServer is the server API for Admin service. +// All implementations must embed UnimplementedAdminServer +// for forward compatibility. 
+//
+// Admin is the node-side read-only admin gRPC service consumed by
+// cmd/elastickv-admin. Every method requires "authorization: Bearer <token>"
+// metadata unless the node was started with --adminInsecureNoAuth.
+// See docs/admin_ui_key_visualizer_design.md §4 (Layer A).
+type AdminServer interface {
+	GetClusterOverview(context.Context, *GetClusterOverviewRequest) (*GetClusterOverviewResponse, error)
+	GetRaftGroups(context.Context, *GetRaftGroupsRequest) (*GetRaftGroupsResponse, error)
+	GetAdapterSummary(context.Context, *GetAdapterSummaryRequest) (*GetAdapterSummaryResponse, error)
+	GetKeyVizMatrix(context.Context, *GetKeyVizMatrixRequest) (*GetKeyVizMatrixResponse, error)
+	GetRouteDetail(context.Context, *GetRouteDetailRequest) (*GetRouteDetailResponse, error)
+	StreamEvents(*StreamEventsRequest, grpc.ServerStreamingServer[StreamEventsEvent]) error
+	mustEmbedUnimplementedAdminServer()
+}
+
+// UnimplementedAdminServer must be embedded to have
+// forward compatible implementations.
+//
+// NOTE: this should be embedded by value instead of pointer to avoid a nil
+// pointer dereference when methods are called.
+type UnimplementedAdminServer struct{} + +func (UnimplementedAdminServer) GetClusterOverview(context.Context, *GetClusterOverviewRequest) (*GetClusterOverviewResponse, error) { + return nil, status.Error(codes.Unimplemented, "method GetClusterOverview not implemented") +} +func (UnimplementedAdminServer) GetRaftGroups(context.Context, *GetRaftGroupsRequest) (*GetRaftGroupsResponse, error) { + return nil, status.Error(codes.Unimplemented, "method GetRaftGroups not implemented") +} +func (UnimplementedAdminServer) GetAdapterSummary(context.Context, *GetAdapterSummaryRequest) (*GetAdapterSummaryResponse, error) { + return nil, status.Error(codes.Unimplemented, "method GetAdapterSummary not implemented") +} +func (UnimplementedAdminServer) GetKeyVizMatrix(context.Context, *GetKeyVizMatrixRequest) (*GetKeyVizMatrixResponse, error) { + return nil, status.Error(codes.Unimplemented, "method GetKeyVizMatrix not implemented") +} +func (UnimplementedAdminServer) GetRouteDetail(context.Context, *GetRouteDetailRequest) (*GetRouteDetailResponse, error) { + return nil, status.Error(codes.Unimplemented, "method GetRouteDetail not implemented") +} +func (UnimplementedAdminServer) StreamEvents(*StreamEventsRequest, grpc.ServerStreamingServer[StreamEventsEvent]) error { + return status.Error(codes.Unimplemented, "method StreamEvents not implemented") +} +func (UnimplementedAdminServer) mustEmbedUnimplementedAdminServer() {} +func (UnimplementedAdminServer) testEmbeddedByValue() {} + +// UnsafeAdminServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to AdminServer will +// result in compilation errors. +type UnsafeAdminServer interface { + mustEmbedUnimplementedAdminServer() +} + +func RegisterAdminServer(s grpc.ServiceRegistrar, srv AdminServer) { + // If the following call panics, it indicates UnimplementedAdminServer was + // embedded by pointer and is nil. 
This will cause panics if an + // unimplemented method is ever invoked, so we test this at initialization + // time to prevent it from happening at runtime later due to I/O. + if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + s.RegisterService(&Admin_ServiceDesc, srv) +} + +func _Admin_GetClusterOverview_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetClusterOverviewRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(AdminServer).GetClusterOverview(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: Admin_GetClusterOverview_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(AdminServer).GetClusterOverview(ctx, req.(*GetClusterOverviewRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _Admin_GetRaftGroups_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetRaftGroupsRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(AdminServer).GetRaftGroups(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: Admin_GetRaftGroups_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(AdminServer).GetRaftGroups(ctx, req.(*GetRaftGroupsRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _Admin_GetAdapterSummary_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetAdapterSummaryRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(AdminServer).GetAdapterSummary(ctx, in) + } + info := 
&grpc.UnaryServerInfo{ + Server: srv, + FullMethod: Admin_GetAdapterSummary_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(AdminServer).GetAdapterSummary(ctx, req.(*GetAdapterSummaryRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _Admin_GetKeyVizMatrix_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetKeyVizMatrixRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(AdminServer).GetKeyVizMatrix(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: Admin_GetKeyVizMatrix_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(AdminServer).GetKeyVizMatrix(ctx, req.(*GetKeyVizMatrixRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _Admin_GetRouteDetail_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetRouteDetailRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(AdminServer).GetRouteDetail(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: Admin_GetRouteDetail_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(AdminServer).GetRouteDetail(ctx, req.(*GetRouteDetailRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _Admin_StreamEvents_Handler(srv interface{}, stream grpc.ServerStream) error { + m := new(StreamEventsRequest) + if err := stream.RecvMsg(m); err != nil { + return err + } + return srv.(AdminServer).StreamEvents(m, &grpc.GenericServerStream[StreamEventsRequest, StreamEventsEvent]{ServerStream: stream}) +} + +// This type alias is provided for backwards compatibility 
with existing code that references the prior non-generic stream type by name. +type Admin_StreamEventsServer = grpc.ServerStreamingServer[StreamEventsEvent] + +// Admin_ServiceDesc is the grpc.ServiceDesc for Admin service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var Admin_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "Admin", + HandlerType: (*AdminServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "GetClusterOverview", + Handler: _Admin_GetClusterOverview_Handler, + }, + { + MethodName: "GetRaftGroups", + Handler: _Admin_GetRaftGroups_Handler, + }, + { + MethodName: "GetAdapterSummary", + Handler: _Admin_GetAdapterSummary_Handler, + }, + { + MethodName: "GetKeyVizMatrix", + Handler: _Admin_GetKeyVizMatrix_Handler, + }, + { + MethodName: "GetRouteDetail", + Handler: _Admin_GetRouteDetail_Handler, + }, + }, + Streams: []grpc.StreamDesc{ + { + StreamName: "StreamEvents", + Handler: _Admin_StreamEvents_Handler, + ServerStreams: true, + }, + }, + Metadata: "admin.proto", +} From 498a370b77cf8d6e806ec083c75b809620c62c6f Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 22:40:25 +0900 Subject: [PATCH 10/30] fix(admin): harden phase 0 elastickv-admin after review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address gemini review on commit 5eed4f31: - Add Read/Write/IdleTimeout on http.Server (not just ReadHeaderTimeout) to blunt slowloris-style resource exhaustion. - Support TLS for node gRPC with --nodeTLSCACertFile / --nodeTLSServerName / --nodeTLSInsecureSkipVerify / --nodeTLSPlaintext; default is system-root TLS. Plaintext is opt-in and dev-only; skip-verify is mutually exclusive with CA file. - Implement dynamic membership discovery per design §9.1: fan-out resolves targets via GetClusterOverview on a reachable seed, caches the result for --nodesRefreshInterval, and invalidates on Unavailable. 
Falls back to the static seed list when discovery fails entirely. - Fold in the prior ordering/defer/close-error fixes from 5eed4f31 (this commit replaces it after amend). - Tests cover TLS flag precedence, member dedup + seed inclusion, cache-hit suppression and post-expiry refresh, and Unavailable fallback. --- adapter/admin_grpc.go | 29 +++-- adapter/admin_grpc_test.go | 47 ++++++++ cmd/elastickv-admin/main.go | 180 +++++++++++++++++++++++++++-- cmd/elastickv-admin/main_test.go | 191 +++++++++++++++++++++++++++++++ 4 files changed, 433 insertions(+), 14 deletions(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index be43baca..c58e9f74 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -3,6 +3,7 @@ package adapter import ( "context" "crypto/subtle" + "sort" "strings" "sync" @@ -99,9 +100,11 @@ func (s *AdminServer) GetRaftGroups( _ *pb.GetRaftGroupsRequest, ) (*pb.GetRaftGroupsResponse, error) { s.groupsMu.RLock() - out := make([]*pb.RaftGroupState, 0, len(s.groups)) - for id, g := range s.groups { - st := g.Status() + defer s.groupsMu.RUnlock() + ids := sortedGroupIDs(s.groups) + out := make([]*pb.RaftGroupState, 0, len(ids)) + for _, id := range ids { + st := s.groups[id].Status() out = append(out, &pb.RaftGroupState{ RaftGroupId: id, LeaderNodeId: st.Leader.ID, @@ -110,16 +113,16 @@ func (s *AdminServer) GetRaftGroups( AppliedIndex: st.AppliedIndex, }) } - s.groupsMu.RUnlock() return &pb.GetRaftGroupsResponse{Groups: out}, nil } func (s *AdminServer) snapshotLeaders() []*pb.GroupLeader { s.groupsMu.RLock() defer s.groupsMu.RUnlock() - out := make([]*pb.GroupLeader, 0, len(s.groups)) - for id, g := range s.groups { - st := g.Status() + ids := sortedGroupIDs(s.groups) + out := make([]*pb.GroupLeader, 0, len(ids)) + for _, id := range ids { + st := s.groups[id].Status() out = append(out, &pb.GroupLeader{ RaftGroupId: id, LeaderNodeId: st.Leader.ID, @@ -129,6 +132,18 @@ func (s *AdminServer) snapshotLeaders() []*pb.GroupLeader { return 
out } +// sortedGroupIDs returns the map's keys in ascending order so Admin responses +// are deterministic across calls — admin tooling and tests both rely on stable +// ordering. +func sortedGroupIDs(m map[uint64]AdminGroup) []uint64 { + ids := make([]uint64, 0, len(m)) + for id := range m { + ids = append(ids, id) + } + sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] }) + return ids +} + // AdminTokenAuth builds a gRPC unary+stream interceptor pair enforcing // "authorization: Bearer " metadata against the supplied token. An // empty token disables enforcement; callers should pair that mode with a diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index 3001ebec..2572664b 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -70,6 +70,53 @@ func TestGetRaftGroupsExposesCommitApplied(t *testing.T) { } } +// TestGroupOrderingIsStable locks in deterministic ascending-by-RaftGroupId +// ordering so admin UIs and diff-based tests do not see rows jump around. 
+func TestGroupOrderingIsStable(t *testing.T) { + t.Parallel() + srv := NewAdminServer(NodeIdentity{NodeID: "n1"}, nil) + for _, id := range []uint64{7, 2, 5, 3, 1} { + srv.RegisterGroup(id, fakeGroup{leaderID: "n1"}) + } + + groupsResp, err := srv.GetRaftGroups(context.Background(), &pb.GetRaftGroupsRequest{}) + if err != nil { + t.Fatal(err) + } + gotGroups := make([]uint64, 0, len(groupsResp.Groups)) + for _, g := range groupsResp.Groups { + gotGroups = append(gotGroups, g.RaftGroupId) + } + wantGroups := []uint64{1, 2, 3, 5, 7} + if !equalU64s(gotGroups, wantGroups) { + t.Fatalf("GetRaftGroups order = %v, want %v", gotGroups, wantGroups) + } + + overview, err := srv.GetClusterOverview(context.Background(), &pb.GetClusterOverviewRequest{}) + if err != nil { + t.Fatal(err) + } + gotLeaders := make([]uint64, 0, len(overview.GroupLeaders)) + for _, gl := range overview.GroupLeaders { + gotLeaders = append(gotLeaders, gl.RaftGroupId) + } + if !equalU64s(gotLeaders, wantGroups) { + t.Fatalf("GetClusterOverview leader order = %v, want %v", gotLeaders, wantGroups) + } +} + +func equalU64s(a, b []uint64) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + func TestAdminTokenAuth(t *testing.T) { t.Parallel() unary, _ := AdminTokenAuth("s3cret") diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 4c9a2c1c..7be5669e 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -5,6 +5,8 @@ package main import ( "context" + "crypto/tls" + "crypto/x509" "encoding/json" "errors" "flag" @@ -21,8 +23,11 @@ import ( pb "github.com/bootjp/elastickv/proto" "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/credentials" "google.golang.org/grpc/credentials/insecure" "google.golang.org/grpc/metadata" + "google.golang.org/grpc/status" ) const ( @@ -30,6 +35,9 @@ const ( defaultNodesRefreshInterval = 15 * time.Second 
defaultGRPCRequestTimeout = 10 * time.Second readHeaderTimeout = 5 * time.Second + readTimeout = 30 * time.Second + writeTimeout = 30 * time.Second + idleTimeout = 120 * time.Second shutdownTimeout = 5 * time.Second ) @@ -39,6 +47,10 @@ var ( nodeTokenFile = flag.String("nodeTokenFile", "", "File containing the bearer token sent to nodes' Admin service") nodesRefreshInterval = flag.Duration("nodesRefreshInterval", defaultNodesRefreshInterval, "Duration to cache cluster membership before re-fetching") insecureNoAuth = flag.Bool("adminInsecureNoAuth", false, "Skip bearer token authentication; development only") + nodeTLSCACertFile = flag.String("nodeTLSCACertFile", "", "PEM file with CA certificates used to verify nodes' gRPC TLS; enables TLS when set") + nodeTLSServerName = flag.String("nodeTLSServerName", "", "Expected TLS server name when connecting to nodes (overrides the address host)") + nodeTLSSkipVerify = flag.Bool("nodeTLSInsecureSkipVerify", false, "Skip TLS certificate verification; development only") + nodeTLSPlaintext = flag.Bool("nodeTLSPlaintext", false, "Skip TLS entirely and dial nodes with plaintext credentials; development only") ) func main() { @@ -59,7 +71,12 @@ func run() error { return err } - fan := newFanout(seeds, token, *nodesRefreshInterval) + creds, err := loadTransportCredentials(*nodeTLSPlaintext, *nodeTLSCACertFile, *nodeTLSServerName, *nodeTLSSkipVerify) + if err != nil { + return err + } + + fan := newFanout(seeds, token, *nodesRefreshInterval, creds) defer fan.Close() mux := http.NewServeMux() @@ -83,6 +100,9 @@ func run() error { Addr: *bindAddr, Handler: mux, ReadHeaderTimeout: readHeaderTimeout, + ReadTimeout: readTimeout, + WriteTimeout: writeTimeout, + IdleTimeout: idleTimeout, } ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) @@ -148,29 +168,83 @@ func loadToken(path string, insecureMode bool) (string, error) { return token, nil } +// loadTransportCredentials builds the gRPC 
TransportCredentials used to dial +// nodes. Precedence: --nodeTLSPlaintext (dev-only plaintext) → mutually +// exclusive with the TLS flags → otherwise TLS with the system trust roots by +// default, optionally overridden by --nodeTLSCACertFile and +// --nodeTLSInsecureSkipVerify. +func loadTransportCredentials( + plaintext bool, + caFile, serverName string, + skipVerify bool, +) (credentials.TransportCredentials, error) { + if plaintext { + if caFile != "" || serverName != "" || skipVerify { + return nil, errors.New("--nodeTLSPlaintext is mutually exclusive with other TLS flags") + } + return insecure.NewCredentials(), nil + } + cfg := &tls.Config{ + MinVersion: tls.VersionTLS12, + ServerName: serverName, + InsecureSkipVerify: skipVerify, //nolint:gosec // gated behind --nodeTLSInsecureSkipVerify; dev-only. + } + if caFile != "" { + if skipVerify { + return nil, errors.New("--nodeTLSCACertFile and --nodeTLSInsecureSkipVerify are mutually exclusive") + } + pem, err := os.ReadFile(caFile) + if err != nil { + return nil, fmt.Errorf("read node TLS CA file: %w", err) + } + pool := x509.NewCertPool() + if !pool.AppendCertsFromPEM(pem) { + return nil, errors.New("no certificates parsed from --nodeTLSCACertFile") + } + cfg.RootCAs = pool + } + return credentials.NewTLS(cfg), nil +} + type nodeClient struct { addr string conn *grpc.ClientConn client pb.AdminClient } +type membership struct { + addrs []string + fetchedAt time.Time +} + type fanout struct { seeds []string token string refreshInterval time.Duration + creds credentials.TransportCredentials mu sync.Mutex clients map[string]*nodeClient + members *membership } -func newFanout(seeds []string, token string, refreshInterval time.Duration) *fanout { +func newFanout( + seeds []string, + token string, + refreshInterval time.Duration, + creds credentials.TransportCredentials, +) *fanout { if refreshInterval <= 0 { refreshInterval = defaultNodesRefreshInterval } + if creds == nil { + creds = insecure.NewCredentials() + 
} return &fanout{ seeds: seeds, token: token, refreshInterval: refreshInterval, + creds: creds, clients: make(map[string]*nodeClient), } } @@ -179,7 +253,9 @@ func (f *fanout) Close() { f.mu.Lock() defer f.mu.Unlock() for _, c := range f.clients { - _ = c.conn.Close() + if err := c.conn.Close(); err != nil { + log.Printf("elastickv-admin: close gRPC connection to %s: %v", c.addr, err) + } } f.clients = nil } @@ -190,7 +266,7 @@ func (f *fanout) clientFor(addr string) (*nodeClient, error) { if c, ok := f.clients[addr]; ok { return c, nil } - conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(insecure.NewCredentials())) + conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(f.creds)) if err != nil { return nil, fmt.Errorf("dial %s: %w", addr, err) } @@ -199,6 +275,21 @@ func (f *fanout) clientFor(addr string) (*nodeClient, error) { return c, nil } +// invalidateClient drops a cached connection — used when a peer returns +// Unavailable so the next request re-dials or skips the removed node. +func (f *fanout) invalidateClient(addr string) { + f.mu.Lock() + c, ok := f.clients[addr] + delete(f.clients, addr) + f.members = nil + f.mu.Unlock() + if ok { + if err := c.conn.Close(); err != nil { + log.Printf("elastickv-admin: close gRPC connection to %s: %v", addr, err) + } + } +} + func (f *fanout) outgoingCtx(parent context.Context) context.Context { if f.token == "" { return parent @@ -206,10 +297,78 @@ func (f *fanout) outgoingCtx(parent context.Context) context.Context { return metadata.AppendToOutgoingContext(parent, "authorization", "Bearer "+f.token) } +// currentTargets returns the list of node addresses to fan out to. If the +// membership cache is fresh it is returned directly; otherwise the admin binary +// queries one reachable seed via GetClusterOverview and caches the resulting +// member list for refreshInterval. On total failure it falls back to seeds so +// a single unreachable seed does not take the admin offline. 
+func (f *fanout) currentTargets(ctx context.Context) []string { + f.mu.Lock() + if f.members != nil && time.Since(f.members.fetchedAt) < f.refreshInterval { + addrs := append([]string(nil), f.members.addrs...) + f.mu.Unlock() + return addrs + } + f.mu.Unlock() + + for _, seed := range f.seeds { + cli, err := f.clientFor(seed) + if err != nil { + log.Printf("elastickv-admin: dial seed %s: %v", seed, err) + continue + } + resp, err := cli.client.GetClusterOverview(f.outgoingCtx(ctx), &pb.GetClusterOverviewRequest{}) + if err != nil { + if status.Code(err) == codes.Unavailable { + f.invalidateClient(seed) + } + log.Printf("elastickv-admin: discover membership via %s: %v", seed, err) + continue + } + addrs := membersFrom(seed, resp) + f.mu.Lock() + f.members = &membership{addrs: addrs, fetchedAt: time.Now()} + f.mu.Unlock() + return append([]string(nil), addrs...) + } + + log.Printf("elastickv-admin: all seeds unreachable for membership refresh; falling back to static seed list") + return append([]string(nil), f.seeds...) +} + +// membersFrom extracts a deduplicated address list from a cluster overview +// response, always including the node that answered so the answering seed is +// still queried even if it omits itself from members. 
+func membersFrom(seed string, resp *pb.GetClusterOverviewResponse) []string { + seen := map[string]struct{}{} + out := make([]string, 0, len(resp.GetMembers())+1) + add := func(addr string) { + addr = strings.TrimSpace(addr) + if addr == "" { + return + } + if _, dup := seen[addr]; dup { + return + } + seen[addr] = struct{}{} + out = append(out, addr) + } + add(seed) + if self := resp.GetSelf(); self != nil { + add(self.GetGrpcAddress()) + } + for _, m := range resp.GetMembers() { + add(m.GetGrpcAddress()) + } + return out +} + func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { ctx, cancel := context.WithTimeout(r.Context(), defaultGRPCRequestTimeout) defer cancel() + targets := f.currentTargets(ctx) + type perNode struct { Node string `json:"node"` OK bool `json:"ok"` @@ -217,9 +376,9 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { Data *pb.GetClusterOverviewResponse `json:"data,omitempty"` } - results := make([]perNode, len(f.seeds)) + results := make([]perNode, len(targets)) var wg sync.WaitGroup - for i, addr := range f.seeds { + for i, addr := range targets { wg.Add(1) go func(i int, addr string) { defer wg.Done() @@ -232,6 +391,9 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { } resp, err := cli.client.GetClusterOverview(f.outgoingCtx(ctx), &pb.GetClusterOverviewRequest{}) if err != nil { + if status.Code(err) == codes.Unavailable { + f.invalidateClient(addr) + } entry.Error = err.Error() results[i] = entry return @@ -249,7 +411,11 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { func writeJSON(w http.ResponseWriter, code int, body any) { w.Header().Set("Content-Type", "application/json; charset=utf-8") w.WriteHeader(code) - _ = json.NewEncoder(w).Encode(body) + // Status code is already committed by WriteHeader; log encode failures so + // truncated or malformed responses remain visible to operators. 
+ if err := json.NewEncoder(w).Encode(body); err != nil { + log.Printf("elastickv-admin: encode JSON response: %v", err) + } } func writeJSONError(w http.ResponseWriter, code int, msg string) { diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index e33d167c..597f608c 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -1,10 +1,27 @@ package main import ( + "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "math/big" + "net" "os" "path/filepath" "strings" + "sync/atomic" "testing" + "time" + + pb "github.com/bootjp/elastickv/proto" + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/status" ) func TestSplitNodesTrimsAndDrops(t *testing.T) { @@ -75,3 +92,177 @@ func TestLoadTokenRejectsInsecureWithFile(t *testing.T) { t.Fatal("expected mutual-exclusion error when both supplied") } } + +func TestLoadTransportCredentialsPrecedence(t *testing.T) { + t.Parallel() + + if _, err := loadTransportCredentials(true, "", "", false); err != nil { + t.Fatalf("plaintext alone should succeed: %v", err) + } + if _, err := loadTransportCredentials(true, "/tmp/ca.pem", "", false); err == nil { + t.Fatal("plaintext + CA file should error") + } + if _, err := loadTransportCredentials(true, "", "", true); err == nil { + t.Fatal("plaintext + skip-verify should error") + } + + dir := t.TempDir() + ca := filepath.Join(dir, "ca.pem") + if err := os.WriteFile(ca, writePEMCert(t), 0o600); err != nil { + t.Fatal(err) + } + if _, err := loadTransportCredentials(false, ca, "", true); err == nil { + t.Fatal("CA file + skip-verify should error") + } + creds, err := loadTransportCredentials(false, ca, "node-1", false) + if err != nil { + t.Fatalf("valid CA config failed: %v", err) + } + if creds == nil { + t.Fatal("expected TLS creds") + } + + bad := filepath.Join(dir, "bad.pem") + if 
err := os.WriteFile(bad, []byte("not a cert"), 0o600); err != nil { + t.Fatal(err) + } + if _, err := loadTransportCredentials(false, bad, "", false); err == nil { + t.Fatal("expected error for unparseable CA file") + } +} + +func writePEMCert(t *testing.T) []byte { + t.Helper() + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + t.Fatal(err) + } + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "test-ca"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(time.Hour), + IsCA: true, + KeyUsage: x509.KeyUsageCertSign, + } + der, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, &key.PublicKey, key) + if err != nil { + t.Fatal(err) + } + return pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) +} + +func TestMembersFromDeduplicatesAndIncludesSeed(t *testing.T) { + t.Parallel() + resp := &pb.GetClusterOverviewResponse{ + Self: &pb.NodeIdentity{GrpcAddress: "a:1"}, + Members: []*pb.NodeIdentity{{GrpcAddress: "a:1"}, {GrpcAddress: "b:2"}, {GrpcAddress: " "}, {GrpcAddress: "c:3"}}, + } + got := membersFrom("seed:1", resp) + want := []string{"seed:1", "a:1", "b:2", "c:3"} + if len(got) != len(want) { + t.Fatalf("len = %d (%v), want %d", len(got), got, len(want)) + } + for i := range want { + if got[i] != want[i] { + t.Fatalf("[%d] = %q, want %q", i, got[i], want[i]) + } + } +} + +// fakeAdminServer counts GetClusterOverview calls and returns a configurable +// member list, letting the test assert membership-cache behavior. 
+type fakeAdminServer struct { + pb.UnimplementedAdminServer + addr string + members []string + calls atomic.Int64 + returnUn bool +} + +func (f *fakeAdminServer) GetClusterOverview( + _ context.Context, + _ *pb.GetClusterOverviewRequest, +) (*pb.GetClusterOverviewResponse, error) { + f.calls.Add(1) + if f.returnUn { + return nil, status.Error(codes.Unavailable, "node gone") + } + members := make([]*pb.NodeIdentity, 0, len(f.members)) + for _, m := range f.members { + members = append(members, &pb.NodeIdentity{GrpcAddress: m}) + } + return &pb.GetClusterOverviewResponse{ + Self: &pb.NodeIdentity{GrpcAddress: f.addr}, + Members: members, + }, nil +} + +func startFakeAdmin(t *testing.T, srv *fakeAdminServer) string { + t.Helper() + var lc net.ListenConfig + lis, err := lc.Listen(context.Background(), "tcp", "127.0.0.1:0") + if err != nil { + t.Fatal(err) + } + srv.addr = lis.Addr().String() + gs := grpc.NewServer() + pb.RegisterAdminServer(gs, srv) + go func() { _ = gs.Serve(lis) }() + t.Cleanup(func() { + gs.GracefulStop() + _ = lis.Close() + }) + return srv.addr +} + +func TestFanoutCurrentTargetsCachesAndRefreshes(t *testing.T) { + t.Parallel() + + peer := &fakeAdminServer{members: []string{"peer-1:1", "peer-2:2"}} + seedAddr := startFakeAdmin(t, peer) + + f := newFanout([]string{seedAddr}, "", 50*time.Millisecond, insecure.NewCredentials()) + defer f.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + first := f.currentTargets(ctx) + if len(first) != 3 { + t.Fatalf("first call targets = %v, want 3 (seed + 2 members)", first) + } + if peer.calls.Load() != 1 { + t.Fatalf("calls = %d, want 1 after first refresh", peer.calls.Load()) + } + + // Within the cache window, no new discovery RPC. 
+ _ = f.currentTargets(ctx) + if peer.calls.Load() != 1 { + t.Fatalf("cache window should suppress refresh, calls = %d", peer.calls.Load()) + } + + time.Sleep(70 * time.Millisecond) + _ = f.currentTargets(ctx) + if peer.calls.Load() != 2 { + t.Fatalf("post-expiry refresh expected, calls = %d", peer.calls.Load()) + } +} + +func TestFanoutCurrentTargetsFallsBackToSeeds(t *testing.T) { + t.Parallel() + + peer := &fakeAdminServer{returnUn: true} + seedAddr := startFakeAdmin(t, peer) + + f := newFanout([]string{seedAddr}, "", 50*time.Millisecond, insecure.NewCredentials()) + defer f.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + targets := f.currentTargets(ctx) + if len(targets) != 1 || targets[0] != seedAddr { + t.Fatalf("fallback targets = %v, want [%s]", targets, seedAddr) + } +} From 8bd34d92d010a8b1a862a9aeb62955919d415b05 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 23:02:47 +0900 Subject: [PATCH 11/30] fix(admin): address phase 0 round-2 gemini review - Collapse concurrent membership refreshes through golang.org/x/sync/singleflight so a browser stampede after cache expiry hits one seed, not many. - Use github.com/cockroachdb/errors for consistency with the rest of the repo. - Derive a short discoveryRPCTimeout (2s) for each seed so a slow first seed cannot stall overall discovery. - /api/cluster/overview now rejects non-GET and caps request body via http.MaxBytesReader. - writeJSON buffers before committing the status line, so encode failures surface as 500 with a static fallback body instead of a truncated 2xx body. - Tests cover singleflight dedup, method rejection, encode-failure 500 path, and success path. 
--- cmd/elastickv-admin/main.go | 72 ++++++++++++++++----- cmd/elastickv-admin/main_test.go | 103 +++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 15 deletions(-) diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 7be5669e..c29ff880 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -4,13 +4,12 @@ package main import ( + "bytes" "context" "crypto/tls" "crypto/x509" "encoding/json" - "errors" "flag" - "fmt" "log" "net/http" "os" @@ -22,6 +21,8 @@ import ( "time" pb "github.com/bootjp/elastickv/proto" + "github.com/cockroachdb/errors" + "golang.org/x/sync/singleflight" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/credentials" @@ -34,11 +35,13 @@ const ( defaultBindAddr = "127.0.0.1:8080" defaultNodesRefreshInterval = 15 * time.Second defaultGRPCRequestTimeout = 10 * time.Second + discoveryRPCTimeout = 2 * time.Second readHeaderTimeout = 5 * time.Second readTimeout = 30 * time.Second writeTimeout = 30 * time.Second idleTimeout = 120 * time.Second shutdownTimeout = 5 * time.Second + maxRequestBodyBytes = 4 << 10 ) var ( @@ -123,7 +126,7 @@ func run() error { shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout) defer shutdownCancel() if err := srv.Shutdown(shutdownCtx); err != nil { - return fmt.Errorf("shutdown: %w", err) + return errors.Wrap(err, "shutdown") } return nil case err := <-errCh: @@ -155,11 +158,11 @@ func loadToken(path string, insecureMode bool) (string, error) { } abs, err := filepath.Abs(path) if err != nil { - return "", fmt.Errorf("resolve token path: %w", err) + return "", errors.Wrap(err, "resolve token path") } b, err := os.ReadFile(abs) if err != nil { - return "", fmt.Errorf("read token file: %w", err) + return "", errors.Wrap(err, "read token file") } token := strings.TrimSpace(string(b)) if token == "" { @@ -195,7 +198,7 @@ func loadTransportCredentials( } pem, err := os.ReadFile(caFile) if err != nil 
{ - return nil, fmt.Errorf("read node TLS CA file: %w", err) + return nil, errors.Wrap(err, "read node TLS CA file") } pool := x509.NewCertPool() if !pool.AppendCertsFromPEM(pem) { @@ -226,6 +229,11 @@ type fanout struct { mu sync.Mutex clients map[string]*nodeClient members *membership + + // refreshGroup deduplicates concurrent membership refresh RPCs so a burst + // of browser requests immediately after cache expiry collapses into a + // single GetClusterOverview call against one seed. + refreshGroup singleflight.Group } func newFanout( @@ -268,7 +276,7 @@ func (f *fanout) clientFor(addr string) (*nodeClient, error) { } conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(f.creds)) if err != nil { - return nil, fmt.Errorf("dial %s: %w", addr, err) + return nil, errors.Wrapf(err, "dial %s", addr) } c := &nodeClient{addr: addr, conn: conn, client: pb.NewAdminClient(conn)} f.clients[addr] = c @@ -299,9 +307,11 @@ func (f *fanout) outgoingCtx(parent context.Context) context.Context { // currentTargets returns the list of node addresses to fan out to. If the // membership cache is fresh it is returned directly; otherwise the admin binary -// queries one reachable seed via GetClusterOverview and caches the resulting -// member list for refreshInterval. On total failure it falls back to seeds so -// a single unreachable seed does not take the admin offline. +// queries seeds via GetClusterOverview and caches the resulting member list +// for refreshInterval. Concurrent refreshes are collapsed through singleflight +// so a burst of requests after cache expiry hits only one seed. On total +// failure it falls back to the static seed list so a single unreachable seed +// does not take the admin offline. 
func (f *fanout) currentTargets(ctx context.Context) []string { f.mu.Lock() if f.members != nil && time.Since(f.members.fetchedAt) < f.refreshInterval { @@ -311,13 +321,26 @@ func (f *fanout) currentTargets(ctx context.Context) []string { } f.mu.Unlock() + result, _, _ := f.refreshGroup.Do("members", func() (any, error) { + return f.refreshMembership(ctx), nil + }) + addrs, _ := result.([]string) + return addrs +} + +// refreshMembership performs the actual discovery RPC. It honours the caller's +// context for overall cancellation but derives a short per-seed timeout from +// discoveryRPCTimeout so a slow first seed does not stall the whole request. +func (f *fanout) refreshMembership(ctx context.Context) []string { for _, seed := range f.seeds { cli, err := f.clientFor(seed) if err != nil { log.Printf("elastickv-admin: dial seed %s: %v", seed, err) continue } - resp, err := cli.client.GetClusterOverview(f.outgoingCtx(ctx), &pb.GetClusterOverviewRequest{}) + rpcCtx, cancel := context.WithTimeout(ctx, discoveryRPCTimeout) + resp, err := cli.client.GetClusterOverview(f.outgoingCtx(rpcCtx), &pb.GetClusterOverviewRequest{}) + cancel() if err != nil { if status.Code(err) == codes.Unavailable { f.invalidateClient(seed) @@ -364,6 +387,11 @@ func membersFrom(seed string, resp *pb.GetClusterOverviewResponse) []string { } func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + writeJSONError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + r.Body = http.MaxBytesReader(w, r.Body, maxRequestBodyBytes) ctx, cancel := context.WithTimeout(r.Context(), defaultGRPCRequestTimeout) defer cancel() @@ -408,13 +436,27 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, map[string]any{"nodes": results}) } +// writeJSON marshals body into a buffer first, so an encoding failure can +// still surface as a 500 instead of a truncated body under a committed 2xx +// 
header. The admin API response bodies are small (bounded by rows/routes +// caps in later phases), so buffering is safe. func writeJSON(w http.ResponseWriter, code int, body any) { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + if err := enc.Encode(body); err != nil { + log.Printf("elastickv-admin: encode JSON response: %v", err) + w.Header().Set("Content-Type", "application/json; charset=utf-8") + w.WriteHeader(http.StatusInternalServerError) + const fallback = `{"code":500,"message":"internal server error"}` + "\n" + if _, werr := w.Write([]byte(fallback)); werr != nil { + log.Printf("elastickv-admin: write fallback response: %v", werr) + } + return + } w.Header().Set("Content-Type", "application/json; charset=utf-8") w.WriteHeader(code) - // Status code is already committed by WriteHeader; log encode failures so - // truncated or malformed responses remain visible to operators. - if err := json.NewEncoder(w).Encode(body); err != nil { - log.Printf("elastickv-admin: encode JSON response: %v", err) + if _, err := w.Write(buf.Bytes()); err != nil { + log.Printf("elastickv-admin: write JSON response: %v", err) } } diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index 597f608c..dc814e75 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -7,9 +7,13 @@ import ( "crypto/rand" "crypto/x509" "crypto/x509/pkix" + "encoding/json" "encoding/pem" + "math" "math/big" "net" + "net/http" + "net/http/httptest" "os" "path/filepath" "strings" @@ -266,3 +270,102 @@ func TestFanoutCurrentTargetsFallsBackToSeeds(t *testing.T) { t.Fatalf("fallback targets = %v, want [%s]", targets, seedAddr) } } + +// TestFanoutCurrentTargetsSingleflight asserts that concurrent refreshes after +// cache expiry collapse into one GetClusterOverview call. 
+func TestFanoutCurrentTargetsSingleflight(t *testing.T) { + t.Parallel() + + peer := &fakeAdminServer{members: []string{"peer-1:1"}} + seedAddr := startFakeAdmin(t, peer) + + f := newFanout([]string{seedAddr}, "", math.MaxInt64, insecure.NewCredentials()) + defer f.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + // Warm: trigger first refresh serially so singleflight key exists. + _ = f.currentTargets(ctx) + if peer.calls.Load() != 1 { + t.Fatalf("warm-up calls = %d, want 1", peer.calls.Load()) + } + + // Force expiry by nil-ing the cache and then fire many concurrent refresh + // attempts. Because refreshInterval is effectively infinite, only the + // forced clear can cause a refresh, and singleflight should collapse the + // burst into a single RPC. + f.mu.Lock() + f.members = nil + f.mu.Unlock() + + const concurrency = 20 + done := make(chan struct{}) + for i := 0; i < concurrency; i++ { + go func() { + _ = f.currentTargets(ctx) + done <- struct{}{} + }() + } + for i := 0; i < concurrency; i++ { + <-done + } + + // Expect exactly one additional RPC for the burst. 
+ if got := peer.calls.Load(); got != 2 { + t.Fatalf("singleflight failed: calls = %d, want 2", got) + } +} + +func TestHandleOverviewRejectsNonGET(t *testing.T) { + t.Parallel() + f := newFanout([]string{"127.0.0.1:0"}, "", time.Second, insecure.NewCredentials()) + defer f.Close() + + req := httptest.NewRequest(http.MethodPost, "/api/cluster/overview", strings.NewReader("{}")) + rec := httptest.NewRecorder() + f.handleOverview(rec, req) + + if rec.Code != http.StatusMethodNotAllowed { + t.Fatalf("code = %d, want %d", rec.Code, http.StatusMethodNotAllowed) + } + var body struct { + Code int `json:"code"` + Message string `json:"message"` + } + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatal(err) + } + if body.Code != http.StatusMethodNotAllowed { + t.Fatalf("body.code = %d", body.Code) + } +} + +func TestWriteJSONSurfacesEncodeFailure(t *testing.T) { + t.Parallel() + rec := httptest.NewRecorder() + // math.Inf(1) is not representable in JSON; encoding fails. + writeJSON(rec, http.StatusOK, math.Inf(1)) + if rec.Code != http.StatusInternalServerError { + t.Fatalf("code = %d, want %d", rec.Code, http.StatusInternalServerError) + } + if !strings.Contains(rec.Body.String(), "internal server error") { + t.Fatalf("body = %q", rec.Body.String()) + } +} + +func TestWriteJSONSuccessPath(t *testing.T) { + t.Parallel() + rec := httptest.NewRecorder() + writeJSON(rec, http.StatusOK, map[string]int{"n": 42}) + if rec.Code != http.StatusOK { + t.Fatalf("code = %d", rec.Code) + } + var out map[string]int + if err := json.Unmarshal(rec.Body.Bytes(), &out); err != nil { + t.Fatal(err) + } + if out["n"] != 42 { + t.Fatalf("body = %v", out) + } +} From ffc21a0aa2b062645bae12faa9b07a9e187a58fd Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 23:16:10 +0900 Subject: [PATCH 12/30] docs,fix(admin): address round-3 reviews Gemini and CodeRabbit feedback on commit 8bd34d92: - adapter/admin_grpc.go: derive the auth-gate prefix from 
pb.Admin_ServiceDesc.ServiceName instead of the hardcoded /Admin/ so a future proto package declaration cannot silently bypass authentication. Test updated to match. - Design doc section 5.6: replace raftLogIndex with a leader-picked 128-bit proposalID in the lineageID derivation so a re-proposal that lands at a different log index preserves the same lineageID. - Design doc section 5.6: startup lineage backfill must be performed by the Raft leader via a new BackfillLineage command so replicas cannot race and produce divergent records. - Design doc section 9.1: simplify fan-out merge key to (bucketID, raftGroupID, leaderTerm, windowStart). Raft guarantees one leader per term per group so sourceNode was redundant; conflicting duplicates surface with conflict=true rather than being dropped. - Design doc section 8: narrow the no-Raft-or-FSM-changes claim to Phases 0-2 and spell out the Phase 3 additions (proposalID in route-transition commands, BackfillLineage, per-group KeyViz proposals). - Design doc section 5.3: add language tag to the fenced code block so markdownlint is clean. Rejected: CodeRabbit's suggestion to add a proto package declaration to admin.proto. No file in this proto/ directory declares one, the repo does not use Buf, and adding one would break the convention without local benefit. --- adapter/admin_grpc.go | 9 +++++++-- adapter/admin_grpc_test.go | 2 +- docs/admin_ui_key_visualizer_design.md | 12 ++++++------ 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index c58e9f74..d8feaa55 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -144,6 +144,11 @@ func sortedGroupIDs(m map[uint64]AdminGroup) []uint64 { return ids } +// adminMethodPrefix is "/Admin/" today but is derived from the generated +// service descriptor so a future proto package declaration (which would +// package-qualify the service name) does not silently bypass the auth gate. 
+var adminMethodPrefix = "/" + pb.Admin_ServiceDesc.ServiceName + "/" + // AdminTokenAuth builds a gRPC unary+stream interceptor pair enforcing // "authorization: Bearer " metadata against the supplied token. An // empty token disables enforcement; callers should pair that mode with a @@ -177,7 +182,7 @@ func AdminTokenAuth(token string) (grpc.UnaryServerInterceptor, grpc.StreamServe info *grpc.UnaryServerInfo, handler grpc.UnaryHandler, ) (any, error) { - if !strings.HasPrefix(info.FullMethod, "/Admin/") { + if !strings.HasPrefix(info.FullMethod, adminMethodPrefix) { return handler(ctx, req) } if err := check(ctx); err != nil { @@ -191,7 +196,7 @@ func AdminTokenAuth(token string) (grpc.UnaryServerInterceptor, grpc.StreamServe info *grpc.StreamServerInfo, handler grpc.StreamHandler, ) error { - if !strings.HasPrefix(info.FullMethod, "/Admin/") { + if !strings.HasPrefix(info.FullMethod, adminMethodPrefix) { return handler(srv, ss) } if err := check(ss.Context()); err != nil { diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index 2572664b..38326544 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -124,7 +124,7 @@ func TestAdminTokenAuth(t *testing.T) { t.Fatal("interceptor should be non-nil for configured token") } - info := &grpc.UnaryServerInfo{FullMethod: "/Admin/GetClusterOverview"} + info := &grpc.UnaryServerInfo{FullMethod: "/" + pb.Admin_ServiceDesc.ServiceName + "/GetClusterOverview"} handler := func(_ context.Context, _ any) (any, error) { return "ok", nil } cases := []struct { diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index d6e9dfa2..358b4ee9 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -166,7 +166,7 @@ Benchmark gate in CI: `BenchmarkCoordinatorDispatch` with sampler off vs on; the ### 5.3 In-memory representation and the route budget -``` +```text Sampler ├─ routes atomic.Pointer[routeTable] // 
immutable map[RouteID]*routeSlot, COW-updated off the hot path │ each routeSlot points to (reads, writes, readBytes, writeBytes, sampleRate) @@ -205,10 +205,10 @@ Phases 0–2 keep history in memory only. Restart loses the heatmap — acceptab Phase 3 persists compacted columns **distributed across the user Raft groups themselves, not the default group**. Concentrating KeyViz writes on the default group would centralise I/O and Raft-log growth onto a single group, creating exactly the kind of hotspot this feature is built to surface. Instead: - Each compacted KeyViz column is written to the **Raft group that owns its key range**, under a group-local admin namespace `!admin|keyviz|range||`; the prefix is not routed through the default group. Phase 3 also adds an explicit system-namespace filter so every user-plane read and timestamp-selection path — `pebbleStore.ScanAt`, `ReverseScanAt`, `GetAt`, `ExistsAt`, and `ShardedCoordinator.maxLatestCommitTS` — ignores `!admin|*` records; point reads that target an `!admin|*` key return `NotFound` as if the key did not exist, so an attacker cannot distinguish "hidden" from "missing". The current `isPebbleMetaKey` exact-match check (`store/lsm_store.go:299`) is widened to a prefix check on `!admin|`, and the same check is applied in `nextScannableUserKey` / `prevScannableUserKey` so internal KeyViz records are skipped during user-plane scans. To prevent the inverse leak, every data-plane adapter (gRPC `RawKV`/`TransactionalKV`, Redis, DynamoDB, S3) rejects user-plane writes — `Put`, `Delete`, transactional mutations, and Redis equivalents — whose key starts with `!admin|`. The check is centralised in `kv.ShardedCoordinator` so adapters cannot forget it; a write attempting an `!admin|*` key returns `InvalidArgument` and is recorded in the audit metric. 
-- `lineageID` is generated **exactly once, by the Raft leader proposing the split/merge**, as part of the route-transition command itself, and then stored in the Raft log — so every replica reads the same value instead of regenerating it. This avoids violating the repository invariant that persistence timestamps must originate from the Raft leader, not from a node-local clock. The transition HLC used is the **leader-issued HLC stamped onto the `SplitRange`/`MergeRange` Raft proposal** (same HLC that backs OCC decisions), never a node-local snapshot; followers observe the lineageID by replaying the committed command. -- The UUIDv7 is derived deterministically from that leader-issued HLC plus the proposal's Raft log index so the same transition yields the same lineageID on every replica and on re-proposal: the 48-bit `unix_ts_ms` field gets the HLC physical part (ms resolution), and the full 16-bit HLC logical counter is packed across `rand_a` (12 bits) and the top nibble of `rand_b` — logical bits `[15:4]` into `rand_a`, logical bits `[3:0]` into the top 4 bits of `rand_b`, so no logical bits are dropped. The remaining 58 bits of `rand_b` are filled from `BLAKE2b-256(raftGroupID || raftLogIndex || proposalBytes)` truncated to 58 bits — deterministic across replicas, collision-resistant across transitions, and no runtime RNG dependency. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs, raftLogIndex}` with `validFromHLC` carrying the full HLC so the reader can re-sort authoritatively; `RouteID` is recorded only as the current routing hint, never as the primary history key. +- `lineageID` is generated **exactly once, by the Raft leader proposing the split/merge**, as part of the route-transition command itself, and then stored in the Raft log — so every replica reads the same value instead of regenerating it. 
This avoids violating the repository invariant that persistence timestamps must originate from the Raft leader, not from a node-local clock. The transition HLC used is the **leader-issued HLC stamped onto the `SplitRange`/`MergeRange` Raft proposal** (same HLC that backs OCC decisions), never a node-local snapshot; followers observe the lineageID by replaying the committed command. If the leader retries the proposal (e.g., after a `VerifyLeader` failure), the retry keeps the original lineageID because it is embedded in the command payload; nothing about the lineageID depends on the eventual Raft log index it lands at. +- The UUIDv7 is derived deterministically from the leader-issued HLC plus a stable **proposal ID** that the leader generates before enqueueing the command (128-bit random, embedded in the proposal), not the Raft log index — this is what keeps the ID stable across re-proposals. The 48-bit `unix_ts_ms` field gets the HLC physical part (ms resolution), and the full 16-bit HLC logical counter is packed across `rand_a` (12 bits) and the top nibble of `rand_b` — logical bits `[15:4]` into `rand_a`, logical bits `[3:0]` into the top 4 bits of `rand_b`, so no logical bits are dropped. The remaining 58 bits of `rand_b` are filled from `BLAKE2b-256(raftGroupID || proposalID)` truncated to 58 bits — deterministic across replicas, collision-resistant across transitions, and no runtime RNG dependency after the leader has picked the proposal ID. The lineage record stores `{start, end, routeID, validFromHLC, validToHLC, parentLineageIDs, proposalID}` with `validFromHLC` carrying the full HLC so the reader can re-sort authoritatively; `RouteID` is recorded only as the current routing hint, never as the primary history key. - Split and merge events append small group-local lineage records under `!admin|keyviz|lineage|` and mark closed branches with `validToHLC` so retention GC can later prune them. 
On split, both children point back to the parent lineage and inherit the parent's compacted history for continuity. On merge, the survivor records both child lineage IDs and the reader sums overlapping intervals. If a node sees historical rows without a lineage record during an upgrade, the admin reader falls back to overlap on the persisted `[start, end)` range before using `RouteID`. -- On startup, the sampler rebuilds its in-memory `RouteID → lineageID` map by scanning the group-local lineage index for routes currently owned by the node's groups and matching active `[start, end)` ranges from `ListRoutes`. If a route exists without a matching lineage record, the node creates a new lineage record with a parent pointer to the best overlapping retained range. This makes rolling restarts and upgrades preserve historical continuity. +- On startup, the sampler rebuilds its in-memory `RouteID → lineageID` map by scanning the group-local lineage index for routes currently owned by the node's groups and matching active `[start, end)` ranges from `ListRoutes`. If a route exists without a matching lineage record (legacy data from before Phase 3), **only the current Raft leader proposes a `BackfillLineage` command** — a single-writer Raft entry carrying the leader-issued HLC, a leader-picked proposal ID (same construction as above), and a parent pointer to the best overlapping retained range. Followers observe the record by replaying the committed entry, never by generating it locally. This makes rolling restarts and upgrades preserve historical continuity without letting concurrent replicas race and persist divergent lineage IDs. - Writes are batched per group on a configurable interval (`--keyvizPersistInterval`, **default 5 min**, max 1 h) and dispatched as a single low-priority Raft proposal per group, keeping the write amplification proportional to the group's own traffic. 
Hourly was rejected as the default because a node crash between flushes would lose up to one hour of heatmap; 5 min bounds worst-case loss while still amortising Raft cost. As a defence-in-depth against single-point loss, each node also keeps the most recent unflushed window in a small **append-only WAL file** (`<dataDir>/keyviz/wal-<groupID>.log`) under the same retention contract, with two hard bounds to keep restart fast: the WAL is **size-capped at `--keyvizWALMaxBytes` (default 64 MiB)** and **checkpointed every `--keyvizPersistInterval`** — when a batch is persisted to Raft, the corresponding WAL prefix is truncated. This caps worst-case replay at one interval's worth of data (at the default, tens of MiB at most), with a target recovery budget of **≤1 s replay time at 1 M ops/s**. If the WAL exceeds its size cap before the next flush — indicating the node is behind on persistence — the sampler drops the oldest records and records a `keyviz_wal_shed_total` metric instead of blocking the hot path. On startup the sampler fast-loads the WAL without running the adaptive controller, then resumes normal operation; readiness is gated on WAL replay completion so rolling upgrades do not route traffic to a node that is still rebuilding state. Operators that want stricter durability set `--keyvizPersistInterval=30s`; those that want faster restart at the cost of more write amplification set a smaller `--keyvizWALMaxBytes`. - Retention is enforced by a KeyViz-specific GC pass, not by assuming ordinary HLC expiry will delete the latest MVCC version. Phase 3 prefers a **Pebble `CompactionFilter`** that drops expired `!admin|keyviz|*` versions during normal background compactions — this avoids the I/O and CPU cost of an out-of-band scan-and-delete sweep, since the work happens during compactions that would run anyway.
As a fallback for store flavours where a CompactionFilter is unavailable, an opt-in maintenance pass tombstones expired column and lineage records using a bounded, time-budgeted scan (default ≤5% of disk read bandwidth). Persistence refuses to enable if neither path is available, avoiding unbounded growth. - Lineage records are retained while any column in the 7-day retention window references them. The same GC pass prunes closed lineage branches whose `validToHLC` and descendants are older than retention, so frequent split/merge clusters do not accumulate an unbounded lineage tree. @@ -264,7 +264,7 @@ This adds roughly a dozen integer fields per tracked operation and avoids both t | `main.go` | Register token-protected `Admin` gRPC service; wire `keyviz.Sampler` into the coordinator; wire `LiveSummary` into observers; add `--adminTokenFile`, `--adminInsecureNoAuth`, `--keyvizMaxTrackedRoutes`, `--keyvizPersistInterval`, and `--keyvizWALMaxBytes`. | | `web/` (new) | Svelte SPA source. | -No changes to Raft or FSM are required. Data-plane protocol adapters only receive the sampler call site and the `LiveSummary` hook that sits next to existing Prometheus writes. Phase 3 intentionally touches the store/coordinator read paths to keep `!admin|keyviz|*` metadata out of user scans and timestamp selection. +Phases 0–2 require no Raft or FSM changes. Data-plane protocol adapters only receive the sampler call site and the `LiveSummary` hook that sits next to existing Prometheus writes. Phase 3 does change Raft and FSM paths: split/merge Raft commands carry a leader-picked `proposalID` and the derived `lineageID`, a new `BackfillLineage` leader-only command is added for startup recovery, per-group low-priority Raft proposals persist compacted KeyViz columns, and the store/coordinator read paths are updated to keep `!admin|keyviz|*` metadata out of user scans and timestamp selection. ## 9. 
Deployment and Operation @@ -279,7 +279,7 @@ Because writes are recorded by Raft leaders and follower-local reads are recorde - `--nodes` accepts a comma-separated list of seed addresses. The admin binary calls `GetClusterOverview` on any reachable seed to discover the current full membership (node → gRPC endpoint, plus per-group leader identity). Membership is cached for `--nodesRefreshInterval` (**default 15 s**) so a stampede of concurrent browser requests hits at most one `GetClusterOverview` per interval per seed, while scale-out events are still reflected within seconds. The cache is refreshed lazily on the first request after expiry and invalidated immediately on any per-node `Unavailable` error, so removed or replaced nodes are dropped on the next request instead of waiting for the next tick. - For each query (`GetKeyVizMatrix`, `GetRouteDetail`, `GetAdapterSummary`), the admin binary issues parallel gRPC calls to every known node and merges results server-side before sending one combined JSON payload to the browser. -- Merging rule for the heatmap: rows are grouped by `bucketID`/`lineageID` and time step. Read samples from multiple nodes are **summed**, because they represent distinct locally served reads. Write samples are grouped by `(bucketID, raftGroupID, leaderTerm, sourceNode, windowStart)` and summed across distinct leader terms during leadership transitions; exact duplicate source keys are deduplicated. The admin binary never uses "later timestamp wins" to overwrite a previous leader's complete window with a new leader's partial window. If two leaders claim overlapping terms for the same group, the cell is returned with `conflict=true` and rendered hatched rather than silently dropping data. +- Merging rule for the heatmap: rows are grouped by `bucketID`/`lineageID` and time step. Read samples from multiple nodes are **summed**, because they represent distinct locally served reads. 
For write samples the authoritative identity is `(raftGroupID, leaderTerm)` — by Raft invariants at most one leader exists per term per group — so the admin binary collapses write samples to **one value per `(bucketID, raftGroupID, leaderTerm, windowStart)`** key. If the same logical key arrives from more than one node (e.g., an ex-leader that has not yet expired its local cache plus a correctly-responding new leader in the same term), the entries are expected to be identical and the merger keeps one; if they differ, the cell is surfaced with `conflict=true` (not silently dropped). Across distinct `leaderTerm` values for the same group and window, values are summed because each term's leader only observed its own term's writes. The admin binary never uses "later timestamp wins" to overwrite a previous leader's complete window with a new leader's partial window. - Degraded mode: if any node is unreachable, the admin binary returns a partial result with a per-node `{node, ok, error}` status array so the UI can surface "3 of 4 nodes responded" instead of silently hiding ranges. The heatmap hatches rows or time windows whose expected source node failed. - A single-node mode (`--nodes=one:50051 --no-fanout`) is retained for operators who explicitly want the partial view. From 1ffd21ab79e855d1e99d7cd97c1dec29115f7f8a Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 23:28:57 +0900 Subject: [PATCH 13/30] fix(admin),docs: address round-4 review Gemini round-4 feedback on commit ffc21a0a: - cmd/elastickv-admin: encode protobuf responses via protojson (preserves proto3 camelCase and well-known-type mapping) instead of encoding/json, which loses field-name fidelity when marshaling generated pb types. Added a regression test that checks for the camelCase "grpcAddress" marker. 
- docs: add Phase 3 partial-availability UX section to section 5.6 describing how the admin UI degrades when a user Raft group is unavailable (per-group status array, hatched rows, cached lineage mapping, 200 + allGroupsHealthy boolean) so the heatmap does not silently gap out. Note: MaxBytesReader on /api/cluster/overview was already added in round-2 commit 8bd34d92. --- cmd/elastickv-admin/main.go | 45 +++++++++++++++++++------- cmd/elastickv-admin/main_test.go | 26 +++++++++++++++ docs/admin_ui_key_visualizer_design.md | 8 +++++ 3 files changed, 68 insertions(+), 11 deletions(-) diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index c29ff880..62897ca5 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -29,6 +29,8 @@ import ( "google.golang.org/grpc/credentials/insecure" "google.golang.org/grpc/metadata" "google.golang.org/grpc/status" + "google.golang.org/protobuf/encoding/protojson" + "google.golang.org/protobuf/proto" ) const ( @@ -386,6 +388,29 @@ func membersFrom(seed string, resp *pb.GetClusterOverviewResponse) []string { return out } +// perNodeResult wraps a fan-out response from one node. Data is stored as +// json.RawMessage so it can be filled with a protojson-encoded protobuf +// message — encoding/json would lose the proto3 field-name mapping and +// well-known-type handling. +type perNodeResult struct { + Node string `json:"node"` + OK bool `json:"ok"` + Error string `json:"error,omitempty"` + Data json.RawMessage `json:"data,omitempty"` +} + +// marshalProto encodes a protobuf message with the JSON mapping that preserves +// proto3 field names and well-known-type semantics. 
+var protoMarshaler = protojson.MarshalOptions{EmitUnpopulated: true, UseProtoNames: false} + +func marshalProto(m proto.Message) (json.RawMessage, error) { + raw, err := protoMarshaler.Marshal(m) + if err != nil { + return nil, errors.Wrap(err, "protojson marshal") + } + return raw, nil +} + func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodGet { writeJSONError(w, http.StatusMethodNotAllowed, "method not allowed") @@ -396,21 +421,13 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { defer cancel() targets := f.currentTargets(ctx) - - type perNode struct { - Node string `json:"node"` - OK bool `json:"ok"` - Error string `json:"error,omitempty"` - Data *pb.GetClusterOverviewResponse `json:"data,omitempty"` - } - - results := make([]perNode, len(targets)) + results := make([]perNodeResult, len(targets)) var wg sync.WaitGroup for i, addr := range targets { wg.Add(1) go func(i int, addr string) { defer wg.Done() - entry := perNode{Node: addr} + entry := perNodeResult{Node: addr} cli, err := f.clientFor(addr) if err != nil { entry.Error = err.Error() @@ -426,8 +443,14 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { results[i] = entry return } + data, mErr := marshalProto(resp) + if mErr != nil { + entry.Error = errors.Wrap(mErr, "marshal response").Error() + results[i] = entry + return + } entry.OK = true - entry.Data = resp + entry.Data = data results[i] = entry }(i, addr) } diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index dc814e75..37d86de4 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -369,3 +369,29 @@ func TestWriteJSONSuccessPath(t *testing.T) { t.Fatalf("body = %v", out) } } + +// TestHandleOverviewUsesProtojson asserts that admin responses preserve the +// proto3 JSON mapping (camelCase field names, zero-valued fields emitted) so +// the browser sees stable field names regardless 
of encoding/json's behavior. +func TestHandleOverviewUsesProtojson(t *testing.T) { + t.Parallel() + peer := &fakeAdminServer{members: []string{"m:1"}} + seedAddr := startFakeAdmin(t, peer) + + f := newFanout([]string{seedAddr}, "", time.Second, insecure.NewCredentials()) + defer f.Close() + + req := httptest.NewRequest(http.MethodGet, "/api/cluster/overview", nil) + rec := httptest.NewRecorder() + f.handleOverview(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("code = %d", rec.Code) + } + body := rec.Body.String() + // protojson uses camelCase by default; encoding/json would emit + // "grpc_address" (proto name). Catch the regression explicitly. + if !strings.Contains(body, "grpcAddress") { + t.Fatalf("response missing protojson camelCase field; body=%q", body) + } +} diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index 358b4ee9..a5d37bc4 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -217,6 +217,14 @@ Phase 3 persists compacted columns **distributed across the user Raft groups the This keeps the data-plane Raft-log overhead bounded by per-group load and fails independently when a single group is unavailable. +**Partial-availability UX.** Distributing persistence across the user Raft groups trades the default-group single-point-of-failure for per-group independence, but it also means a single unavailable group cannot serve that key range's history. The UI copes with that explicitly rather than silently showing gaps: + +- The fan-out reader collects a per-group `{groupID, ok, error, fromRange, toRange}` status array alongside the merged matrix. The admin binary returns `status=PARTIAL` on the HTTP response when any group failed and forwards the status array unchanged. 
+- Rows whose owning group is in `error` state are returned with `aggregate=true`, the constituent route list, and a `degraded=true` flag so the UI renders them hatched and labels them "historical data unavailable from group *N*" in the drawer. Live (in-memory) columns still flow for any node currently sampling, so the heatmap is never fully blank — only historical columns for the affected range degrade. +- `GetKeyVizMatrix` and `GetRouteDetail` continue to return `200 OK` with the partial body plus the status array, so automation does not see a 5xx during a transient partial outage. An explicit `allGroupsHealthy` boolean and a `degradedGroups[]` list let callers gate on strict health when they need to. +- Lineage lookups cache the last-known `(lineageID → group)` mapping in the admin binary for `--nodesRefreshInterval`, so a brief group flap does not drop the route from the heatmap entirely: the cached mapping is still used to annotate the row, and the fan-out reader retries on the next request. +- When a group is permanently lost, operators recover by either restoring the group (history reappears on the next request) or invoking an out-of-band `elastickv-admin reassign-lineage` flow (deferred to Phase 4) that moves the lineage metadata to a healthy group; the design here only guarantees that the UI stays useful during the outage, not that history is automatically relocated. + ### 5.7 Key preview labels Raw keys are binary. The UI needs a printable hint per bucket. Strategy: From 65c75b3a18295806af17dd0175a32a9d20eb82ec Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 23:35:34 +0900 Subject: [PATCH 14/30] fix(admin): decouple membership singleflight from caller ctx Gemini round-5 feedback on commit 1ffd21ab: singleflight.Do captures the first caller's ctx, so that caller canceling (browser tab closed, client timeout) aborts the shared refresh for every other waiter. 
- Switch to singleflight.DoChan and run the refresh on a detached background ctx bounded by membershipRefreshBudget (10s). - If the caller cancels before the refresh finishes, return the last cached membership (or seeds). The detached refresh continues and populates the cache for the next request. - Add regression test asserting a canceled first caller does not break later callers. --- cmd/elastickv-admin/main.go | 50 +++++++++++++++++++++++--------- cmd/elastickv-admin/main_test.go | 32 ++++++++++++++++++++ 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 62897ca5..8400b366 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -38,12 +38,17 @@ const ( defaultNodesRefreshInterval = 15 * time.Second defaultGRPCRequestTimeout = 10 * time.Second discoveryRPCTimeout = 2 * time.Second - readHeaderTimeout = 5 * time.Second - readTimeout = 30 * time.Second - writeTimeout = 30 * time.Second - idleTimeout = 120 * time.Second - shutdownTimeout = 5 * time.Second - maxRequestBodyBytes = 4 << 10 + // membershipRefreshBudget caps the detached background refresh so it + // cannot run forever even if every seed is slow. Sized for up to a few + // sequential discoveryRPCTimeout attempts before the singleflight + // collapses. + membershipRefreshBudget = 10 * time.Second + readHeaderTimeout = 5 * time.Second + readTimeout = 30 * time.Second + writeTimeout = 30 * time.Second + idleTimeout = 120 * time.Second + shutdownTimeout = 5 * time.Second + maxRequestBodyBytes = 4 << 10 ) var ( @@ -311,9 +316,12 @@ func (f *fanout) outgoingCtx(parent context.Context) context.Context { // membership cache is fresh it is returned directly; otherwise the admin binary // queries seeds via GetClusterOverview and caches the resulting member list // for refreshInterval. Concurrent refreshes are collapsed through singleflight -// so a burst of requests after cache expiry hits only one seed. 
On total -// failure it falls back to the static seed list so a single unreachable seed -// does not take the admin offline. +// so a burst of requests after cache expiry hits only one seed. The shared +// refresh runs on a detached background context bounded by +// membershipRefreshBudget so one caller canceling (e.g., browser tab close) +// does not abort the work for every other concurrent waiter. On total failure +// the admin binary falls back to the static seed list so a single unreachable +// seed does not take the admin offline. func (f *fanout) currentTargets(ctx context.Context) []string { f.mu.Lock() if f.members != nil && time.Since(f.members.fetchedAt) < f.refreshInterval { @@ -323,11 +331,27 @@ func (f *fanout) currentTargets(ctx context.Context) []string { } f.mu.Unlock() - result, _, _ := f.refreshGroup.Do("members", func() (any, error) { - return f.refreshMembership(ctx), nil + ch := f.refreshGroup.DoChan("members", func() (any, error) { + bgCtx, cancel := context.WithTimeout(context.Background(), membershipRefreshBudget) + defer cancel() + return f.refreshMembership(bgCtx), nil }) - addrs, _ := result.([]string) - return addrs + select { + case r := <-ch: + addrs, _ := r.Val.([]string) + return addrs + case <-ctx.Done(): + // Caller bailed. Give them whatever targets we can assemble without + // blocking: the last cached membership if we have one, else seeds. + // The detached refresh continues in the background and will populate + // the cache for the next request. + f.mu.Lock() + defer f.mu.Unlock() + if f.members != nil { + return append([]string(nil), f.members.addrs...) + } + return append([]string(nil), f.seeds...) + } } // refreshMembership performs the actual discovery RPC. 
It honours the caller's diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index 37d86de4..8f9f337f 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -370,6 +370,38 @@ func TestWriteJSONSuccessPath(t *testing.T) { } } +// TestFanoutRefreshSurvivesFirstCallerCancel asserts that canceling the first +// caller's context does not kill the shared singleflight refresh — subsequent +// callers should still see a populated membership. +func TestFanoutRefreshSurvivesFirstCallerCancel(t *testing.T) { + t.Parallel() + + peer := &fakeAdminServer{members: []string{"m:1"}} + seedAddr := startFakeAdmin(t, peer) + + f := newFanout([]string{seedAddr}, "", 50*time.Millisecond, insecure.NewCredentials()) + defer f.Close() + + // First caller cancels before the refresh completes. + cancelled, cancel := context.WithCancel(context.Background()) + cancel() + _ = f.currentTargets(cancelled) + + // A fresh caller a beat later must see the member list populated by the + // still-running background refresh rather than the raw seed list. + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + targets := f.currentTargets(ctx) + cancel() + if len(targets) == 2 { + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatalf("membership never populated; peer calls=%d", peer.calls.Load()) +} + // TestHandleOverviewUsesProtojson asserts that admin responses preserve the // proto3 JSON mapping (camelCase field names, zero-valued fields emitted) so // the browser sees stable field names regardless of encoding/json's behavior. 
From 4d16c8559aeabb4ec6f6afb4597e731e9a57f61f Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Wed, 22 Apr 2026 23:42:25 +0900 Subject: [PATCH 15/30] fix(admin): guard fanout clients map across Close Gemini round-6 flagged that Close() setting f.clients = nil can panic a concurrent clientFor (assigning to a nil map) during shutdown. - Add a closed flag guarded by the fanout mutex; Close replaces the map with an empty one rather than nil so any iterator is safe. - clientFor returns errFanoutClosed when called post-Close; invalidateClient becomes a no-op. - Close is idempotent. - Regression test under the race detector covers clientFor / invalidateClient / double-Close after shutdown. --- cmd/elastickv-admin/main.go | 22 +++++++++++++++++++++- cmd/elastickv-admin/main_test.go | 15 +++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 8400b366..b83f40ad 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -236,6 +236,7 @@ type fanout struct { mu sync.Mutex clients map[string]*nodeClient members *membership + closed bool // refreshGroup deduplicates concurrent membership refresh RPCs so a burst // of browser requests immediately after cache expiry collapses into a @@ -243,6 +244,11 @@ type fanout struct { refreshGroup singleflight.Group } +// errFanoutClosed is returned by clientFor when Close has already run, so +// callers can treat it as a graceful shutdown signal instead of bubbling up as +// a generic map-panic. 
+var errFanoutClosed = errors.New("admin fanout is closed") + func newFanout( seeds []string, token string, @@ -267,17 +273,27 @@ func newFanout( func (f *fanout) Close() { f.mu.Lock() defer f.mu.Unlock() + if f.closed { + return + } + f.closed = true for _, c := range f.clients { if err := c.conn.Close(); err != nil { log.Printf("elastickv-admin: close gRPC connection to %s: %v", c.addr, err) } } - f.clients = nil + // Replace with an empty map rather than nil so the remaining + // closed-guarded accessors can still iterate or lookup without panicking + // while still releasing the client references for GC. + f.clients = map[string]*nodeClient{} } func (f *fanout) clientFor(addr string) (*nodeClient, error) { f.mu.Lock() defer f.mu.Unlock() + if f.closed { + return nil, errFanoutClosed + } if c, ok := f.clients[addr]; ok { return c, nil } @@ -294,6 +310,10 @@ func (f *fanout) clientFor(addr string) (*nodeClient, error) { // Unavailable so the next request re-dials or skips the removed node. func (f *fanout) invalidateClient(addr string) { f.mu.Lock() + if f.closed { + f.mu.Unlock() + return + } c, ok := f.clients[addr] delete(f.clients, addr) f.members = nil diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index 8f9f337f..39098003 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -370,6 +370,21 @@ func TestWriteJSONSuccessPath(t *testing.T) { } } +// TestFanoutClientForAfterCloseIsSafe asserts that clientFor and +// invalidateClient do not panic when invoked concurrently with Close — a +// shutdown-time race that otherwise hits a nil-map write in clientFor. 
+func TestFanoutClientForAfterCloseIsSafe(t *testing.T) { + t.Parallel() + f := newFanout([]string{"127.0.0.1:1"}, "", time.Second, insecure.NewCredentials()) + f.Close() + + if _, err := f.clientFor("127.0.0.1:2"); err == nil { + t.Fatal("expected error after Close, got nil") + } + f.invalidateClient("127.0.0.1:2") // must be a no-op, not panic + f.Close() // idempotent +} + // TestFanoutRefreshSurvivesFirstCallerCancel asserts that canceling the first // caller's context does not kill the shared singleflight refresh — subsequent // callers should still see a populated membership. From 52ae3e27db251e6a4f26043d8a8ff6bd9b98b1a9 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 00:44:27 +0900 Subject: [PATCH 16/30] fix(admin): bound admin binary resources Gemini round-7 feedback on commit 4d16c855 flagged three unbounded external-input sinks: - loadToken: os.Stat before os.ReadFile and reject files over maxTokenFileBytes (4 KiB) so a misconfigured path (like a log file) cannot force huge allocations during startup. - membersFrom: cap the discovered-node list at maxDiscoveredNodes (512) so a malicious or misconfigured peer cannot inflate the fan-out into goroutine/fd exhaustion. Log a warning when truncated. - fanout.clientFor: bound the client cache at maxCachedClients (256); evict a non-seed entry on overflow and close its connection so high-churn clusters do not leak file descriptors. Regression tests cover oversized-token rejection, member-list cap, and client-cache cap. 
--- cmd/elastickv-admin/main.go | 56 ++++++++++++++++++++++++++- cmd/elastickv-admin/main_test.go | 65 ++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 1 deletion(-) diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index b83f40ad..6ab6575f 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -10,6 +10,7 @@ import ( "crypto/x509" "encoding/json" "flag" + "fmt" "log" "net/http" "os" @@ -49,6 +50,19 @@ const ( idleTimeout = 120 * time.Second shutdownTimeout = 5 * time.Second maxRequestBodyBytes = 4 << 10 + // maxTokenFileBytes caps the admin-token file so a misconfigured path + // pointing at a huge file (for example a log) cannot force the admin + // process to allocate arbitrary memory before the bearer-token check. + maxTokenFileBytes = 4 << 10 + // maxCachedClients caps the fanout's cached gRPC connections so a cluster + // with high node churn or a malicious discovery response cannot leak file + // descriptors indefinitely. Sized to cover tested cluster sizes while + // staying well below typical ulimits. + maxCachedClients = 256 + // maxDiscoveredNodes bounds the member list returned by a peer's + // GetClusterOverview so a malicious or misconfigured node cannot force + // the admin binary to spawn unbounded goroutines / gRPC calls. + maxDiscoveredNodes = 512 ) var ( @@ -167,6 +181,15 @@ func loadToken(path string, insecureMode bool) (string, error) { if err != nil { return "", errors.Wrap(err, "resolve token path") } + info, err := os.Stat(abs) + if err != nil { + return "", errors.Wrap(err, "stat token file") + } + if info.Size() > maxTokenFileBytes { + return "", fmt.Errorf("token file %s is %d bytes; maximum is %d — refusing to load", + abs, info.Size(), maxTokenFileBytes) + } + // Size is bounded above, so materializing the file is safe. 
b, err := os.ReadFile(abs) if err != nil { return "", errors.Wrap(err, "read token file") @@ -297,6 +320,27 @@ func (f *fanout) clientFor(addr string) (*nodeClient, error) { if c, ok := f.clients[addr]; ok { return c, nil } + // Bound the cache so a high-churn cluster or a stream of hostile + // discovery responses cannot leak file descriptors. Evict any one entry + // (map iteration is randomized) to make room; the evicted target will be + // re-dialed on demand if it comes back. Never evict an address that is in + // the active seeds list. + if len(f.clients) >= maxCachedClients { + seeds := map[string]struct{}{} + for _, s := range f.seeds { + seeds[s] = struct{}{} + } + for victim, vc := range f.clients { + if _, keep := seeds[victim]; keep { + continue + } + delete(f.clients, victim) + if err := vc.conn.Close(); err != nil { + log.Printf("elastickv-admin: evict %s: close: %v", victim, err) + } + break + } + } conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(f.creds)) if err != nil { return nil, errors.Wrapf(err, "dial %s", addr) @@ -407,10 +451,13 @@ func (f *fanout) refreshMembership(ctx context.Context) []string { // membersFrom extracts a deduplicated address list from a cluster overview // response, always including the node that answered so the answering seed is -// still queried even if it omits itself from members. +// still queried even if it omits itself from members. The result is capped at +// maxDiscoveredNodes so a malicious or misconfigured peer cannot inflate the +// fan-out. 
func membersFrom(seed string, resp *pb.GetClusterOverviewResponse) []string { seen := map[string]struct{}{} out := make([]string, 0, len(resp.GetMembers())+1) + truncated := false add := func(addr string) { addr = strings.TrimSpace(addr) if addr == "" { @@ -419,6 +466,10 @@ func membersFrom(seed string, resp *pb.GetClusterOverviewResponse) []string { if _, dup := seen[addr]; dup { return } + if len(out) >= maxDiscoveredNodes { + truncated = true + return + } seen[addr] = struct{}{} out = append(out, addr) } @@ -429,6 +480,9 @@ func membersFrom(seed string, resp *pb.GetClusterOverviewResponse) []string { for _, m := range resp.GetMembers() { add(m.GetGrpcAddress()) } + if truncated { + log.Printf("elastickv-admin: discovery response exceeded %d nodes; truncating (peer=%s)", maxDiscoveredNodes, seed) + } return out } diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index 39098003..3549a0fd 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -156,6 +156,71 @@ func writePEMCert(t *testing.T) []byte { return pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der}) } +func TestLoadTokenRejectsOversizedFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "huge") + // One byte past the cap: exact boundary plus one. + payload := strings.Repeat("x", maxTokenFileBytes+1) + if err := os.WriteFile(path, []byte(payload), 0o600); err != nil { + t.Fatal(err) + } + _, err := loadToken(path, false) + if err == nil || !strings.Contains(err.Error(), "maximum is") { + t.Fatalf("expected size-cap error, got %v", err) + } +} + +func TestMembersFromCapsAtMaxDiscoveredNodes(t *testing.T) { + t.Parallel() + resp := &pb.GetClusterOverviewResponse{ + Self: &pb.NodeIdentity{GrpcAddress: "self:1"}, + } + // Return way more members than the cap allows. 
+ for i := 0; i < maxDiscoveredNodes+50; i++ { + resp.Members = append(resp.Members, &pb.NodeIdentity{ + GrpcAddress: "node-" + strconvItoa(i) + ":1", + }) + } + got := membersFrom("seed:1", resp) + if len(got) != maxDiscoveredNodes { + t.Fatalf("len = %d, want %d (cap)", len(got), maxDiscoveredNodes) + } +} + +// small helper to avoid pulling strconv into the test file just for one call. +func strconvItoa(i int) string { + if i == 0 { + return "0" + } + var digits []byte + for i > 0 { + digits = append([]byte{byte('0' + i%10)}, digits...) + i /= 10 + } + return string(digits) +} + +func TestFanoutClientCacheEvictsWhenFull(t *testing.T) { + t.Parallel() + f := newFanout([]string{"seed:1"}, "", time.Second, insecure.NewCredentials()) + defer f.Close() + + // Fill the cache past the cap. New dials should not error out and the + // map must stay bounded. + for i := 0; i < maxCachedClients+5; i++ { + if _, err := f.clientFor("node-" + strconvItoa(i) + ":1"); err != nil { + t.Fatalf("clientFor[%d]: %v", i, err) + } + } + f.mu.Lock() + size := len(f.clients) + f.mu.Unlock() + if size > maxCachedClients { + t.Fatalf("cache size = %d, exceeds cap %d", size, maxCachedClients) + } +} + func TestMembersFromDeduplicatesAndIncludesSeed(t *testing.T) { t.Parallel() resp := &pb.GetClusterOverviewResponse{ From a3e0dc3eed71c4e9e27c2453e257d423d2c232d8 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 03:40:59 +0900 Subject: [PATCH 17/30] feat(admin),fix(admin): wire AdminServer into main and split ctx budgets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review on commit 68d78421 flagged two issues: P1 — AdminServer was never registered on the node's gRPC listeners, so /api/cluster/overview against real clusters would return Unimplemented. Wire it in: - main.go: add --adminTokenFile and --adminInsecureNoAuth flags (mutually exclusive; neither set leaves the service disabled). 
- configureAdminService builds an adapter.AdminServer + optional bearer-token interceptors; RegisterGroup is called for each raft runtime before startup; insecure mode logs a warning. - startRaftServers now appends the admin interceptors to the grpc.ServerOption chain and registers pb.RegisterAdminServer on each per-group listener when enabled. - Tests cover default-disabled, mutual exclusion, token-file happy path, insecure-no-auth, and the 4 KiB token-size cap. P2 — cmd/elastickv-admin's /api/cluster/overview used one 10s ctx for both membership discovery (up to membershipRefreshBudget) and per-node fan-out, so slow seeds could starve the fan-out. Split into discoveryWaitBudget (3s) for the singleflight wait and a fresh defaultGRPCRequestTimeout (10s) for the per-node RPCs derived from r.Context() directly. --- cmd/elastickv-admin/main.go | 15 ++- internal/raftengine/etcd/fsm_snapshot_file.go | 7 +- main.go | 117 +++++++++++++++++- main_admin_test.go | 79 ++++++++++++ 4 files changed, 212 insertions(+), 6 deletions(-) create mode 100644 main_admin_test.go diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 6ab6575f..1beffbd6 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -39,6 +39,11 @@ const ( defaultNodesRefreshInterval = 15 * time.Second defaultGRPCRequestTimeout = 10 * time.Second discoveryRPCTimeout = 2 * time.Second + // discoveryWaitBudget is how long a request handler is willing to wait + // for the singleflight membership refresh before falling back to the + // cached (or static seed) list. Kept well below defaultGRPCRequestTimeout + // so a slow discovery cannot starve the subsequent per-node fan-out. + discoveryWaitBudget = 3 * time.Second // membershipRefreshBudget caps the detached background refresh so it // cannot run forever even if every seed is slow. 
Sized for up to a few // sequential discoveryRPCTimeout attempts before the singleflight @@ -515,10 +520,16 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { return } r.Body = http.MaxBytesReader(w, r.Body, maxRequestBodyBytes) + + // Split the discovery and per-node fan-out budgets. Reusing one ctx for + // both lets a slow membership refresh consume the entire deadline and + // leave the fan-out with an already-canceled context, so separate them. + discoveryCtx, discoveryCancel := context.WithTimeout(r.Context(), discoveryWaitBudget) + targets := f.currentTargets(discoveryCtx) + discoveryCancel() + ctx, cancel := context.WithTimeout(r.Context(), defaultGRPCRequestTimeout) defer cancel() - - targets := f.currentTargets(ctx) results := make([]perNodeResult, len(targets)) var wg sync.WaitGroup for i, addr := range targets { diff --git a/internal/raftengine/etcd/fsm_snapshot_file.go b/internal/raftengine/etcd/fsm_snapshot_file.go index 8e0e498f..9ed5825a 100644 --- a/internal/raftengine/etcd/fsm_snapshot_file.go +++ b/internal/raftengine/etcd/fsm_snapshot_file.go @@ -19,6 +19,7 @@ import ( const ( fsmSnapDirName = "fsm-snap" + snapFileExt = ".snap" snapshotTokenSize = 17 // 4 (magic) + 1 (version) + 8 (index) + 4 (crc32c) snapshotTokenVersion = byte(0x01) @@ -135,7 +136,7 @@ func fsmSnapPath(fsmSnapDir string, index uint64) string { // Snap files are named "{term:016x}-{index:016x}.snap". // Returns 0 on parse failure. 
func parseSnapFileIndex(name string) uint64 { - base := strings.TrimSuffix(name, ".snap") + base := strings.TrimSuffix(name, snapFileExt) idx := strings.LastIndex(base, "-") if idx < 0 { return 0 @@ -554,7 +555,7 @@ func collectLiveSnapIndexes(snapDir string) (map[uint64]bool, error) { } liveIndexes := make(map[uint64]bool, len(snapEntries)) for _, e := range snapEntries { - if !e.IsDir() && filepath.Ext(e.Name()) == ".snap" { + if !e.IsDir() && filepath.Ext(e.Name()) == snapFileExt { if idx := parseSnapFileIndex(e.Name()); idx > 0 { liveIndexes[idx] = true } @@ -644,7 +645,7 @@ func purgeOldSnapshotFiles(snapDir, fsmSnapDir string) error { func collectSnapNames(entries []os.DirEntry) []string { var snaps []string for _, e := range entries { - if !e.IsDir() && filepath.Ext(e.Name()) == ".snap" { + if !e.IsDir() && filepath.Ext(e.Name()) == snapFileExt { snaps = append(snaps, e.Name()) } } diff --git a/main.go b/main.go index d0d81970..3947ef92 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "context" "flag" + "fmt" "log" "net" "net/http" @@ -107,8 +108,12 @@ var ( raftRedisMap = flag.String("raftRedisMap", "", "Map of Raft address to Redis address (raftAddr=redisAddr,...)") raftS3Map = flag.String("raftS3Map", "", "Map of Raft address to S3 address (raftAddr=s3Addr,...)") raftDynamoMap = flag.String("raftDynamoMap", "", "Map of Raft address to DynamoDB address (raftAddr=dynamoAddr,...)") + adminTokenFile = flag.String("adminTokenFile", "", "Path to a file containing the read-only bearer token required on the Admin gRPC service (leave blank with --adminInsecureNoAuth off to disable the Admin service)") + adminInsecureNoAuth = flag.Bool("adminInsecureNoAuth", false, "Register the Admin gRPC service without bearer-token authentication; development only") ) +const adminTokenMaxBytes = 4 << 10 + func main() { flag.Parse() @@ -197,6 +202,11 @@ func run() error { return nil }) + adminServer, adminGRPCOpts, err := setupAdminService(*raftId, 
*myAddr, runtimes) + if err != nil { + return err + } + runner := runtimeServerRunner{ ctx: runCtx, lc: &lc, @@ -206,6 +216,8 @@ func run() error { shardStore: shardStore, coordinate: coordinate, distServer: distServer, + adminServer: adminServer, + adminGRPCOpts: adminGRPCOpts, redisAddress: *redisAddr, leaderRedis: cfg.leaderRedis, pubsubRelay: adapter.NewRedisPubSubRelay(), @@ -493,6 +505,98 @@ func dispatchMonitorSources(runtimes []*raftGroupRuntime) []monitoring.DispatchS return out } +// setupAdminService is a thin wrapper around configureAdminService that also +// binds each Raft runtime to the server and logs an operator warning when +// running without authentication. Keeping this out of run() preserves run's +// cyclomatic-complexity budget. +func setupAdminService( + nodeID, grpcAddress string, + runtimes []*raftGroupRuntime, +) (*adapter.AdminServer, []grpc.ServerOption, error) { + srv, opts, err := configureAdminService( + *adminTokenFile, + *adminInsecureNoAuth, + adapter.NodeIdentity{NodeID: nodeID, GRPCAddress: grpcAddress}, + ) + if err != nil { + return nil, nil, err + } + if srv == nil { + return nil, nil, nil + } + for _, rt := range runtimes { + srv.RegisterGroup(rt.spec.id, rt.engine) + } + if *adminInsecureNoAuth { + log.Printf("WARNING: --adminInsecureNoAuth is set; Admin gRPC service exposed without authentication") + } + return srv, opts, nil +} + +// configureAdminService builds the node-side AdminServer plus the gRPC +// interceptor options that enforce its bearer token, or returns (nil, nil, +// nil) when the service is intentionally disabled. It is mutually exclusive +// with --adminInsecureNoAuth so operators have to opt into the unauthenticated +// mode explicitly. 
+func configureAdminService( + tokenPath string, + insecureNoAuth bool, + self adapter.NodeIdentity, +) (*adapter.AdminServer, []grpc.ServerOption, error) { + if tokenPath == "" && !insecureNoAuth { + return nil, nil, nil + } + if tokenPath != "" && insecureNoAuth { + return nil, nil, errors.New("--adminInsecureNoAuth and --adminTokenFile are mutually exclusive") + } + token := "" + if tokenPath != "" { + loaded, err := loadAdminTokenFile(tokenPath) + if err != nil { + return nil, nil, err + } + token = loaded + } + srv := adapter.NewAdminServer(self, nil) + unary, stream := adapter.AdminTokenAuth(token) + var opts []grpc.ServerOption + if unary != nil { + opts = append(opts, grpc.ChainUnaryInterceptor(unary)) + } + if stream != nil { + opts = append(opts, grpc.ChainStreamInterceptor(stream)) + } + return srv, opts, nil +} + +// loadAdminTokenFile materialises --adminTokenFile with a strict upper bound +// so a misconfigured path (for example a log file) cannot force an arbitrary +// allocation before the bearer-token check. +func loadAdminTokenFile(path string) (string, error) { + abs, err := filepath.Abs(path) + if err != nil { + return "", errors.Wrap(err, "resolve admin token path") + } + info, err := os.Stat(abs) + if err != nil { + return "", errors.Wrap(err, "stat admin token file") + } + if info.Size() > adminTokenMaxBytes { + return "", fmt.Errorf( + "admin token file %s is %d bytes; maximum is %d", + abs, info.Size(), adminTokenMaxBytes) + } + b, err := os.ReadFile(abs) + if err != nil { + return "", errors.Wrap(err, "read admin token file") + } + token := strings.TrimSpace(string(b)) + if token == "" { + return "", errors.New("admin token file is empty") + } + return token, nil +} + // startMonitoringCollectors wires up the per-tick Prometheus // collectors (raft dispatch, Pebble LSM, store-layer OCC conflicts) // on top of the running raft runtimes. 
Kept separate from run() so @@ -561,15 +665,22 @@ func startRaftServers( distServer *adapter.DistributionServer, relay *adapter.RedisPubSubRelay, proposalObserverForGroup func(uint64) kv.ProposalObserver, + adminServer *adapter.AdminServer, + adminGRPCOpts []grpc.ServerOption, ) error { for _, rt := range runtimes { - gs := grpc.NewServer(internalutil.GRPCServerOptions()...) + opts := append([]grpc.ServerOption(nil), internalutil.GRPCServerOptions()...) + opts = append(opts, adminGRPCOpts...) + gs := grpc.NewServer(opts...) trx := kv.NewTransactionWithProposer(rt.engine, kv.WithProposalObserver(observerForGroup(proposalObserverForGroup, rt.spec.id))) grpcSvc := adapter.NewGRPCServer(shardStore, coordinate) pb.RegisterRawKVServer(gs, grpcSvc) pb.RegisterTransactionalKVServer(gs, grpcSvc) pb.RegisterInternalServer(gs, adapter.NewInternalWithEngine(trx, rt.engine, coordinate.Clock(), relay)) pb.RegisterDistributionServer(gs, distServer) + if adminServer != nil { + pb.RegisterAdminServer(gs, adminServer) + } rt.registerGRPC(gs) internalraftadmin.RegisterOperationalServices(ctx, gs, rt.engine, []string{"RawKV"}) reflection.Register(gs) @@ -790,6 +901,8 @@ type runtimeServerRunner struct { shardStore *kv.ShardStore coordinate kv.Coordinator distServer *adapter.DistributionServer + adminServer *adapter.AdminServer + adminGRPCOpts []grpc.ServerOption redisAddress string leaderRedis map[raft.ServerAddress]string pubsubRelay *adapter.RedisPubSubRelay @@ -824,6 +937,8 @@ func (r runtimeServerRunner) start() error { func(groupID uint64) kv.ProposalObserver { return r.metricsRegistry.RaftProposalObserver(groupID) }, + r.adminServer, + r.adminGRPCOpts, ); err != nil { return waitErrgroupAfterStartupFailure(r.cancel, r.eg, err) } diff --git a/main_admin_test.go b/main_admin_test.go new file mode 100644 index 00000000..1b458061 --- /dev/null +++ b/main_admin_test.go @@ -0,0 +1,79 @@ +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" + + 
"github.com/bootjp/elastickv/adapter" +) + +func TestConfigureAdminServiceDisabledByDefault(t *testing.T) { + t.Parallel() + srv, opts, err := configureAdminService("", false, adapter.NodeIdentity{NodeID: "n1"}) + if err != nil { + t.Fatalf("disabled-by-default should not error: %v", err) + } + if srv != nil || opts != nil { + t.Fatalf("disabled service should return nil, nil; got %v %v", srv, opts) + } +} + +func TestConfigureAdminServiceRejectsMutualExclusion(t *testing.T) { + t.Parallel() + dir := t.TempDir() + tokPath := filepath.Join(dir, "t") + if err := os.WriteFile(tokPath, []byte("x"), 0o600); err != nil { + t.Fatal(err) + } + if _, _, err := configureAdminService(tokPath, true, adapter.NodeIdentity{}); err == nil { + t.Fatal("expected mutual-exclusion error") + } +} + +func TestConfigureAdminServiceTokenFile(t *testing.T) { + t.Parallel() + dir := t.TempDir() + tokPath := filepath.Join(dir, "t") + if err := os.WriteFile(tokPath, []byte("hunter2\n"), 0o600); err != nil { + t.Fatal(err) + } + srv, opts, err := configureAdminService(tokPath, false, adapter.NodeIdentity{NodeID: "n1"}) + if err != nil { + t.Fatalf("configureAdminService: %v", err) + } + if srv == nil { + t.Fatal("expected an AdminServer instance") + } + // Expect a unary + stream interceptor for the admin-token gate. 
+ if len(opts) != 2 { + t.Fatalf("expected 2 grpc.ServerOption (unary + stream), got %d", len(opts)) + } +} + +func TestConfigureAdminServiceInsecureNoAuth(t *testing.T) { + t.Parallel() + srv, opts, err := configureAdminService("", true, adapter.NodeIdentity{NodeID: "n1"}) + if err != nil { + t.Fatalf("insecure mode should succeed: %v", err) + } + if srv == nil { + t.Fatal("expected AdminServer in insecure mode") + } + if len(opts) != 0 { + t.Fatalf("insecure mode should not attach interceptors, got %d", len(opts)) + } +} + +func TestLoadAdminTokenFileRejectsOversize(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "huge") + if err := os.WriteFile(path, []byte(strings.Repeat("x", adminTokenMaxBytes+1)), 0o600); err != nil { + t.Fatal(err) + } + if _, err := loadAdminTokenFile(path); err == nil || !strings.Contains(err.Error(), "maximum is") { + t.Fatalf("expected size-cap error, got %v", err) + } +} From 4a3a49f7e82f46102dc07b17b8cc5011b5711f74 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 03:50:06 +0900 Subject: [PATCH 18/30] fix(admin): plaintext default + populate admin members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-2 review flagged two P1 issues on commit a3e0dc3e: 1. Admin binary dialed TLS by default, but Phase 0 nodes only serve plaintext — every default invocation failed during handshake. Flip loadTransportCredentials so plaintext is the default when neither --nodeTLSCACertFile nor --nodeTLSInsecureSkipVerify is set; TLS is now explicit opt-in. Removed the now-redundant --nodeTLSPlaintext flag. --nodeTLSServerName is an error without a TLS flag. 2. main.go constructed the node's AdminServer with members=nil, so the admin binary's discovery cache collapsed fan-out to whichever seed responded. 
Populate members from bootstrapServers via adminMembersFromBootstrap (excludes self) so GetClusterOverview returns the real peer list and the admin binary fans out to the full cluster. Tests: updated TLS precedence to reflect the new default, added coverage for adminMembersFromBootstrap (self exclusion, empty bootstrap, single-node case), and refreshed configureAdminService signatures through the members parameter. --- cmd/elastickv-admin/main.go | 36 ++++++++++++++++------------- cmd/elastickv-admin/main_test.go | 36 ++++++++++++++++++----------- main.go | 34 +++++++++++++++++++++++++--- main_admin_test.go | 39 ++++++++++++++++++++++++++++---- 4 files changed, 109 insertions(+), 36 deletions(-) diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 1beffbd6..f7925e0d 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -76,10 +76,13 @@ var ( nodeTokenFile = flag.String("nodeTokenFile", "", "File containing the bearer token sent to nodes' Admin service") nodesRefreshInterval = flag.Duration("nodesRefreshInterval", defaultNodesRefreshInterval, "Duration to cache cluster membership before re-fetching") insecureNoAuth = flag.Bool("adminInsecureNoAuth", false, "Skip bearer token authentication; development only") - nodeTLSCACertFile = flag.String("nodeTLSCACertFile", "", "PEM file with CA certificates used to verify nodes' gRPC TLS; enables TLS when set") - nodeTLSServerName = flag.String("nodeTLSServerName", "", "Expected TLS server name when connecting to nodes (overrides the address host)") - nodeTLSSkipVerify = flag.Bool("nodeTLSInsecureSkipVerify", false, "Skip TLS certificate verification; development only") - nodeTLSPlaintext = flag.Bool("nodeTLSPlaintext", false, "Skip TLS entirely and dial nodes with plaintext credentials; development only") + // Node gRPC is plaintext in Phase 0, so the admin binary defaults to + // plaintext too. 
TLS is opt-in: set --nodeTLSCACertFile (preferred) or + // --nodeTLSInsecureSkipVerify to switch to TLS. When the cluster turns + // on TLS, operators flip the flag without code changes. + nodeTLSCACertFile = flag.String("nodeTLSCACertFile", "", "PEM file with CA certificates used to verify nodes' gRPC TLS; setting this flag enables TLS dialing") + nodeTLSServerName = flag.String("nodeTLSServerName", "", "Expected TLS server name when connecting to nodes (overrides the address host); only honoured when TLS is enabled") + nodeTLSSkipVerify = flag.Bool("nodeTLSInsecureSkipVerify", false, "Dial nodes with TLS but skip certificate verification; development only. Implies TLS.") ) func main() { @@ -100,7 +103,7 @@ func run() error { return err } - creds, err := loadTransportCredentials(*nodeTLSPlaintext, *nodeTLSCACertFile, *nodeTLSServerName, *nodeTLSSkipVerify) + creds, err := loadTransportCredentials(*nodeTLSCACertFile, *nodeTLSServerName, *nodeTLSSkipVerify) if err != nil { return err } @@ -207,30 +210,31 @@ func loadToken(path string, insecureMode bool) (string, error) { } // loadTransportCredentials builds the gRPC TransportCredentials used to dial -// nodes. Precedence: --nodeTLSPlaintext (dev-only plaintext) → mutually -// exclusive with the TLS flags → otherwise TLS with the system trust roots by -// default, optionally overridden by --nodeTLSCACertFile and -// --nodeTLSInsecureSkipVerify. +// nodes. Phase 0 nodes expose a plaintext gRPC server, so the default is +// insecure credentials — if neither --nodeTLSCACertFile nor +// --nodeTLSInsecureSkipVerify is set, the admin binary dials plaintext. +// Passing either flag opts into TLS; --nodeTLSServerName is honoured only +// alongside a TLS opt-in. 
func loadTransportCredentials( - plaintext bool, caFile, serverName string, skipVerify bool, ) (credentials.TransportCredentials, error) { - if plaintext { - if caFile != "" || serverName != "" || skipVerify { - return nil, errors.New("--nodeTLSPlaintext is mutually exclusive with other TLS flags") + tlsRequested := caFile != "" || skipVerify + if !tlsRequested { + if serverName != "" { + return nil, errors.New("--nodeTLSServerName requires TLS; set --nodeTLSCACertFile or --nodeTLSInsecureSkipVerify") } return insecure.NewCredentials(), nil } + if caFile != "" && skipVerify { + return nil, errors.New("--nodeTLSCACertFile and --nodeTLSInsecureSkipVerify are mutually exclusive") + } cfg := &tls.Config{ MinVersion: tls.VersionTLS12, ServerName: serverName, InsecureSkipVerify: skipVerify, //nolint:gosec // gated behind --nodeTLSInsecureSkipVerify; dev-only. } if caFile != "" { - if skipVerify { - return nil, errors.New("--nodeTLSCACertFile and --nodeTLSInsecureSkipVerify are mutually exclusive") - } pem, err := os.ReadFile(caFile) if err != nil { return nil, errors.Wrap(err, "read node TLS CA file") diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index 3549a0fd..3d9c2112 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -97,40 +97,50 @@ func TestLoadTokenRejectsInsecureWithFile(t *testing.T) { } } -func TestLoadTransportCredentialsPrecedence(t *testing.T) { +func TestLoadTransportCredentialsPlaintextDefault(t *testing.T) { t.Parallel() - - if _, err := loadTransportCredentials(true, "", "", false); err != nil { - t.Fatalf("plaintext alone should succeed: %v", err) - } - if _, err := loadTransportCredentials(true, "/tmp/ca.pem", "", false); err == nil { - t.Fatal("plaintext + CA file should error") + if _, err := loadTransportCredentials("", "", false); err != nil { + t.Fatalf("no-flags default should succeed: %v", err) } - if _, err := loadTransportCredentials(true, "", "", true); err == nil { - 
t.Fatal("plaintext + skip-verify should error") + if _, err := loadTransportCredentials("", "node-1", false); err == nil { + t.Fatal("serverName without TLS opt-in should error") } +} +func TestLoadTransportCredentialsTLS(t *testing.T) { + t.Parallel() dir := t.TempDir() ca := filepath.Join(dir, "ca.pem") if err := os.WriteFile(ca, writePEMCert(t), 0o600); err != nil { t.Fatal(err) } - if _, err := loadTransportCredentials(false, ca, "", true); err == nil { - t.Fatal("CA file + skip-verify should error") + if _, err := loadTransportCredentials(ca, "", true); err == nil { + t.Fatal("CA file + skip-verify should error (mutually exclusive)") } - creds, err := loadTransportCredentials(false, ca, "node-1", false) + creds, err := loadTransportCredentials(ca, "node-1", false) if err != nil { t.Fatalf("valid CA config failed: %v", err) } if creds == nil { t.Fatal("expected TLS creds") } + creds, err = loadTransportCredentials("", "", true) + if err != nil { + t.Fatalf("skip-verify alone should succeed: %v", err) + } + if creds == nil { + t.Fatal("expected TLS creds for skip-verify") + } +} +func TestLoadTransportCredentialsRejectsBadCA(t *testing.T) { + t.Parallel() + dir := t.TempDir() bad := filepath.Join(dir, "bad.pem") if err := os.WriteFile(bad, []byte("not a cert"), 0o600); err != nil { t.Fatal(err) } - if _, err := loadTransportCredentials(false, bad, "", false); err == nil { + if _, err := loadTransportCredentials(bad, "", false); err == nil { t.Fatal("expected error for unparseable CA file") } } diff --git a/main.go b/main.go index 3947ef92..a88d6668 100644 --- a/main.go +++ b/main.go @@ -202,7 +202,7 @@ func run() error { return nil }) - adminServer, adminGRPCOpts, err := setupAdminService(*raftId, *myAddr, runtimes) + adminServer, adminGRPCOpts, err := setupAdminService(*raftId, *myAddr, runtimes, bootstrapServers) if err != nil { return err } @@ -508,15 +508,20 @@ func dispatchMonitorSources(runtimes []*raftGroupRuntime) []monitoring.DispatchS // 
setupAdminService is a thin wrapper around configureAdminService that also // binds each Raft runtime to the server and logs an operator warning when // running without authentication. Keeping this out of run() preserves run's -// cyclomatic-complexity budget. +// cyclomatic-complexity budget. Members are seeded from the bootstrap +// configuration so GetClusterOverview advertises peer node addresses to the +// admin binary's fan-out discovery path. func setupAdminService( nodeID, grpcAddress string, runtimes []*raftGroupRuntime, + bootstrapServers []raft.Server, ) (*adapter.AdminServer, []grpc.ServerOption, error) { + members := adminMembersFromBootstrap(nodeID, bootstrapServers) srv, opts, err := configureAdminService( *adminTokenFile, *adminInsecureNoAuth, adapter.NodeIdentity{NodeID: nodeID, GRPCAddress: grpcAddress}, + members, ) if err != nil { return nil, nil, err @@ -533,6 +538,28 @@ func setupAdminService( return srv, opts, nil } +// adminMembersFromBootstrap extracts the peer list (everyone except self) from +// the Raft bootstrap configuration so GetClusterOverview returns a populated +// members list. Without this the admin binary's membersFrom cache collapses to +// only the responding seed and stops fanning out across the cluster. +func adminMembersFromBootstrap(selfID string, servers []raft.Server) []adapter.NodeIdentity { + if len(servers) == 0 { + return nil + } + out := make([]adapter.NodeIdentity, 0, len(servers)) + for _, s := range servers { + id := string(s.ID) + if id == selfID { + continue + } + out = append(out, adapter.NodeIdentity{ + NodeID: id, + GRPCAddress: string(s.Address), + }) + } + return out +} + // configureAdminService builds the node-side AdminServer plus the gRPC // interceptor options that enforce its bearer token, or returns (nil, nil, // nil) when the service is intentionally disabled. 
It is mutually exclusive @@ -542,6 +569,7 @@ func configureAdminService( tokenPath string, insecureNoAuth bool, self adapter.NodeIdentity, + members []adapter.NodeIdentity, ) (*adapter.AdminServer, []grpc.ServerOption, error) { if tokenPath == "" && !insecureNoAuth { return nil, nil, nil @@ -557,7 +585,7 @@ func configureAdminService( } token = loaded } - srv := adapter.NewAdminServer(self, nil) + srv := adapter.NewAdminServer(self, members) unary, stream := adapter.AdminTokenAuth(token) var opts []grpc.ServerOption if unary != nil { diff --git a/main_admin_test.go b/main_admin_test.go index 1b458061..bb9a3ee3 100644 --- a/main_admin_test.go +++ b/main_admin_test.go @@ -7,11 +7,12 @@ import ( "testing" "github.com/bootjp/elastickv/adapter" + "github.com/hashicorp/raft" ) func TestConfigureAdminServiceDisabledByDefault(t *testing.T) { t.Parallel() - srv, opts, err := configureAdminService("", false, adapter.NodeIdentity{NodeID: "n1"}) + srv, opts, err := configureAdminService("", false, adapter.NodeIdentity{NodeID: "n1"}, nil) if err != nil { t.Fatalf("disabled-by-default should not error: %v", err) } @@ -27,7 +28,7 @@ func TestConfigureAdminServiceRejectsMutualExclusion(t *testing.T) { if err := os.WriteFile(tokPath, []byte("x"), 0o600); err != nil { t.Fatal(err) } - if _, _, err := configureAdminService(tokPath, true, adapter.NodeIdentity{}); err == nil { + if _, _, err := configureAdminService(tokPath, true, adapter.NodeIdentity{}, nil); err == nil { t.Fatal("expected mutual-exclusion error") } } @@ -39,7 +40,7 @@ func TestConfigureAdminServiceTokenFile(t *testing.T) { if err := os.WriteFile(tokPath, []byte("hunter2\n"), 0o600); err != nil { t.Fatal(err) } - srv, opts, err := configureAdminService(tokPath, false, adapter.NodeIdentity{NodeID: "n1"}) + srv, opts, err := configureAdminService(tokPath, false, adapter.NodeIdentity{NodeID: "n1"}, nil) if err != nil { t.Fatalf("configureAdminService: %v", err) } @@ -54,7 +55,7 @@ func 
TestConfigureAdminServiceTokenFile(t *testing.T) { func TestConfigureAdminServiceInsecureNoAuth(t *testing.T) { t.Parallel() - srv, opts, err := configureAdminService("", true, adapter.NodeIdentity{NodeID: "n1"}) + srv, opts, err := configureAdminService("", true, adapter.NodeIdentity{NodeID: "n1"}, nil) if err != nil { t.Fatalf("insecure mode should succeed: %v", err) } @@ -66,6 +67,36 @@ func TestConfigureAdminServiceInsecureNoAuth(t *testing.T) { } } +func TestAdminMembersFromBootstrapExcludesSelf(t *testing.T) { + t.Parallel() + servers := []raft.Server{ + {ID: "n1", Address: "10.0.0.11:50051"}, + {ID: "n2", Address: "10.0.0.12:50051"}, + {ID: "n3", Address: "10.0.0.13:50051"}, + } + got := adminMembersFromBootstrap("n1", servers) + if len(got) != 2 { + t.Fatalf("len = %d, want 2 (self excluded)", len(got)) + } + want := map[string]string{"n2": "10.0.0.12:50051", "n3": "10.0.0.13:50051"} + for _, m := range got { + if want[m.NodeID] != m.GRPCAddress { + t.Fatalf("member %+v not in expected set %v", m, want) + } + } +} + +func TestAdminMembersFromBootstrapEmpty(t *testing.T) { + t.Parallel() + if got := adminMembersFromBootstrap("n1", nil); got != nil { + t.Fatalf("empty bootstrap should produce nil, got %v", got) + } + single := []raft.Server{{ID: "n1", Address: "a:1"}} + if got := adminMembersFromBootstrap("n1", single); len(got) != 0 { + t.Fatalf("single-node bootstrap should yield no members, got %v", got) + } +} + func TestLoadAdminTokenFileRejectsOversize(t *testing.T) { t.Parallel() dir := t.TempDir() From ac8d6c3f158952e906e9a880903129fe4d369c55 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 13:18:26 +0900 Subject: [PATCH 19/30] fix(admin): evict seed entries when cache has no non-seed victim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P2 on commit 4a3a49f7: when len(f.clients) >= maxCachedClients and every cached entry is a seed (operator passes >= maxCachedClients 
addresses via --nodes), the previous eviction loop skipped every entry, the loop became a no-op, and clientFor still inserted the new client — so the cap was vacuous on that path and the cache could leak file descriptors. - Extract eviction into evictOneLocked: first pass prefers a non-seed victim, second pass falls back to any entry (including seeds) so the cap is always enforced. - Add regression test that saturates the cache with maxCachedClients+3 seeds and asserts the cache never exceeds the cap. --- cmd/elastickv-admin/main.go | 57 ++++++++++++++++++++++---------- cmd/elastickv-admin/main_test.go | 26 +++++++++++++++ 2 files changed, 65 insertions(+), 18 deletions(-) diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index f7925e0d..d543cb8e 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -330,25 +330,14 @@ func (f *fanout) clientFor(addr string) (*nodeClient, error) { return c, nil } // Bound the cache so a high-churn cluster or a stream of hostile - // discovery responses cannot leak file descriptors. Evict any one entry - // (map iteration is randomized) to make room; the evicted target will be - // re-dialed on demand if it comes back. Never evict an address that is in - // the active seeds list. + // discovery responses cannot leak file descriptors. Prefer evicting a + // non-seed entry — those are easiest to re-dial on demand — but when the + // cache is already saturated with seeds (operator passed >=maxCachedClients + // addresses to --nodes), fall back to evicting any entry. Without that + // fallback the cap is vacuous in the seed-heavy configuration Codex + // flagged, so the cache could grow without bound. 
if len(f.clients) >= maxCachedClients { - seeds := map[string]struct{}{} - for _, s := range f.seeds { - seeds[s] = struct{}{} - } - for victim, vc := range f.clients { - if _, keep := seeds[victim]; keep { - continue - } - delete(f.clients, victim) - if err := vc.conn.Close(); err != nil { - log.Printf("elastickv-admin: evict %s: close: %v", victim, err) - } - break - } + f.evictOneLocked() } conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(f.creds)) if err != nil { @@ -359,6 +348,38 @@ func (f *fanout) clientFor(addr string) (*nodeClient, error) { return c, nil } +// evictOneLocked removes exactly one entry from f.clients and closes its +// connection. Prefers non-seed entries; falls back to any entry if none exist +// (for example when len(seeds) >= maxCachedClients). Caller must hold f.mu. +func (f *fanout) evictOneLocked() { + seeds := make(map[string]struct{}, len(f.seeds)) + for _, s := range f.seeds { + seeds[s] = struct{}{} + } + var fallback string + var fallbackClient *nodeClient + for victim, vc := range f.clients { + if fallback == "" { + fallback, fallbackClient = victim, vc + } + if _, keep := seeds[victim]; keep { + continue + } + delete(f.clients, victim) + if err := vc.conn.Close(); err != nil { + log.Printf("elastickv-admin: evict %s: close: %v", victim, err) + } + return + } + if fallbackClient == nil { + return + } + delete(f.clients, fallback) + if err := fallbackClient.conn.Close(); err != nil { + log.Printf("elastickv-admin: evict %s (seed-fallback): close: %v", fallback, err) + } +} + // invalidateClient drops a cached connection — used when a peer returns // Unavailable so the next request re-dials or skips the removed node. 
func (f *fanout) invalidateClient(addr string) { diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index 3d9c2112..7c69dac2 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -211,6 +211,32 @@ func strconvItoa(i int) string { return string(digits) } +// TestFanoutClientCacheEvictsEvenWhenAllEntriesAreSeeds asserts that when +// operators configure more seeds than maxCachedClients the cache still honors +// its cap — without the seed-fallback, the eviction loop would skip every +// entry and the cache would grow past the documented bound. +func TestFanoutClientCacheEvictsEvenWhenAllEntriesAreSeeds(t *testing.T) { + t.Parallel() + seeds := make([]string, 0, maxCachedClients+3) + for i := 0; i < maxCachedClients+3; i++ { + seeds = append(seeds, "seed-"+strconvItoa(i)+":1") + } + f := newFanout(seeds, "", time.Second, insecure.NewCredentials()) + defer f.Close() + + for _, s := range seeds { + if _, err := f.clientFor(s); err != nil { + t.Fatalf("clientFor(%s): %v", s, err) + } + } + f.mu.Lock() + size := len(f.clients) + f.mu.Unlock() + if size > maxCachedClients { + t.Fatalf("cache size = %d, exceeds cap %d (seed-only path)", size, maxCachedClients) + } +} + func TestFanoutClientCacheEvictsWhenFull(t *testing.T) { t.Parallel() f := newFanout([]string{"seed:1"}, "", time.Second, insecure.NewCredentials()) From f2935fc25013537ee6e3d44251bec153ec2489e3 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Thu, 23 Apr 2026 13:47:23 +0900 Subject: [PATCH 20/30] fix(admin),docs: round-11 review + main merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merged main (PR #590 dropped hashicorp/raft); migrated admin wiring to raftengine.Server. 
Addressed Codex P2 and CodeRabbit findings on commit ac8d6c3f: - Refcount cached gRPC clients (cmd/elastickv-admin): clientFor now returns (client, release, err); evictOneLocked and invalidateClient retire entries but keep the *grpc.ClientConn alive until the last borrower releases. Prevents spurious cancellation of healthy in-flight GetClusterOverview calls when the cache hits maxCachedClients under fan-out load. - New --allowRemoteBind flag; by default --bindAddr is rejected unless the host is loopback. The admin HTTP layer has no browser-facing auth while holding the node admin token, so remote bind is an explicit opt-in for operators behind an auth proxy. - AdminServer.GetRaftGroups now populates LastContactUnixMs from raftengine.Status.LastContact (absolute ms = nowFunc() - LastContact). Now test-seamed via nowFunc so tests are deterministic. - Design doc §5.1 flush path rewritten to use atomic.SwapUint64 instead of pointer-swap, so no late Observe can race past the flush snapshot — no retired counters to harvest. - Design doc §5.2 calibration: replace "benchmark at startup" with a checked-in costPerObserveNs constant with CI drift check. Regression tests added under -race for: eviction-while-in-flight, post-close invalidateClient no-op, --bindAddr loopback validation, GetRaftGroups LastContactUnixMs. 
--- adapter/admin_grpc.go | 23 +++- adapter/admin_grpc_test.go | 31 ++++- cmd/elastickv-admin/main.go | 184 ++++++++++++++++++------- cmd/elastickv-admin/main_test.go | 75 +++++++++- docs/admin_ui_key_visualizer_design.md | 4 +- main.go | 11 +- main_admin_test.go | 6 +- 7 files changed, 268 insertions(+), 66 deletions(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index d8feaa55..d86abf4d 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -6,6 +6,7 @@ import ( "sort" "strings" "sync" + "time" "github.com/bootjp/elastickv/internal/raftengine" pb "github.com/bootjp/elastickv/proto" @@ -16,6 +17,10 @@ import ( "google.golang.org/grpc/status" ) +// nowFunc is a test seam for injecting a fixed clock into GetRaftGroups so +// its timestamp output is deterministic. +var nowFunc = time.Now + // AdminGroup exposes per-Raft-group state to the Admin service. It is a narrow // subset of raftengine.Engine so tests can supply an in-memory fake without // standing up a real Raft cluster. @@ -103,14 +108,22 @@ func (s *AdminServer) GetRaftGroups( defer s.groupsMu.RUnlock() ids := sortedGroupIDs(s.groups) out := make([]*pb.RaftGroupState, 0, len(ids)) + now := nowFunc() for _, id := range ids { st := s.groups[id].Status() + // Translate LastContact (duration since the last contact with the + // leader, per raftengine.Status) into an absolute unix-ms so UI + // clients can diff against their own clock instead of having to + // reason about the server's uptime. Zero LastContact (leader on + // self, or no contact recorded yet) reports the current time + // rather than an arbitrary epoch zero. 
out = append(out, &pb.RaftGroupState{ - RaftGroupId: id, - LeaderNodeId: st.Leader.ID, - LeaderTerm: st.Term, - CommitIndex: st.CommitIndex, - AppliedIndex: st.AppliedIndex, + RaftGroupId: id, + LeaderNodeId: st.Leader.ID, + LeaderTerm: st.Term, + CommitIndex: st.CommitIndex, + AppliedIndex: st.AppliedIndex, + LastContactUnixMs: now.Add(-st.LastContact).UnixMilli(), }) } return &pb.GetRaftGroupsResponse{Groups: out}, nil diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index 38326544..ff3a7ba4 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -3,6 +3,7 @@ package adapter import ( "context" "testing" + "time" "github.com/bootjp/elastickv/internal/raftengine" pb "github.com/bootjp/elastickv/proto" @@ -55,7 +56,13 @@ func TestGetClusterOverviewReturnsSelfAndLeaders(t *testing.T) { func TestGetRaftGroupsExposesCommitApplied(t *testing.T) { t.Parallel() srv := NewAdminServer(NodeIdentity{NodeID: "n1"}, nil) - srv.RegisterGroup(1, fakeGroup{leaderID: "n1", term: 2, commit: 99, applied: 97}) + srv.RegisterGroup(1, fakeGroupWithContact{leaderID: "n1", term: 2, commit: 99, applied: 97, lastContact: 5 * time.Second}) + + // Freeze nowFunc so the computed last-contact timestamp is deterministic. 
+ origNow := nowFunc + fixed := time.Unix(1_000_000, 0) + nowFunc = func() time.Time { return fixed } + t.Cleanup(func() { nowFunc = origNow }) resp, err := srv.GetRaftGroups(context.Background(), &pb.GetRaftGroupsRequest{}) if err != nil { @@ -68,6 +75,28 @@ func TestGetRaftGroupsExposesCommitApplied(t *testing.T) { if g.CommitIndex != 99 || g.AppliedIndex != 97 || g.LeaderTerm != 2 { t.Fatalf("unexpected state %+v", g) } + wantLastContact := fixed.Add(-5 * time.Second).UnixMilli() + if g.LastContactUnixMs != wantLastContact { + t.Fatalf("LastContactUnixMs = %d, want %d", g.LastContactUnixMs, wantLastContact) + } +} + +type fakeGroupWithContact struct { + leaderID string + term uint64 + commit uint64 + applied uint64 + lastContact time.Duration +} + +func (f fakeGroupWithContact) Status() raftengine.Status { + return raftengine.Status{ + Leader: raftengine.LeaderInfo{ID: f.leaderID}, + Term: f.term, + CommitIndex: f.commit, + AppliedIndex: f.applied, + LastContact: f.lastContact, + } } // TestGroupOrderingIsStable locks in deterministic ascending-by-RaftGroupId diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index d543cb8e..2e0501a2 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -12,6 +12,7 @@ import ( "flag" "fmt" "log" + "net" "net/http" "os" "os/signal" @@ -83,6 +84,7 @@ var ( nodeTLSCACertFile = flag.String("nodeTLSCACertFile", "", "PEM file with CA certificates used to verify nodes' gRPC TLS; setting this flag enables TLS dialing") nodeTLSServerName = flag.String("nodeTLSServerName", "", "Expected TLS server name when connecting to nodes (overrides the address host); only honoured when TLS is enabled") nodeTLSSkipVerify = flag.Bool("nodeTLSInsecureSkipVerify", false, "Dial nodes with TLS but skip certificate verification; development only. Implies TLS.") + allowRemoteBind = flag.Bool("allowRemoteBind", false, "Allow --bindAddr to listen on a non-loopback interface. 
The admin UI has no browser-facing auth; set this only when the UI is fronted by an authenticating reverse proxy.") ) func main() { @@ -92,25 +94,36 @@ func main() { } } -func run() error { +type runConfig struct { + seeds []string + fan *fanout +} + +// initRun consolidates flag parsing and fanout construction so run() stays +// under the project's cyclop budget. +func initRun() (runConfig, error) { seeds := splitNodes(*nodes) if len(seeds) == 0 { - return errors.New("--nodes is required (comma-separated gRPC addresses)") + return runConfig{}, errors.New("--nodes is required (comma-separated gRPC addresses)") } - token, err := loadToken(*nodeTokenFile, *insecureNoAuth) if err != nil { - return err + return runConfig{}, err + } + if err := validateBindAddr(*bindAddr, *allowRemoteBind); err != nil { + return runConfig{}, err } - creds, err := loadTransportCredentials(*nodeTLSCACertFile, *nodeTLSServerName, *nodeTLSSkipVerify) if err != nil { - return err + return runConfig{}, err } - fan := newFanout(seeds, token, *nodesRefreshInterval, creds) - defer fan.Close() + return runConfig{seeds: seeds, fan: fan}, nil +} +// buildMux wires the Phase 0 HTTP surface. Lives outside run() both for +// testability and to keep run() under the cyclop budget. 
+func buildMux(fan *fanout) *http.ServeMux { mux := http.NewServeMux() mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) @@ -127,10 +140,19 @@ func run() error { w.Header().Set("Content-Type", "text/plain; charset=utf-8") _, _ = w.Write([]byte("elastickv-admin: phase 0 — SPA not yet embedded\n")) }) + return mux +} + +func run() error { + cfg, err := initRun() + if err != nil { + return err + } + defer cfg.fan.Close() srv := &http.Server{ Addr: *bindAddr, - Handler: mux, + Handler: buildMux(cfg.fan), ReadHeaderTimeout: readHeaderTimeout, ReadTimeout: readTimeout, WriteTimeout: writeTimeout, @@ -142,7 +164,7 @@ func run() error { errCh := make(chan error, 1) go func() { - log.Printf("elastickv-admin listening on %s (seeds=%v)", *bindAddr, seeds) + log.Printf("elastickv-admin listening on %s (seeds=%v)", *bindAddr, cfg.seeds) if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { errCh <- err return @@ -163,6 +185,33 @@ func run() error { } } +// validateBindAddr rejects a non-loopback bind unless the operator has +// explicitly opted into --allowRemoteBind. The admin binary performs no +// browser-side authentication in Phase 0 while holding a privileged node +// admin token, so a misconfigured 0.0.0.0:8080 would expose that token-gated +// cluster view to anyone on the network. 
+func validateBindAddr(addr string, allow bool) error { + if allow { + return nil + } + host, _, err := net.SplitHostPort(addr) + if err != nil { + return errors.Wrapf(err, "invalid --bindAddr %q", addr) + } + host = strings.TrimSpace(host) + if host == "" { + return fmt.Errorf("--bindAddr %q has an empty host; pass an explicit loopback host like 127.0.0.1 or set --allowRemoteBind when fronted by an auth proxy", addr) + } + ip := net.ParseIP(host) + switch { + case host == "localhost": + return nil + case ip != nil && ip.IsLoopback(): + return nil + } + return fmt.Errorf("--bindAddr %q is not loopback; set --allowRemoteBind to expose the admin UI remotely (the UI has no browser-side auth — do so only behind an auth proxy)", addr) +} + func splitNodes(raw string) []string { parts := strings.Split(raw, ",") out := make([]string, 0, len(parts)) @@ -252,6 +301,15 @@ type nodeClient struct { addr string conn *grpc.ClientConn client pb.AdminClient + + // refcount and evicted are protected by fanout.mu. They let the cache + // evict entries while RPCs are in flight: eviction removes the entry + // from the map and marks it evicted, and the conn is closed only once + // the last borrower calls release. Without this the previous design + // could cancel healthy in-flight GetClusterOverview calls whenever the + // cache was saturated. + refcount int + evicted bool } type membership struct { @@ -309,6 +367,9 @@ func (f *fanout) Close() { return } f.closed = true + // Shutdown is an intentional cancellation of any in-flight RPCs; close + // connections eagerly and let borrowers see the cancel. Borrowers that + // still hold leases will observe the conn as closed on their next call. 
for _, c := range f.clients { if err := c.conn.Close(); err != nil { log.Printf("elastickv-admin: close gRPC connection to %s: %v", c.addr, err) @@ -320,37 +381,58 @@ func (f *fanout) Close() { f.clients = map[string]*nodeClient{} } -func (f *fanout) clientFor(addr string) (*nodeClient, error) { +// clientFor returns a leased nodeClient that callers must release once they +// finish the RPC (release is the second return value, always non-nil and safe +// to call). The cache is bounded by maxCachedClients; if the cache is full, +// one entry is evicted — prefer non-seed victims, fall back to any entry when +// the cache is saturated with seeds. Evicted entries stop accepting new leases +// but their underlying *grpc.ClientConn is kept alive until every outstanding +// borrower has released; this prevents an eviction from canceling a healthy +// concurrent GetClusterOverview. +func (f *fanout) clientFor(addr string) (*nodeClient, func(), error) { f.mu.Lock() defer f.mu.Unlock() if f.closed { - return nil, errFanoutClosed + return nil, func() {}, errFanoutClosed } if c, ok := f.clients[addr]; ok { - return c, nil - } - // Bound the cache so a high-churn cluster or a stream of hostile - // discovery responses cannot leak file descriptors. Prefer evicting a - // non-seed entry — those are easiest to re-dial on demand — but when the - // cache is already saturated with seeds (operator passed >=maxCachedClients - // addresses to --nodes), fall back to evicting any entry. Without that - // fallback the cap is vacuous in the seed-heavy configuration Codex - // flagged, so the cache could grow without bound. 
+ c.refcount++ + return c, f.releaseFunc(c), nil + } if len(f.clients) >= maxCachedClients { f.evictOneLocked() } conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(f.creds)) if err != nil { - return nil, errors.Wrapf(err, "dial %s", addr) + return nil, func() {}, errors.Wrapf(err, "dial %s", addr) } - c := &nodeClient{addr: addr, conn: conn, client: pb.NewAdminClient(conn)} + c := &nodeClient{addr: addr, conn: conn, client: pb.NewAdminClient(conn), refcount: 1} f.clients[addr] = c - return c, nil + return c, f.releaseFunc(c), nil } -// evictOneLocked removes exactly one entry from f.clients and closes its -// connection. Prefers non-seed entries; falls back to any entry if none exist -// (for example when len(seeds) >= maxCachedClients). Caller must hold f.mu. +// releaseFunc returns the closer used to drop a lease. On the last release +// of an evicted client the underlying connection is finally closed. +func (f *fanout) releaseFunc(c *nodeClient) func() { + return func() { + f.mu.Lock() + defer f.mu.Unlock() + if c.refcount > 0 { + c.refcount-- + } + if c.refcount == 0 && c.evicted { + if err := c.conn.Close(); err != nil { + log.Printf("elastickv-admin: deferred close for %s: %v", c.addr, err) + } + } + } +} + +// evictOneLocked removes exactly one entry from f.clients. Prefers non-seed +// entries; falls back to any entry if none are eligible (for example when +// len(seeds) >= maxCachedClients). The underlying connection is closed only +// if no borrowers still hold a lease; otherwise closing is deferred to the +// last release (see releaseFunc). Caller must hold f.mu. 
func (f *fanout) evictOneLocked() { seeds := make(map[string]struct{}, len(f.seeds)) for _, s := range f.seeds { @@ -365,37 +447,45 @@ func (f *fanout) evictOneLocked() { if _, keep := seeds[victim]; keep { continue } - delete(f.clients, victim) - if err := vc.conn.Close(); err != nil { - log.Printf("elastickv-admin: evict %s: close: %v", victim, err) - } + f.retireLocked(victim, vc) return } - if fallbackClient == nil { + if fallbackClient != nil { + f.retireLocked(fallback, fallbackClient) + } +} + +// retireLocked removes a client from the cache and, if no lease is currently +// held, closes its connection. Otherwise the connection stays open until the +// last borrower releases, so an evicted entry never cancels an in-flight +// RPC. Caller must hold f.mu. +func (f *fanout) retireLocked(addr string, c *nodeClient) { + delete(f.clients, addr) + if c.evicted { return } - delete(f.clients, fallback) - if err := fallbackClient.conn.Close(); err != nil { - log.Printf("elastickv-admin: evict %s (seed-fallback): close: %v", fallback, err) + c.evicted = true + if c.refcount > 0 { + return + } + if err := c.conn.Close(); err != nil { + log.Printf("elastickv-admin: retire %s: close: %v", addr, err) } } // invalidateClient drops a cached connection — used when a peer returns -// Unavailable so the next request re-dials or skips the removed node. +// Unavailable so the next request re-dials or skips the removed node. The +// connection stays open until the last borrower releases, so invalidating +// does not cancel other goroutines' in-flight RPCs. 
func (f *fanout) invalidateClient(addr string) { f.mu.Lock() + defer f.mu.Unlock() if f.closed { - f.mu.Unlock() return } - c, ok := f.clients[addr] - delete(f.clients, addr) f.members = nil - f.mu.Unlock() - if ok { - if err := c.conn.Close(); err != nil { - log.Printf("elastickv-admin: close gRPC connection to %s: %v", addr, err) - } + if c, ok := f.clients[addr]; ok { + f.retireLocked(addr, c) } } @@ -453,7 +543,7 @@ func (f *fanout) currentTargets(ctx context.Context) []string { // discoveryRPCTimeout so a slow first seed does not stall the whole request. func (f *fanout) refreshMembership(ctx context.Context) []string { for _, seed := range f.seeds { - cli, err := f.clientFor(seed) + cli, release, err := f.clientFor(seed) if err != nil { log.Printf("elastickv-admin: dial seed %s: %v", seed, err) continue @@ -461,6 +551,7 @@ func (f *fanout) refreshMembership(ctx context.Context) []string { rpcCtx, cancel := context.WithTimeout(ctx, discoveryRPCTimeout) resp, err := cli.client.GetClusterOverview(f.outgoingCtx(rpcCtx), &pb.GetClusterOverviewRequest{}) cancel() + release() if err != nil { if status.Code(err) == codes.Unavailable { f.invalidateClient(seed) @@ -562,12 +653,13 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { go func(i int, addr string) { defer wg.Done() entry := perNodeResult{Node: addr} - cli, err := f.clientFor(addr) + cli, release, err := f.clientFor(addr) if err != nil { entry.Error = err.Error() results[i] = entry return } + defer release() resp, err := cli.client.GetClusterOverview(f.outgoingCtx(ctx), &pb.GetClusterOverviewRequest{}) if err != nil { if status.Code(err) == codes.Unavailable { diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index 7c69dac2..d30d227c 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -28,6 +28,38 @@ import ( "google.golang.org/grpc/status" ) +func TestValidateBindAddr(t *testing.T) { + t.Parallel() + cases := []struct 
{ + name string + addr string + allow bool + wantErr bool + }{ + {"loopback ipv4", "127.0.0.1:8080", false, false}, + {"loopback ipv6", "[::1]:8080", false, false}, + {"localhost", "localhost:8080", false, false}, + {"remote bind default rejected", "0.0.0.0:8080", false, true}, + {"specific ip default rejected", "10.0.0.5:8080", false, true}, + {"empty host rejected", ":8080", false, true}, + {"allow opt-in permits remote", "0.0.0.0:8080", true, false}, + {"malformed addr", "not-an-addr", false, true}, + } + for _, tc := range cases { + + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + err := validateBindAddr(tc.addr, tc.allow) + if tc.wantErr && err == nil { + t.Fatalf("want error, got nil") + } + if !tc.wantErr && err != nil { + t.Fatalf("unexpected error: %v", err) + } + }) + } +} + func TestSplitNodesTrimsAndDrops(t *testing.T) { t.Parallel() got := splitNodes(" host-a:50051 ,,host-b:50051 ,") @@ -225,8 +257,10 @@ func TestFanoutClientCacheEvictsEvenWhenAllEntriesAreSeeds(t *testing.T) { defer f.Close() for _, s := range seeds { - if _, err := f.clientFor(s); err != nil { + if _, release, err := f.clientFor(s); err != nil { t.Fatalf("clientFor(%s): %v", s, err) + } else { + release() } } f.mu.Lock() @@ -245,9 +279,11 @@ func TestFanoutClientCacheEvictsWhenFull(t *testing.T) { // Fill the cache past the cap. New dials should not error out and the // map must stay bounded. 
for i := 0; i < maxCachedClients+5; i++ { - if _, err := f.clientFor("node-" + strconvItoa(i) + ":1"); err != nil { + _, release, err := f.clientFor("node-" + strconvItoa(i) + ":1") + if err != nil { t.Fatalf("clientFor[%d]: %v", i, err) } + release() } f.mu.Lock() size := len(f.clients) @@ -471,6 +507,39 @@ func TestWriteJSONSuccessPath(t *testing.T) { } } +// TestFanoutEvictionDoesNotCloseInFlightConn asserts that evicting a cached +// entry while a borrower still holds the lease does NOT close the underlying +// gRPC connection — the close is deferred to the last release(), so in-flight +// RPCs on the evicted client complete successfully. +func TestFanoutEvictionDoesNotCloseInFlightConn(t *testing.T) { + t.Parallel() + + peer := &fakeAdminServer{members: []string{"m:1"}} + addr := startFakeAdmin(t, peer) + + f := newFanout([]string{addr}, "", time.Second, insecure.NewCredentials()) + defer f.Close() + + // Borrower 1 leases the client. + cli, release, err := f.clientFor(addr) + if err != nil { + t.Fatal(err) + } + + // Force eviction while the lease is held. invalidateClient marks + // the entry retired+refcount>0, so the conn must stay open. + f.invalidateClient(addr) + + // The lease should still be usable — conn.Close() has been deferred. + if _, callErr := cli.client.GetClusterOverview( + context.Background(), &pb.GetClusterOverviewRequest{}, + ); callErr != nil { + t.Fatalf("in-flight RPC on retired client failed (eviction raced): %v", callErr) + } + release() // last release closes the conn; verify no panic / double-close. + release() // extra release must be a no-op (refcount already zero). +} + // TestFanoutClientForAfterCloseIsSafe asserts that clientFor and // invalidateClient do not panic when invoked concurrently with Close — a // shutdown-time race that otherwise hits a nil-map write in clientFor. 
@@ -479,7 +548,7 @@ func TestFanoutClientForAfterCloseIsSafe(t *testing.T) { f := newFanout([]string{"127.0.0.1:1"}, "", time.Second, insecure.NewCredentials()) f.Close() - if _, err := f.clientFor("127.0.0.1:2"); err == nil { + if _, _, err := f.clientFor("127.0.0.1:2"); err == nil { t.Fatal("expected error after Close, got nil") } f.invalidateClient("127.0.0.1:2") // must be a no-op, not panic diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index a5d37bc4..83f75b6d 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -147,7 +147,7 @@ Writes are sampled exactly once by the current Raft leader before proposal. Read The hot path uses lock-free reads for route lookup and counter increments. The data structures used are: - **Current-window counters**: `routes` is an immutable `routeTable` published through `atomic.Pointer[routeTable]`. `routeTable` owns `map[RouteID]*routeSlot`; each `routeSlot` owns an `atomic.Pointer[routeCounters]`. `Observe` loads the current table, performs a plain map lookup against that immutable snapshot, loads the slot's counter pointer, and uses `atomic.AddUint64` on the counter fields. Adding a new `RouteID` or replacing split/merge mappings performs a copy-on-write table update under a non-hot-path `routesMu`, then publishes the new table with one atomic store. No `Observe` call ever runs against a Go map that can be mutated concurrently. -- **Flush**: instead of holding a long write lock, the flush goroutine **atomically swaps** the `*routeCounters` pointer for each key using `atomic.Pointer[routeCounters]`, then reads the old pointer's frozen counters to build the new matrix column. `Observe` that loaded the old pointer before the swap completes its increments against the (now-retired) old counters, which the next flush will harvest. No counts are lost; at most one step-boundary's worth of counts land in the next column instead of the current one. 
+- **Flush**: the flush goroutine drains each counter in place with `atomic.SwapUint64(&counter, 0)`. The value returned by the swap is the exact count accumulated since the previous flush; subsequent `Observe` calls see the zeroed counter and add to it without contention. There is no "old pointer" for late writers to hit — the fast path only ever touches the current counter cell, so no increment can race past the flush snapshot. Split/merge reshapes (§5.4) still go through the copy-on-write `routeTable`, but the counters themselves stay in place and are harvested by `SwapUint64`. No counts are lost and no late-writer cleanup is required. - **Split/merge** (§5.4): the route-watch callback creates the new child slots and publishes a new immutable `routeTable` *before* the `distribution.Engine` exposes the new `RouteID` to the coordinator, so by the time `Observe` sees the new `RouteID` the counter already exists and the callback does not race with the hot path. ### 5.2 Adaptive sub-sampling and the accuracy SLO @@ -160,7 +160,7 @@ The capture rate itself is not the SLO — at `sampleRate = 8` the raw capture r For Poisson-ish traffic, the relative error of the Horvitz–Thompson estimator is approximately `1 / sqrt(acceptedSamples)` for 1-in-N sub-sampling where N > 1. Setting this ≤0.05 at 95% CI gives a required `acceptedSamples ≥ (1.96 / 0.05)² ≈ 1537`, independent of the current 1-in-N rate. Buckets sampled at `sampleRate = 1` are exact and do not need the bound. The adaptive controller enforces this by never raising `sampleRate` past the point where the most recent window's `acceptedSamples` falls below that bound; if a burst violates the bound the affected buckets are flagged in the response and the UI renders them hatched so the operator knows the estimate is soft. -`sampleRate` only rises at all when the previous flush window's estimated `Observe` cost crosses a measured threshold. 
To avoid adding profiling overhead to the hot path, the cost is estimated with a **synthetic model** (no runtime profiler involved): at startup `BenchmarkCoordinatorDispatch` with the sampler enabled records `costPerObserveNs` once, and each flush window computes `estimatedObserveCPU = Σ_routes(observeCount × costPerObserveNs)` directly from the counters already being harvested. This is exact up to the benchmarked cost constant and zero-overhead at runtime. In steady state with moderate per-route QPS, `sampleRate` stays at 1 and every op is counted. +`sampleRate` only rises at all when the previous flush window's estimated `Observe` cost crosses a measured threshold. To avoid profiling overhead on the hot path, the cost is estimated with a **synthetic model** whose per-call constant is a **checked-in number** (`costPerObserveNs`) — not something measured at startup. The value is produced by the CI benchmark `BenchmarkCoordinatorDispatch` and committed into `keyviz/cost.go`; a CI check fails if the observed cost drifts beyond ±20% so the constant stays honest. At runtime each flush window computes `estimatedObserveCPU = Σ_routes(observeCount × costPerObserveNs)` directly from the counters already being harvested — no benchmark runs at process start, and no runtime profiler is ever enabled. In steady state with moderate per-route QPS, `sampleRate` stays at 1 and every op is counted. Benchmark gate in CI: `BenchmarkCoordinatorDispatch` with sampler off vs on; the delta must stay within run-to-run variance. Separately, a correctness test drives a known synthetic workload through a sub-sampling sampler and asserts the ±5% / 95%-CI bound holds across 1000 trials. 
diff --git a/main.go b/main.go index 2cfb918f..e234d511 100644 --- a/main.go +++ b/main.go @@ -499,7 +499,7 @@ func dispatchMonitorSources(runtimes []*raftGroupRuntime) []monitoring.DispatchS func setupAdminService( nodeID, grpcAddress string, runtimes []*raftGroupRuntime, - bootstrapServers []raft.Server, + bootstrapServers []raftengine.Server, ) (*adapter.AdminServer, []grpc.ServerOption, error) { members := adminMembersFromBootstrap(nodeID, bootstrapServers) srv, opts, err := configureAdminService( @@ -527,19 +527,18 @@ func setupAdminService( // the Raft bootstrap configuration so GetClusterOverview returns a populated // members list. Without this the admin binary's membersFrom cache collapses to // only the responding seed and stops fanning out across the cluster. -func adminMembersFromBootstrap(selfID string, servers []raft.Server) []adapter.NodeIdentity { +func adminMembersFromBootstrap(selfID string, servers []raftengine.Server) []adapter.NodeIdentity { if len(servers) == 0 { return nil } out := make([]adapter.NodeIdentity, 0, len(servers)) for _, s := range servers { - id := string(s.ID) - if id == selfID { + if s.ID == selfID { continue } out = append(out, adapter.NodeIdentity{ - NodeID: id, - GRPCAddress: string(s.Address), + NodeID: s.ID, + GRPCAddress: s.Address, }) } return out diff --git a/main_admin_test.go b/main_admin_test.go index bb9a3ee3..5aa5cdb4 100644 --- a/main_admin_test.go +++ b/main_admin_test.go @@ -7,7 +7,7 @@ import ( "testing" "github.com/bootjp/elastickv/adapter" - "github.com/hashicorp/raft" + "github.com/bootjp/elastickv/internal/raftengine" ) func TestConfigureAdminServiceDisabledByDefault(t *testing.T) { @@ -69,7 +69,7 @@ func TestConfigureAdminServiceInsecureNoAuth(t *testing.T) { func TestAdminMembersFromBootstrapExcludesSelf(t *testing.T) { t.Parallel() - servers := []raft.Server{ + servers := []raftengine.Server{ {ID: "n1", Address: "10.0.0.11:50051"}, {ID: "n2", Address: "10.0.0.12:50051"}, {ID: "n3", Address: 
"10.0.0.13:50051"}, @@ -91,7 +91,7 @@ func TestAdminMembersFromBootstrapEmpty(t *testing.T) { if got := adminMembersFromBootstrap("n1", nil); got != nil { t.Fatalf("empty bootstrap should produce nil, got %v", got) } - single := []raft.Server{{ID: "n1", Address: "a:1"}} + single := []raftengine.Server{{ID: "n1", Address: "a:1"}} if got := adminMembersFromBootstrap("n1", single); len(got) != 0 { t.Fatalf("single-node bootstrap should yield no members, got %v", got) } From 09a38edfc05deb1cf14677b52c1dc6a16dacdf60 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 24 Apr 2026 01:22:16 +0900 Subject: [PATCH 21/30] fix(admin),docs: address CodeRabbit round-2 on a126e71a MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit review 4164075693: - adapter/admin_grpc.go: move the test-seam clock from a package-global nowFunc to a per-AdminServer field with SetClock(); parallel tests on other AdminServer instances can no longer race through the seam. Updated the LastContact test to use SetClock. - cmd/elastickv-admin/main.go + main.go: replace the Stat/ReadFile TOCTOU token-load with io.LimitReader(f, max+1); a file that grows or is swapped between the stat() and read() now still cannot force an oversized allocation. Tests updated for the new error wording. - cmd/elastickv-admin/main.go: make the deferred close of retired gRPC clients idempotent by tracking nodeClient.closed under fanout.mu. A second release() on an already-closed client is now a true no-op; Close() also marks clients closed to avoid double-close with late releases. - docs §5.1/§5.3/§5.4/§10/§11: align all remaining references with the atomic.SwapUint64 flush protocol (drop "counter pointer" / "pointer swap" wording). No retired pointers, no late-writer harvest needed. - docs §9.1: replace the non-existent `--no-fanout` flag example with a note that passing one address to --nodes yields the partial single-node view; a dedicated flag is deferred. 
--- adapter/admin_grpc.go | 24 ++++++++++-- adapter/admin_grpc_test.go | 8 ++-- cmd/elastickv-admin/main.go | 52 +++++++++++++++++--------- cmd/elastickv-admin/main_test.go | 2 +- docs/admin_ui_key_visualizer_design.md | 14 +++---- main.go | 20 ++++++---- main_admin_test.go | 2 +- 7 files changed, 79 insertions(+), 43 deletions(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index d86abf4d..c864a14b 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -17,9 +17,6 @@ import ( "google.golang.org/grpc/status" ) -// nowFunc is a test seam for injecting a fixed clock into GetRaftGroups so -// its timestamp output is deterministic. -var nowFunc = time.Now // AdminGroup exposes per-Raft-group state to the Admin service. It is a narrow // subset of raftengine.Engine so tests can supply an in-memory fake without @@ -51,6 +48,12 @@ type AdminServer struct { groupsMu sync.RWMutex groups map[uint64]AdminGroup + // now is the clock used for LastContactUnixMs and any other + // timestamping this service needs. It's a per-server field (not a + // package global) so `-race` tests that swap the clock on one server + // instance cannot contend with concurrent RPCs on another instance. + now func() time.Time + pb.UnimplementedAdminServer } @@ -65,7 +68,20 @@ func NewAdminServer(self NodeIdentity, members []NodeIdentity) *AdminServer { self: self, members: cloned, groups: make(map[uint64]AdminGroup), + now: time.Now, + } +} + +// SetClock overrides the clock used by GetRaftGroups, letting tests inject a +// fixed time without mutating any package-global state. Concurrent RPCs on +// other AdminServer instances are unaffected. 
+func (s *AdminServer) SetClock(now func() time.Time) { + if now == nil { + now = time.Now } + s.groupsMu.Lock() + s.now = now + s.groupsMu.Unlock() } // RegisterGroup binds a Raft group ID to its engine so the Admin service can @@ -108,7 +124,7 @@ func (s *AdminServer) GetRaftGroups( defer s.groupsMu.RUnlock() ids := sortedGroupIDs(s.groups) out := make([]*pb.RaftGroupState, 0, len(ids)) - now := nowFunc() + now := s.now() for _, id := range ids { st := s.groups[id].Status() // Translate LastContact (duration since the last contact with the diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index ff3a7ba4..429d8133 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -58,11 +58,11 @@ func TestGetRaftGroupsExposesCommitApplied(t *testing.T) { srv := NewAdminServer(NodeIdentity{NodeID: "n1"}, nil) srv.RegisterGroup(1, fakeGroupWithContact{leaderID: "n1", term: 2, commit: 99, applied: 97, lastContact: 5 * time.Second}) - // Freeze nowFunc so the computed last-contact timestamp is deterministic. - origNow := nowFunc + // Freeze the per-server clock so the computed last-contact timestamp is + // deterministic. No package-global state is mutated, so other parallel + // tests cannot race through this seam. 
fixed := time.Unix(1_000_000, 0) - nowFunc = func() time.Time { return fixed } - t.Cleanup(func() { nowFunc = origNow }) + srv.SetClock(func() time.Time { return fixed }) resp, err := srv.GetRaftGroups(context.Background(), &pb.GetRaftGroupsRequest{}) if err != nil { diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 2e0501a2..67af7017 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -11,6 +11,7 @@ import ( "encoding/json" "flag" "fmt" + "io" "log" "net" "net/http" @@ -238,19 +239,22 @@ func loadToken(path string, insecureMode bool) (string, error) { if err != nil { return "", errors.Wrap(err, "resolve token path") } - info, err := os.Stat(abs) + // Read through an io.LimitReader bounded to maxTokenFileBytes+1 so a file + // that grows or is swapped between stat() and read() still cannot force + // an oversized allocation. If we can drain one byte past the cap the + // token is too large — reject it. + f, err := os.Open(abs) if err != nil { - return "", errors.Wrap(err, "stat token file") + return "", errors.Wrap(err, "open token file") } - if info.Size() > maxTokenFileBytes { - return "", fmt.Errorf("token file %s is %d bytes; maximum is %d — refusing to load", - abs, info.Size(), maxTokenFileBytes) - } - // Size is bounded above, so materializing the file is safe. - b, err := os.ReadFile(abs) + defer func() { _ = f.Close() }() + b, err := io.ReadAll(io.LimitReader(f, maxTokenFileBytes+1)) if err != nil { return "", errors.Wrap(err, "read token file") } + if len(b) > maxTokenFileBytes { + return "", fmt.Errorf("token file %s exceeds maximum of %d bytes — refusing to load", abs, maxTokenFileBytes) + } token := strings.TrimSpace(string(b)) if token == "" { return "", errors.New("token file is empty") @@ -302,14 +306,16 @@ type nodeClient struct { conn *grpc.ClientConn client pb.AdminClient - // refcount and evicted are protected by fanout.mu. 
They let the cache - // evict entries while RPCs are in flight: eviction removes the entry - // from the map and marks it evicted, and the conn is closed only once - // the last borrower calls release. Without this the previous design - // could cancel healthy in-flight GetClusterOverview calls whenever the - // cache was saturated. + // refcount, evicted, and closed are protected by fanout.mu. They let the + // cache evict entries while RPCs are in flight: eviction removes the + // entry from the map and marks it evicted, and the conn is closed only + // once the last borrower calls release. closed guards against a second + // release on an already-closed client so the public contract (extra + // release() calls are no-ops) holds even when refcount transiently + // bounces back to zero. refcount int evicted bool + closed bool } type membership struct { @@ -370,7 +376,13 @@ func (f *fanout) Close() { // Shutdown is an intentional cancellation of any in-flight RPCs; close // connections eagerly and let borrowers see the cancel. Borrowers that // still hold leases will observe the conn as closed on their next call. + // Mark each client closed so the deferred release path does not attempt + // a double-close. for _, c := range f.clients { + if c.closed { + continue + } + c.closed = true if err := c.conn.Close(); err != nil { log.Printf("elastickv-admin: close gRPC connection to %s: %v", c.addr, err) } @@ -412,7 +424,8 @@ func (f *fanout) clientFor(addr string) (*nodeClient, func(), error) { } // releaseFunc returns the closer used to drop a lease. On the last release -// of an evicted client the underlying connection is finally closed. +// of an evicted client the underlying connection is finally closed. Extra +// release() calls after the conn is already closed are safe no-ops. 
func (f *fanout) releaseFunc(c *nodeClient) func() { return func() { f.mu.Lock() @@ -420,7 +433,8 @@ func (f *fanout) releaseFunc(c *nodeClient) func() { if c.refcount > 0 { c.refcount-- } - if c.refcount == 0 && c.evicted { + if c.refcount == 0 && c.evicted && !c.closed { + c.closed = true if err := c.conn.Close(); err != nil { log.Printf("elastickv-admin: deferred close for %s: %v", c.addr, err) } @@ -458,16 +472,18 @@ func (f *fanout) evictOneLocked() { // retireLocked removes a client from the cache and, if no lease is currently // held, closes its connection. Otherwise the connection stays open until the // last borrower releases, so an evicted entry never cancels an in-flight -// RPC. Caller must hold f.mu. +// RPC. Idempotent — double-retiring or retiring after the last release is a +// no-op. Caller must hold f.mu. func (f *fanout) retireLocked(addr string, c *nodeClient) { delete(f.clients, addr) if c.evicted { return } c.evicted = true - if c.refcount > 0 { + if c.refcount > 0 || c.closed { return } + c.closed = true if err := c.conn.Close(); err != nil { log.Printf("elastickv-admin: retire %s: close: %v", addr, err) } diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index d30d227c..506b3439 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -208,7 +208,7 @@ func TestLoadTokenRejectsOversizedFile(t *testing.T) { t.Fatal(err) } _, err := loadToken(path, false) - if err == nil || !strings.Contains(err.Error(), "maximum is") { + if err == nil || !strings.Contains(err.Error(), "exceeds maximum") { t.Fatalf("expected size-cap error, got %v", err) } } diff --git a/docs/admin_ui_key_visualizer_design.md b/docs/admin_ui_key_visualizer_design.md index 83f75b6d..0ee75404 100644 --- a/docs/admin_ui_key_visualizer_design.md +++ b/docs/admin_ui_key_visualizer_design.md @@ -146,7 +146,7 @@ Writes are sampled exactly once by the current Raft leader before proposal. 
Read The hot path uses lock-free reads for route lookup and counter increments. The data structures used are: -- **Current-window counters**: `routes` is an immutable `routeTable` published through `atomic.Pointer[routeTable]`. `routeTable` owns `map[RouteID]*routeSlot`; each `routeSlot` owns an `atomic.Pointer[routeCounters]`. `Observe` loads the current table, performs a plain map lookup against that immutable snapshot, loads the slot's counter pointer, and uses `atomic.AddUint64` on the counter fields. Adding a new `RouteID` or replacing split/merge mappings performs a copy-on-write table update under a non-hot-path `routesMu`, then publishes the new table with one atomic store. No `Observe` call ever runs against a Go map that can be mutated concurrently. +- **Current-window counters**: `routes` is an immutable `routeTable` published through `atomic.Pointer[routeTable]`. `routeTable` owns `map[RouteID]*routeSlot`; each `routeSlot` owns fixed counter fields (`reads`, `writes`, `readBytes`, `writeBytes`) that are mutated with `atomic.AddUint64`. `Observe` loads the current table, performs a plain map lookup against that immutable snapshot, and increments the slot's counters directly — no counter pointer is ever swapped, so there is no retirement window where a writer could race a flush. Adding a new `RouteID` or replacing split/merge mappings performs a copy-on-write table update under a non-hot-path `routesMu`, then publishes the new table with one atomic store. No `Observe` call ever runs against a Go map that can be mutated concurrently. - **Flush**: the flush goroutine drains each counter in place with `atomic.SwapUint64(&counter, 0)`. The value returned by the swap is the exact count accumulated since the previous flush; subsequent `Observe` calls see the zeroed counter and add to it without contention. 
There is no "old pointer" for late writers to hit — the fast path only ever touches the current counter cell, so no increment can race past the flush snapshot. Split/merge reshapes (§5.4) still go through the copy-on-write `routeTable`, but the counters themselves stay in place and are harvested by `SwapUint64`. No counts are lost and no late-writer cleanup is required. - **Split/merge** (§5.4): the route-watch callback creates the new child slots and publishes a new immutable `routeTable` *before* the `distribution.Engine` exposes the new `RouteID` to the coordinator, so by the time `Observe` sees the new `RouteID` the counter already exists and the callback does not race with the hot path. @@ -173,7 +173,7 @@ Sampler └─ history *ringBuffer[matrixColumn] // one column per stepSeconds (default 60s) ``` -Every `stepSeconds` a flush goroutine swaps each route's counter pointer (§5.1) and drops a new column into the ring buffer. +Every `stepSeconds` a flush goroutine drains each route's counters with `atomic.SwapUint64(&counter, 0)` (§5.1) and drops a new column into the ring buffer. **Route budget and memory cap.** Naïve sizing (`columns × routes × series × 8B`) does not scale: 1 M routes × 1440 columns × 4 series × 8 B = ~46 GiB. Unbounded growth is unacceptable. The sampler enforces a hard budget on tracked routes: @@ -192,7 +192,7 @@ If an operator needs higher fidelity across more routes than the cap allows, the ### 5.4 Keeping up with splits and merges -`distribution.Engine` already emits a watch stream on route-state transitions. The sampler subscribes and, on a split, copies the parent route's historical column values into both children so the heatmap stays visually continuous across the event. On a merge, child columns are summed into the surviving parent. 
Current-window updates use the immutable-table, pointer-swap scheme from §5.1: child `routeSlot`s and `routeCounters` are installed in a freshly copied `routeTable` **before** the `distribution.Engine` publishes the new `RouteID` to the coordinator, so `Observe` never dereferences a missing route. Counts that raced a transition are attributed to whichever `RouteID` the coordinator resolved — acceptable because the loss is bounded by a single step window. +`distribution.Engine` already emits a watch stream on route-state transitions. The sampler subscribes and, on a split, copies the parent route's historical column values into both children so the heatmap stays visually continuous across the event. On a merge, child columns are summed into the surviving parent. Current-window updates use the immutable-table, copy-on-write scheme from §5.1: child `routeSlot`s (each with zeroed counter fields) are installed in a freshly copied `routeTable` **before** the `distribution.Engine` publishes the new `RouteID` to the coordinator, so `Observe` never dereferences a missing route. Counts that raced a transition are attributed to whichever `RouteID` the coordinator resolved — acceptable because the loss is bounded by a single step window. ### 5.5 Bucketing for the response @@ -289,20 +289,20 @@ Because writes are recorded by Raft leaders and follower-local reads are recorde - For each query (`GetKeyVizMatrix`, `GetRouteDetail`, `GetAdapterSummary`), the admin binary issues parallel gRPC calls to every known node and merges results server-side before sending one combined JSON payload to the browser. - Merging rule for the heatmap: rows are grouped by `bucketID`/`lineageID` and time step. Read samples from multiple nodes are **summed**, because they represent distinct locally served reads. 
For write samples the authoritative identity is `(raftGroupID, leaderTerm)` — by Raft invariants at most one leader exists per term per group — so the admin binary collapses write samples to **one value per `(bucketID, raftGroupID, leaderTerm, windowStart)`** key. If the same logical key arrives from more than one node (e.g., an ex-leader that has not yet expired its local cache plus a correctly-responding new leader in the same term), the entries are expected to be identical and the merger keeps one; if they differ, the cell is surfaced with `conflict=true` (not silently dropped). Across distinct `leaderTerm` values for the same group and window, values are summed because each term's leader only observed its own term's writes. The admin binary never uses "later timestamp wins" to overwrite a previous leader's complete window with a new leader's partial window. - Degraded mode: if any node is unreachable, the admin binary returns a partial result with a per-node `{node, ok, error}` status array so the UI can surface "3 of 4 nodes responded" instead of silently hiding ranges. The heatmap hatches rows or time windows whose expected source node failed. -- A single-node mode (`--nodes=one:50051 --no-fanout`) is retained for operators who explicitly want the partial view. +- A single-node mode — pass one address to `--nodes` and the admin binary will fan out to just that node's view. A future `--no-fanout` flag that also suppresses the background membership-discovery RPC is deferred; for now the operator can simulate it by pointing at a single seed and accepting the one-node partial view. ## 10. Performance Considerations -- Sampler fast path on a hit: `atomic.Pointer[routeTable].Load`, immutable map lookup by `RouteID`, `atomic.Pointer[routeCounters].Load`, then `atomic.AddUint64` on the four counters. No allocation per call, no mutex acquisition, no global lock. 
+- Sampler fast path on a hit: `atomic.Pointer[routeTable].Load`, immutable map lookup by `RouteID`, then `atomic.AddUint64` on the slot's four counter fields. No allocation per call, no mutex acquisition, no global lock. - The coordinator already holds the `RouteID` at the hook site, so the sampler does not re-resolve. -- The flush goroutine performs atomic pointer swaps per tracked route; there is no write lock covering `Observe` calls. Splits and merges publish a copied immutable route table with child counters before publishing the new `RouteID` (§5.4), so the callback does not race with the hot path. +- The flush goroutine performs in-place `atomic.SwapUint64` per tracked counter; there is no write lock covering `Observe` calls and no retired pointers for late writers to hit. Splits and merges publish a copied immutable route table with child counters before publishing the new `RouteID` (§5.4), so the callback does not race with the hot path. - API endpoints cap `to − from` at 7 days and `rows` at 1024 to bound server work. - `LiveSummary` adds a second atomic increment alongside each existing Prometheus `Inc()`, plus one atomic increment on a fixed-bucket histogram counter. Cost is on the order of a nanosecond and well below the noise floor in §5.2. - Fan-out cost (§9.1) is N parallel gRPC calls; each node serves only its locally observed samples, so the response size is distributed and the aggregate wall-clock is bounded by the slowest node, not the sum. ## 11. Testing -1. Unit tests for `keyviz.Sampler`: concurrent `Observe` under the `-race` detector while copy-on-write route-table updates run, flush correctness via the pointer-swap protocol, split/merge reshaping, forwarded-read "already sampled" deduplication, and the **accuracy SLO** (1000 trials of synthetic workload must satisfy ±5% relative error at 95% CI per §5.2). +1. 
Unit tests for `keyviz.Sampler`: concurrent `Observe` under the `-race` detector while copy-on-write route-table updates run, flush correctness via the `atomic.SwapUint64` drain protocol (no counts lost across the flush boundary), split/merge reshaping, forwarded-read "already sampled" deduplication, and the **accuracy SLO** (1000 trials of synthetic workload must satisfy ±5% relative error at 95% CI per §5.2). 2. Route-budget test: generate more than `--keyvizMaxTrackedRoutes` routes and assert that coarsening preserves total observed traffic, keeps hot routes un-merged, and returns `aggregate`, `bucketID`, `routeCount`, and constituent route metadata correctly. 3. Integration test in `kv/` that drives synthetic traffic through the coordinator and asserts the matrix reflects the skew. 4. gRPC handler tests with a fake engine and fake Raft status reader. diff --git a/main.go b/main.go index e234d511..970b2211 100644 --- a/main.go +++ b/main.go @@ -4,6 +4,7 @@ import ( "context" "flag" "fmt" + "io" "log" "net" "net/http" @@ -589,19 +590,22 @@ func loadAdminTokenFile(path string) (string, error) { if err != nil { return "", errors.Wrap(err, "resolve admin token path") } - info, err := os.Stat(abs) + // Read through an io.LimitReader bounded to adminTokenMaxBytes+1 so a + // file that grows or is swapped between stat() and read() cannot force + // an oversized allocation; draining one byte past the cap means the + // file is too large and we reject it. 
+ f, err := os.Open(abs) if err != nil { - return "", errors.Wrap(err, "stat admin token file") + return "", errors.Wrap(err, "open admin token file") } - if info.Size() > adminTokenMaxBytes { - return "", fmt.Errorf( - "admin token file %s is %d bytes; maximum is %d", - abs, info.Size(), adminTokenMaxBytes) - } - b, err := os.ReadFile(abs) + defer func() { _ = f.Close() }() + b, err := io.ReadAll(io.LimitReader(f, adminTokenMaxBytes+1)) if err != nil { return "", errors.Wrap(err, "read admin token file") } + if len(b) > adminTokenMaxBytes { + return "", fmt.Errorf("admin token file %s exceeds maximum of %d bytes", abs, adminTokenMaxBytes) + } token := strings.TrimSpace(string(b)) if token == "" { return "", errors.New("admin token file is empty") diff --git a/main_admin_test.go b/main_admin_test.go index 5aa5cdb4..241fb4fc 100644 --- a/main_admin_test.go +++ b/main_admin_test.go @@ -104,7 +104,7 @@ func TestLoadAdminTokenFileRejectsOversize(t *testing.T) { if err := os.WriteFile(path, []byte(strings.Repeat("x", adminTokenMaxBytes+1)), 0o600); err != nil { t.Fatal(err) } - if _, err := loadAdminTokenFile(path); err == nil || !strings.Contains(err.Error(), "maximum is") { + if _, err := loadAdminTokenFile(path); err == nil || !strings.Contains(err.Error(), "exceeds maximum") { t.Fatalf("expected size-cap error, got %v", err) } } From a3c88b499c79339cce685dcc0918a8621ad51a53 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 24 Apr 2026 01:32:32 +0900 Subject: [PATCH 22/30] fix(admin),refactor: address codex+gemini review on 09a38edf Codex P2 + Gemini medium comments on 09a38edf/a126e71a: - adapter/admin_grpc.go: clamp sentinel-negative st.LastContact to zero before computing LastContactUnixMs. raftengine's etcd engine returns -1 when contact is unknown (follower with no leader yet); without the clamp the admin UI would show a future timestamp exactly when contact is unknown, reading as "freshly contacted". 
Added regression test that asserts the clamp. - cmd/elastickv-admin + main.go: extract the shared bearer-token loader to internal.LoadBearerTokenFile so the admin binary and node process share the same LimitReader-bounded read + trim + size-cap logic. Drops duplication flagged by Gemini. - cmd/elastickv-admin/main.go: make the singleflight DoChan result type-assertion safe (unexpected type logs and falls back to seeds instead of panicking, per Gemini). - adapter/admin_grpc.go: gci-format the imports (bootjp-prefix group separated from third-party) to satisfy the reviewdog/golangci pre-check. All tests (incl. -race) pass for adapter, cmd/elastickv-admin, internal, and main packages. --- adapter/admin_grpc.go | 15 ++++++---- adapter/admin_grpc_test.go | 22 ++++++++++++++ cmd/elastickv-admin/main.go | 39 ++++++++---------------- internal/tokenfile.go | 52 ++++++++++++++++++++++++++++++++ internal/tokenfile_test.go | 59 +++++++++++++++++++++++++++++++++++++ main.go | 31 ++++--------------- 6 files changed, 160 insertions(+), 58 deletions(-) create mode 100644 internal/tokenfile.go create mode 100644 internal/tokenfile_test.go diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index c864a14b..e53a3db7 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -17,7 +17,6 @@ import ( "google.golang.org/grpc/status" ) - // AdminGroup exposes per-Raft-group state to the Admin service. It is a narrow // subset of raftengine.Engine so tests can supply an in-memory fake without // standing up a real Raft cluster. @@ -130,16 +129,22 @@ func (s *AdminServer) GetRaftGroups( // Translate LastContact (duration since the last contact with the // leader, per raftengine.Status) into an absolute unix-ms so UI // clients can diff against their own clock instead of having to - // reason about the server's uptime. Zero LastContact (leader on - // self, or no contact recorded yet) reports the current time - // rather than an arbitrary epoch zero. 
+ // reason about the server's uptime. The etcd engine returns a + // sentinel negative duration when the last contact is unknown + // (follower/candidate has never heard from a leader); clamping + // negatives to zero prevents the UI from rendering a future + // timestamp as "freshly contacted". + lastContact := st.LastContact + if lastContact < 0 { + lastContact = 0 + } out = append(out, &pb.RaftGroupState{ RaftGroupId: id, LeaderNodeId: st.Leader.ID, LeaderTerm: st.Term, CommitIndex: st.CommitIndex, AppliedIndex: st.AppliedIndex, - LastContactUnixMs: now.Add(-st.LastContact).UnixMilli(), + LastContactUnixMs: now.Add(-lastContact).UnixMilli(), }) } return &pb.GetRaftGroupsResponse{Groups: out}, nil diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index 429d8133..5d7816cf 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -81,6 +81,28 @@ func TestGetRaftGroupsExposesCommitApplied(t *testing.T) { } } +// TestGetRaftGroupsClampsNegativeLastContact pins the sentinel-negative +// handling for raftengine's "unknown last contact" value (-1). Without the +// clamp, the admin UI would show a future timestamp precisely when leader +// contact is unknown, which reads as "freshly contacted" to operators. 
+func TestGetRaftGroupsClampsNegativeLastContact(t *testing.T) { + t.Parallel() + srv := NewAdminServer(NodeIdentity{NodeID: "n1"}, nil) + srv.RegisterGroup(1, fakeGroupWithContact{leaderID: "n1", term: 1, lastContact: -1}) + + fixed := time.Unix(2_000_000, 0) + srv.SetClock(func() time.Time { return fixed }) + + resp, err := srv.GetRaftGroups(context.Background(), &pb.GetRaftGroupsRequest{}) + if err != nil { + t.Fatal(err) + } + got := resp.Groups[0].LastContactUnixMs + if got != fixed.UnixMilli() { + t.Fatalf("LastContactUnixMs = %d, want %d (now clamped to 0 duration)", got, fixed.UnixMilli()) + } +} + type fakeGroupWithContact struct { leaderID string term uint64 diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 67af7017..2310bed9 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -11,18 +11,17 @@ import ( "encoding/json" "flag" "fmt" - "io" "log" "net" "net/http" "os" "os/signal" - "path/filepath" "strings" "sync" "syscall" "time" + internalutil "github.com/bootjp/elastickv/internal" pb "github.com/bootjp/elastickv/proto" "github.com/cockroachdb/errors" "golang.org/x/sync/singleflight" @@ -235,31 +234,11 @@ func loadToken(path string, insecureMode bool) (string, error) { if insecureMode { return "", errors.New("--adminInsecureNoAuth and --nodeTokenFile are mutually exclusive") } - abs, err := filepath.Abs(path) + tok, err := internalutil.LoadBearerTokenFile(path, maxTokenFileBytes, "admin token") if err != nil { - return "", errors.Wrap(err, "resolve token path") + return "", errors.Wrap(err, "load admin token") } - // Read through an io.LimitReader bounded to maxTokenFileBytes+1 so a file - // that grows or is swapped between stat() and read() still cannot force - // an oversized allocation. If we can drain one byte past the cap the - // token is too large — reject it. 
- f, err := os.Open(abs) - if err != nil { - return "", errors.Wrap(err, "open token file") - } - defer func() { _ = f.Close() }() - b, err := io.ReadAll(io.LimitReader(f, maxTokenFileBytes+1)) - if err != nil { - return "", errors.Wrap(err, "read token file") - } - if len(b) > maxTokenFileBytes { - return "", fmt.Errorf("token file %s exceeds maximum of %d bytes — refusing to load", abs, maxTokenFileBytes) - } - token := strings.TrimSpace(string(b)) - if token == "" { - return "", errors.New("token file is empty") - } - return token, nil + return tok, nil } // loadTransportCredentials builds the gRPC TransportCredentials used to dial @@ -538,8 +517,14 @@ func (f *fanout) currentTargets(ctx context.Context) []string { }) select { case r := <-ch: - addrs, _ := r.Val.([]string) - return addrs + // refreshMembership always returns a []string today, but explicitly + // check the assertion so a future return-type change turns into a + // loud, non-panicking fallback to seeds instead of a silent crash. + if addrs, ok := r.Val.([]string); ok { + return addrs + } + log.Printf("elastickv-admin: membership refresh returned unexpected type %T; falling back to seeds", r.Val) + return append([]string(nil), f.seeds...) case <-ctx.Done(): // Caller bailed. Give them whatever targets we can assemble without // blocking: the last cached membership if we have one, else seeds. diff --git a/internal/tokenfile.go b/internal/tokenfile.go new file mode 100644 index 00000000..841e7b3b --- /dev/null +++ b/internal/tokenfile.go @@ -0,0 +1,52 @@ +package internal + +import ( + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/cockroachdb/errors" +) + +// LoadBearerTokenFile materialises a bearer-token file with a strict upper +// bound on size so a misconfigured path (for example, pointing at a log) +// cannot force an arbitrary allocation before the bearer-token check. 
+// The file is read through an io.LimitReader bounded to maxBytes+1 so a +// file that grows or is swapped between stat() and read() still cannot +// sneak past the cap. +// +// The returned string has surrounding whitespace trimmed; an empty file (or +// one that is only whitespace) is reported as an error so operators notice +// the misconfiguration immediately. +// +// The humanName is used in error messages to distinguish token files (e.g. +// "admin token" vs "node token"); callers typically pass a fixed string like +// "admin token" or "node token". +func LoadBearerTokenFile(path string, maxBytes int64, humanName string) (string, error) { + if humanName == "" { + humanName = "token" + } + abs, err := filepath.Abs(path) + if err != nil { + return "", errors.Wrapf(err, "resolve %s path", humanName) + } + f, err := os.Open(abs) + if err != nil { + return "", errors.Wrapf(err, "open %s file", humanName) + } + defer func() { _ = f.Close() }() + b, err := io.ReadAll(io.LimitReader(f, maxBytes+1)) + if err != nil { + return "", errors.Wrapf(err, "read %s file", humanName) + } + if int64(len(b)) > maxBytes { + return "", fmt.Errorf("%s file %s exceeds maximum of %d bytes", humanName, abs, maxBytes) + } + tok := strings.TrimSpace(string(b)) + if tok == "" { + return "", fmt.Errorf("%s file %s is empty", humanName, abs) + } + return tok, nil +} diff --git a/internal/tokenfile_test.go b/internal/tokenfile_test.go new file mode 100644 index 00000000..a8a55057 --- /dev/null +++ b/internal/tokenfile_test.go @@ -0,0 +1,59 @@ +package internal + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestLoadBearerTokenFileHappyPath(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "tok") + if err := os.WriteFile(path, []byte("\n s3cret \n"), 0o600); err != nil { + t.Fatal(err) + } + got, err := LoadBearerTokenFile(path, 4<<10, "admin token") + if err != nil { + t.Fatalf("LoadBearerTokenFile: %v", err) + } + if got != "s3cret" { 
+ t.Fatalf("tok = %q, want s3cret", got) + } +} + +func TestLoadBearerTokenFileRejectsEmpty(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "empty") + if err := os.WriteFile(path, []byte(" \n"), 0o600); err != nil { + t.Fatal(err) + } + _, err := LoadBearerTokenFile(path, 4<<10, "admin token") + if err == nil || !strings.Contains(err.Error(), "is empty") { + t.Fatalf("want empty-file error, got %v", err) + } +} + +func TestLoadBearerTokenFileRejectsOversize(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "huge") + const cap_ = 64 + if err := os.WriteFile(path, []byte(strings.Repeat("x", cap_+1)), 0o600); err != nil { + t.Fatal(err) + } + _, err := LoadBearerTokenFile(path, cap_, "admin token") + if err == nil || !strings.Contains(err.Error(), "exceeds maximum") { + t.Fatalf("want oversize error, got %v", err) + } +} + +func TestLoadBearerTokenFileMissingFile(t *testing.T) { + t.Parallel() + _, err := LoadBearerTokenFile("/definitely/not/there", 4<<10, "admin token") + if err == nil { + t.Fatal("expected open-failure error") + } +} diff --git a/main.go b/main.go index 970b2211..514ca929 100644 --- a/main.go +++ b/main.go @@ -3,8 +3,6 @@ package main import ( "context" "flag" - "fmt" - "io" "log" "net" "net/http" @@ -584,33 +582,14 @@ func configureAdminService( // loadAdminTokenFile materialises --adminTokenFile with a strict upper bound // so a misconfigured path (for example a log file) cannot force an arbitrary -// allocation before the bearer-token check. +// allocation before the bearer-token check. Delegates to the shared helper in +// internal/ so the admin binary and the node process read tokens identically. 
func loadAdminTokenFile(path string) (string, error) { - abs, err := filepath.Abs(path) + tok, err := internalutil.LoadBearerTokenFile(path, adminTokenMaxBytes, "admin token") if err != nil { - return "", errors.Wrap(err, "resolve admin token path") + return "", errors.Wrap(err, "load admin token") } - // Read through an io.LimitReader bounded to adminTokenMaxBytes+1 so a - // file that grows or is swapped between stat() and read() cannot force - // an oversized allocation; draining one byte past the cap means the - // file is too large and we reject it. - f, err := os.Open(abs) - if err != nil { - return "", errors.Wrap(err, "open admin token file") - } - defer func() { _ = f.Close() }() - b, err := io.ReadAll(io.LimitReader(f, adminTokenMaxBytes+1)) - if err != nil { - return "", errors.Wrap(err, "read admin token file") - } - if len(b) > adminTokenMaxBytes { - return "", fmt.Errorf("admin token file %s exceeds maximum of %d bytes", abs, adminTokenMaxBytes) - } - token := strings.TrimSpace(string(b)) - if token == "" { - return "", errors.New("admin token file is empty") - } - return token, nil + return tok, nil } // startMonitoringCollectors wires up the per-tick Prometheus From 1bd4725018e672ee9ce638dc2e1b8ad8b98fdb71 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 24 Apr 2026 01:40:25 +0900 Subject: [PATCH 23/30] fix(admin): chain admin interceptors centrally in startRaftServers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini medium on 09a38edf: returning grpc.ChainUnaryInterceptor(unary) as a grpc.ServerOption from configureAdminService risked silent loss of the admin auth gate if anyone ever added a grpc.UnaryInterceptor (single-interceptor) option to internalutil.GRPCServerOptions() — gRPC-Go keeps only the last option of the same type. 
Change configureAdminService and setupAdminService to return raw interceptor slices in an adminGRPCInterceptors struct, and let startRaftServers collapse them with any other admin interceptors into a single grpc.ChainUnaryInterceptor / ChainStreamInterceptor call at registration time. No existing option is overwritten, and future additions fall into the same chain without special handling. Tests updated to assert the slice shape (unary:1, stream:1 for token gate; empty for disabled / insecure). --- main.go | 65 ++++++++++++++++++++++++++++++++-------------- main_admin_test.go | 20 +++++++------- 2 files changed, 55 insertions(+), 30 deletions(-) diff --git a/main.go b/main.go index 514ca929..5dd2ba2c 100644 --- a/main.go +++ b/main.go @@ -499,19 +499,19 @@ func setupAdminService( nodeID, grpcAddress string, runtimes []*raftGroupRuntime, bootstrapServers []raftengine.Server, -) (*adapter.AdminServer, []grpc.ServerOption, error) { +) (*adapter.AdminServer, adminGRPCInterceptors, error) { members := adminMembersFromBootstrap(nodeID, bootstrapServers) - srv, opts, err := configureAdminService( + srv, icept, err := configureAdminService( *adminTokenFile, *adminInsecureNoAuth, adapter.NodeIdentity{NodeID: nodeID, GRPCAddress: grpcAddress}, members, ) if err != nil { - return nil, nil, err + return nil, adminGRPCInterceptors{}, err } if srv == nil { - return nil, nil, nil + return nil, adminGRPCInterceptors{}, nil } for _, rt := range runtimes { srv.RegisterGroup(rt.spec.id, rt.engine) @@ -519,7 +519,7 @@ func setupAdminService( if *adminInsecureNoAuth { log.Printf("WARNING: --adminInsecureNoAuth is set; Admin gRPC service exposed without authentication") } - return srv, opts, nil + return srv, icept, nil } // adminMembersFromBootstrap extracts the peer list (everyone except self) from @@ -543,41 +543,56 @@ func adminMembersFromBootstrap(selfID string, servers []raftengine.Server) []ada return out } -// configureAdminService builds the node-side AdminServer plus the gRPC 
-// interceptor options that enforce its bearer token, or returns (nil, nil,
-// nil) when the service is intentionally disabled. It is mutually exclusive
-// with --adminInsecureNoAuth so operators have to opt into the unauthenticated
+// adminGRPCInterceptors bundles the unary+stream interceptors that enforce the
+// Admin bearer token. Returning the raw interceptor functions (rather than
+// pre-wrapped grpc.ServerOption values via grpc.ChainUnaryInterceptor) lets
+// the registration site combine them with any other interceptors in a single
+// ChainUnaryInterceptor call, whereas mixing pre-wrapped option values risks
+// silent overwrites (gRPC-Go: last option of the same type wins).
+type adminGRPCInterceptors struct {
+	unary  []grpc.UnaryServerInterceptor
+	stream []grpc.StreamServerInterceptor
+}
+
+func (a adminGRPCInterceptors) empty() bool {
+	return len(a.unary) == 0 && len(a.stream) == 0
+}
+
+// configureAdminService builds the node-side AdminServer plus the interceptor
+// set that enforces its bearer token, or returns (nil, {}, nil) when the
+// service is intentionally disabled. It is mutually exclusive with
+// --adminInsecureNoAuth so operators have to opt into the unauthenticated
 // mode explicitly.
func configureAdminService( tokenPath string, insecureNoAuth bool, self adapter.NodeIdentity, members []adapter.NodeIdentity, -) (*adapter.AdminServer, []grpc.ServerOption, error) { +) (*adapter.AdminServer, adminGRPCInterceptors, error) { if tokenPath == "" && !insecureNoAuth { - return nil, nil, nil + return nil, adminGRPCInterceptors{}, nil } if tokenPath != "" && insecureNoAuth { - return nil, nil, errors.New("--adminInsecureNoAuth and --adminTokenFile are mutually exclusive") + return nil, adminGRPCInterceptors{}, errors.New("--adminInsecureNoAuth and --adminTokenFile are mutually exclusive") } token := "" if tokenPath != "" { loaded, err := loadAdminTokenFile(tokenPath) if err != nil { - return nil, nil, err + return nil, adminGRPCInterceptors{}, err } token = loaded } srv := adapter.NewAdminServer(self, members) unary, stream := adapter.AdminTokenAuth(token) - var opts []grpc.ServerOption + var icept adminGRPCInterceptors if unary != nil { - opts = append(opts, grpc.ChainUnaryInterceptor(unary)) + icept.unary = append(icept.unary, unary) } if stream != nil { - opts = append(opts, grpc.ChainStreamInterceptor(stream)) + icept.stream = append(icept.stream, stream) } - return srv, opts, nil + return srv, icept, nil } // loadAdminTokenFile materialises --adminTokenFile with a strict upper bound @@ -661,11 +676,21 @@ func startRaftServers( relay *adapter.RedisPubSubRelay, proposalObserverForGroup func(uint64) kv.ProposalObserver, adminServer *adapter.AdminServer, - adminGRPCOpts []grpc.ServerOption, + adminGRPCOpts adminGRPCInterceptors, ) error { for _, rt := range runtimes { opts := append([]grpc.ServerOption(nil), internalutil.GRPCServerOptions()...) - opts = append(opts, adminGRPCOpts...) 
+ // Collapse all interceptors into a single ChainUnaryInterceptor / + // ChainStreamInterceptor call so a future grpc.UnaryInterceptor + // (single-interceptor) option added anywhere in this chain cannot + // silently overwrite the admin auth gate — gRPC-Go keeps only the + // last option of the same type. + if len(adminGRPCOpts.unary) > 0 { + opts = append(opts, grpc.ChainUnaryInterceptor(adminGRPCOpts.unary...)) + } + if len(adminGRPCOpts.stream) > 0 { + opts = append(opts, grpc.ChainStreamInterceptor(adminGRPCOpts.stream...)) + } gs := grpc.NewServer(opts...) trx := kv.NewTransactionWithProposer(rt.engine, kv.WithProposalObserver(observerForGroup(proposalObserverForGroup, rt.spec.id))) grpcSvc := adapter.NewGRPCServer(shardStore, coordinate) @@ -897,7 +922,7 @@ type runtimeServerRunner struct { coordinate kv.Coordinator distServer *adapter.DistributionServer adminServer *adapter.AdminServer - adminGRPCOpts []grpc.ServerOption + adminGRPCOpts adminGRPCInterceptors redisAddress string leaderRedis map[string]string pubsubRelay *adapter.RedisPubSubRelay diff --git a/main_admin_test.go b/main_admin_test.go index 241fb4fc..d4f9c8bc 100644 --- a/main_admin_test.go +++ b/main_admin_test.go @@ -12,12 +12,12 @@ import ( func TestConfigureAdminServiceDisabledByDefault(t *testing.T) { t.Parallel() - srv, opts, err := configureAdminService("", false, adapter.NodeIdentity{NodeID: "n1"}, nil) + srv, icept, err := configureAdminService("", false, adapter.NodeIdentity{NodeID: "n1"}, nil) if err != nil { t.Fatalf("disabled-by-default should not error: %v", err) } - if srv != nil || opts != nil { - t.Fatalf("disabled service should return nil, nil; got %v %v", srv, opts) + if srv != nil || !icept.empty() { + t.Fatalf("disabled service should return nil server and empty interceptors; got %v %+v", srv, icept) } } @@ -40,30 +40,30 @@ func TestConfigureAdminServiceTokenFile(t *testing.T) { if err := os.WriteFile(tokPath, []byte("hunter2\n"), 0o600); err != nil { t.Fatal(err) } - srv, 
opts, err := configureAdminService(tokPath, false, adapter.NodeIdentity{NodeID: "n1"}, nil) + srv, icept, err := configureAdminService(tokPath, false, adapter.NodeIdentity{NodeID: "n1"}, nil) if err != nil { t.Fatalf("configureAdminService: %v", err) } if srv == nil { t.Fatal("expected an AdminServer instance") } - // Expect a unary + stream interceptor for the admin-token gate. - if len(opts) != 2 { - t.Fatalf("expected 2 grpc.ServerOption (unary + stream), got %d", len(opts)) + // Expect one unary + one stream interceptor for the admin-token gate. + if len(icept.unary) != 1 || len(icept.stream) != 1 { + t.Fatalf("expected 1 unary + 1 stream interceptor, got %d + %d", len(icept.unary), len(icept.stream)) } } func TestConfigureAdminServiceInsecureNoAuth(t *testing.T) { t.Parallel() - srv, opts, err := configureAdminService("", true, adapter.NodeIdentity{NodeID: "n1"}, nil) + srv, icept, err := configureAdminService("", true, adapter.NodeIdentity{NodeID: "n1"}, nil) if err != nil { t.Fatalf("insecure mode should succeed: %v", err) } if srv == nil { t.Fatal("expected AdminServer in insecure mode") } - if len(opts) != 0 { - t.Fatalf("insecure mode should not attach interceptors, got %d", len(opts)) + if !icept.empty() { + t.Fatalf("insecure mode should not attach interceptors, got %+v", icept) } } From e1f0e532f30438217e0b3a8d49ab6a253b3cc72a Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 24 Apr 2026 01:49:53 +0900 Subject: [PATCH 24/30] feat(admin): dynamic membership + unknown-contact sentinel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini medium items on a3c88b49: - adapter/admin_grpc.go (GetRaftGroups): report LastContactUnixMs=0 (epoch sentinel) when raftengine returns a negative LastContact (unknown — e.g., follower that has never heard from a leader). Previously the clamp collapsed to `now`, which the UI could not distinguish from "freshly contacted". 
proto/admin.proto documents the sentinel; regenerated admin.pb.go. - adapter/admin_grpc.go (GetClusterOverview): the member list is now the union of the bootstrap seed and each registered Raft group's live Configuration(). Scale-out nodes added after admin-server startup are now discoverable without restarting the admin binary. A per-group Configuration error does NOT fail the RPC — the seed list still produces useful output. - adapter.AdminGroup interface extended with Configuration(ctx); in-tree raftengine.Engine already satisfies it (ConfigReader). Test fakes updated. Tests added: - unknown LastContact → 0 instead of now. - scale-out node discovered via Configuration(). - Configuration error on one group does not fail the overview. --- adapter/admin_grpc.go | 82 +++++++++++++++++++++++++++++------- adapter/admin_grpc_test.go | 85 ++++++++++++++++++++++++++++++++++---- proto/admin.pb.go | 18 ++++---- proto/admin.proto | 4 ++ 4 files changed, 158 insertions(+), 31 deletions(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index e53a3db7..43b9e17b 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -19,9 +19,12 @@ import ( // AdminGroup exposes per-Raft-group state to the Admin service. It is a narrow // subset of raftengine.Engine so tests can supply an in-memory fake without -// standing up a real Raft cluster. +// standing up a real Raft cluster. Configuration is polled on each +// GetClusterOverview to pick up scale-out / scale-in events without the +// operator having to restart the admin binary. 
type AdminGroup interface { Status() raftengine.Status + Configuration(ctx context.Context) (raftengine.Configuration, error) } // NodeIdentity is the value form of the protobuf NodeIdentity message used for @@ -94,18 +97,18 @@ func (s *AdminServer) RegisterGroup(groupID uint64, g AdminGroup) { s.groupsMu.Unlock() } -// GetClusterOverview returns the local node identity, the configured member +// GetClusterOverview returns the local node identity, the current member // list, and per-group leader identity collected from the engines registered -// via RegisterGroup. +// via RegisterGroup. The member list is the union of (a) the bootstrap seed +// supplied to NewAdminServer and (b) the live Configuration of every +// registered Raft group — the latter picks up scale-out nodes added after +// startup so the admin binary's fan-out discovery does not miss them. func (s *AdminServer) GetClusterOverview( - _ context.Context, + ctx context.Context, _ *pb.GetClusterOverviewRequest, ) (*pb.GetClusterOverviewResponse, error) { leaders := s.snapshotLeaders() - members := make([]*pb.NodeIdentity, 0, len(s.members)) - for _, m := range s.members { - members = append(members, m.toProto()) - } + members := s.snapshotMembers(ctx) return &pb.GetClusterOverviewResponse{ Self: s.self.toProto(), Members: members, @@ -113,6 +116,52 @@ func (s *AdminServer) GetClusterOverview( }, nil } +// snapshotMembers unions the seed members with the live Configuration of each +// registered group (deduplicating by NodeID). Configuration errors are logged +// via the returned error in the per-member accumulator; they never fail the +// overall RPC because the seed list is always available as a fallback. 
+func (s *AdminServer) snapshotMembers(ctx context.Context) []*pb.NodeIdentity { + seen := make(map[string]struct{}) + out := make([]*pb.NodeIdentity, 0, len(s.members)) + add := func(id, addr string) { + if id == "" || id == s.self.NodeID { + return + } + if _, dup := seen[id]; dup { + return + } + seen[id] = struct{}{} + out = append(out, &pb.NodeIdentity{NodeId: id, GrpcAddress: addr}) + } + + // Seed members first — stable order when a group Configuration call + // errors or returns an empty list. + for _, m := range s.members { + add(m.NodeID, m.GRPCAddress) + } + + s.groupsMu.RLock() + groups := make([]AdminGroup, 0, len(s.groups)) + for _, g := range s.groups { + groups = append(groups, g) + } + s.groupsMu.RUnlock() + + for _, g := range groups { + cfg, err := g.Configuration(ctx) + if err != nil { + // A single group failing to report its ConfState does not fail + // the RPC; the seed list and other groups still produce useful + // output. + continue + } + for _, srv := range cfg.Servers { + add(srv.ID, srv.Address) + } + } + return out +} + // GetRaftGroups returns per-group state snapshots. Phase 0 wires commit/applied // indices only; per-follower contact and term history land in later phases. func (s *AdminServer) GetRaftGroups( @@ -130,13 +179,14 @@ func (s *AdminServer) GetRaftGroups( // leader, per raftengine.Status) into an absolute unix-ms so UI // clients can diff against their own clock instead of having to // reason about the server's uptime. The etcd engine returns a - // sentinel negative duration when the last contact is unknown - // (follower/candidate has never heard from a leader); clamping - // negatives to zero prevents the UI from rendering a future - // timestamp as "freshly contacted". - lastContact := st.LastContact - if lastContact < 0 { - lastContact = 0 + // sentinel negative duration when contact is unknown (e.g., a + // follower that has never heard from a leader). 
Report that case + // as `LastContactUnixMs=0` (epoch) so the UI can render "unknown" + // / "never contacted" rather than treating it as "freshly + // contacted just now". + var lastContactUnixMs int64 + if st.LastContact >= 0 { + lastContactUnixMs = now.Add(-st.LastContact).UnixMilli() } out = append(out, &pb.RaftGroupState{ RaftGroupId: id, @@ -144,7 +194,7 @@ func (s *AdminServer) GetRaftGroups( LeaderTerm: st.Term, CommitIndex: st.CommitIndex, AppliedIndex: st.AppliedIndex, - LastContactUnixMs: now.Add(-lastContact).UnixMilli(), + LastContactUnixMs: lastContactUnixMs, }) } return &pb.GetRaftGroupsResponse{Groups: out}, nil diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index 5d7816cf..5b435820 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -18,6 +18,8 @@ type fakeGroup struct { term uint64 commit uint64 applied uint64 + servers []raftengine.Server + cfgErr error } func (f fakeGroup) Status() raftengine.Status { @@ -29,6 +31,13 @@ func (f fakeGroup) Status() raftengine.Status { } } +func (f fakeGroup) Configuration(context.Context) (raftengine.Configuration, error) { + if f.cfgErr != nil { + return raftengine.Configuration{}, f.cfgErr + } + return raftengine.Configuration{Servers: append([]raftengine.Server(nil), f.servers...)}, nil +} + func TestGetClusterOverviewReturnsSelfAndLeaders(t *testing.T) { t.Parallel() srv := NewAdminServer( @@ -81,11 +90,68 @@ func TestGetRaftGroupsExposesCommitApplied(t *testing.T) { } } -// TestGetRaftGroupsClampsNegativeLastContact pins the sentinel-negative -// handling for raftengine's "unknown last contact" value (-1). Without the -// clamp, the admin UI would show a future timestamp precisely when leader -// contact is unknown, which reads as "freshly contacted" to operators. 
-func TestGetRaftGroupsClampsNegativeLastContact(t *testing.T) { +// TestGetClusterOverviewUnionsSeedsAndLiveConfig asserts that +// GetClusterOverview picks up a node that was added to a Raft group after the +// admin server was constructed (scale-out). Without Configuration polling, +// the static seed list would miss it entirely. +func TestGetClusterOverviewUnionsSeedsAndLiveConfig(t *testing.T) { + t.Parallel() + srv := NewAdminServer( + NodeIdentity{NodeID: "n1", GRPCAddress: "10.0.0.11:50051"}, + []NodeIdentity{{NodeID: "n2", GRPCAddress: "10.0.0.12:50051"}}, + ) + // Group reports a member (n3) that is NOT in the bootstrap seed list. + srv.RegisterGroup(1, fakeGroup{ + leaderID: "n1", term: 1, + servers: []raftengine.Server{ + {ID: "n1", Address: "10.0.0.11:50051"}, + {ID: "n2", Address: "10.0.0.12:50051"}, + {ID: "n3", Address: "10.0.0.13:50051"}, + }, + }) + + resp, err := srv.GetClusterOverview(context.Background(), &pb.GetClusterOverviewRequest{}) + if err != nil { + t.Fatal(err) + } + ids := make(map[string]string) + for _, m := range resp.Members { + ids[m.NodeId] = m.GrpcAddress + } + // Self (n1) is excluded; both seed (n2) and live-config (n3) must appear. + if len(ids) != 2 { + t.Fatalf("members = %v, want {n2, n3}", ids) + } + if ids["n2"] != "10.0.0.12:50051" || ids["n3"] != "10.0.0.13:50051" { + t.Fatalf("unexpected members %v", ids) + } +} + +// TestGetClusterOverviewSurvivesConfigurationError asserts that a group that +// errors on Configuration() does NOT fail the RPC — seed members are still +// returned. 
+func TestGetClusterOverviewSurvivesConfigurationError(t *testing.T) { + t.Parallel() + srv := NewAdminServer( + NodeIdentity{NodeID: "n1"}, + []NodeIdentity{{NodeID: "n2", GRPCAddress: "10.0.0.12:50051"}}, + ) + srv.RegisterGroup(1, fakeGroup{leaderID: "n1", cfgErr: context.DeadlineExceeded}) + + resp, err := srv.GetClusterOverview(context.Background(), &pb.GetClusterOverviewRequest{}) + if err != nil { + t.Fatalf("overview should not fail on group config error: %v", err) + } + if len(resp.Members) != 1 || resp.Members[0].NodeId != "n2" { + t.Fatalf("unexpected members %v", resp.Members) + } +} + +// TestGetRaftGroupsMapsUnknownLastContactToZero pins the sentinel-negative +// handling for raftengine's "unknown last contact" value (-1). The RPC +// reports 0 (epoch) in that case so the UI renders "unknown" rather than +// "contacted just now". +func TestGetRaftGroupsMapsUnknownLastContactToZero(t *testing.T) { t.Parallel() srv := NewAdminServer(NodeIdentity{NodeID: "n1"}, nil) srv.RegisterGroup(1, fakeGroupWithContact{leaderID: "n1", term: 1, lastContact: -1}) @@ -97,9 +163,8 @@ func TestGetRaftGroupsClampsNegativeLastContact(t *testing.T) { if err != nil { t.Fatal(err) } - got := resp.Groups[0].LastContactUnixMs - if got != fixed.UnixMilli() { - t.Fatalf("LastContactUnixMs = %d, want %d (now clamped to 0 duration)", got, fixed.UnixMilli()) + if got := resp.Groups[0].LastContactUnixMs; got != 0 { + t.Fatalf("LastContactUnixMs = %d, want 0 (unknown sentinel)", got) } } @@ -121,6 +186,10 @@ func (f fakeGroupWithContact) Status() raftengine.Status { } } +func (f fakeGroupWithContact) Configuration(context.Context) (raftengine.Configuration, error) { + return raftengine.Configuration{}, nil +} + // TestGroupOrderingIsStable locks in deterministic ascending-by-RaftGroupId // ordering so admin UIs and diff-based tests do not see rows jump around. 
func TestGroupOrderingIsStable(t *testing.T) { diff --git a/proto/admin.pb.go b/proto/admin.pb.go index 5845642e..5b0d08e6 100644 --- a/proto/admin.pb.go +++ b/proto/admin.pb.go @@ -345,13 +345,17 @@ func (x *GetClusterOverviewResponse) GetAggregateQps() uint64 { } type RaftGroupState struct { - state protoimpl.MessageState `protogen:"open.v1"` - RaftGroupId uint64 `protobuf:"varint,1,opt,name=raft_group_id,json=raftGroupId,proto3" json:"raft_group_id,omitempty"` - LeaderNodeId string `protobuf:"bytes,2,opt,name=leader_node_id,json=leaderNodeId,proto3" json:"leader_node_id,omitempty"` - LeaderTerm uint64 `protobuf:"varint,3,opt,name=leader_term,json=leaderTerm,proto3" json:"leader_term,omitempty"` - CommitIndex uint64 `protobuf:"varint,4,opt,name=commit_index,json=commitIndex,proto3" json:"commit_index,omitempty"` - AppliedIndex uint64 `protobuf:"varint,5,opt,name=applied_index,json=appliedIndex,proto3" json:"applied_index,omitempty"` - LastContactUnixMs int64 `protobuf:"varint,6,opt,name=last_contact_unix_ms,json=lastContactUnixMs,proto3" json:"last_contact_unix_ms,omitempty"` + state protoimpl.MessageState `protogen:"open.v1"` + RaftGroupId uint64 `protobuf:"varint,1,opt,name=raft_group_id,json=raftGroupId,proto3" json:"raft_group_id,omitempty"` + LeaderNodeId string `protobuf:"bytes,2,opt,name=leader_node_id,json=leaderNodeId,proto3" json:"leader_node_id,omitempty"` + LeaderTerm uint64 `protobuf:"varint,3,opt,name=leader_term,json=leaderTerm,proto3" json:"leader_term,omitempty"` + CommitIndex uint64 `protobuf:"varint,4,opt,name=commit_index,json=commitIndex,proto3" json:"commit_index,omitempty"` + AppliedIndex uint64 `protobuf:"varint,5,opt,name=applied_index,json=appliedIndex,proto3" json:"applied_index,omitempty"` + // last_contact_unix_ms is the unix-ms timestamp of the most recent leader + // contact observed on this node. 
Zero means "unknown" (for example, the + // engine is a follower that has never heard from a leader); UIs should + // render that case as "unknown" rather than "contacted at epoch". + LastContactUnixMs int64 `protobuf:"varint,6,opt,name=last_contact_unix_ms,json=lastContactUnixMs,proto3" json:"last_contact_unix_ms,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } diff --git a/proto/admin.proto b/proto/admin.proto index 3b476166..d0e61ede 100644 --- a/proto/admin.proto +++ b/proto/admin.proto @@ -41,6 +41,10 @@ message RaftGroupState { uint64 leader_term = 3; uint64 commit_index = 4; uint64 applied_index = 5; + // last_contact_unix_ms is the unix-ms timestamp of the most recent leader + // contact observed on this node. Zero means "unknown" (for example, the + // engine is a follower that has never heard from a leader); UIs should + // render that case as "unknown" rather than "contacted at epoch". int64 last_contact_unix_ms = 6; } From 37a043529ef6cafc4f2be08feaa85bc4a50b5d4a Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 02:15:52 +0900 Subject: [PATCH 25/30] fix(admin): live Raft config wins over stale bootstrap seed in overview Codex P2 on e1f0e532: snapshotMembers seeded out[] first, so a stale bootstrap entry for a readdressed node (same NodeID, new GRPCAddress) silently outvoted the current Raft Configuration. GetClusterOverview kept advertising the old endpoint and the admin fanout dialed the wrong target. Collect live Configuration entries first, then add seeds only for NodeIDs no group reported. A readdressed n2 now shows its current 10.0.0.22:50051 instead of the stale 10.0.0.12:50051 bootstrap value. Regression test pins the behavior. 
--- adapter/admin_grpc.go | 89 ++++++++++++++++++++++++++------------ adapter/admin_grpc_test.go | 34 +++++++++++++++ 2 files changed, 96 insertions(+), 27 deletions(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index 43b9e17b..5d007104 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -117,49 +117,84 @@ func (s *AdminServer) GetClusterOverview( } // snapshotMembers unions the seed members with the live Configuration of each -// registered group (deduplicating by NodeID). Configuration errors are logged -// via the returned error in the per-member accumulator; they never fail the -// overall RPC because the seed list is always available as a fallback. +// registered group, preferring the live address when the same NodeID appears +// in both sources. A stale bootstrap entry cannot outvote a readdressed node: +// if n2 was moved from 10.0.0.12 to 10.0.0.22, the overview reports the +// current 10.0.0.22 so fan-out dials the right target. Configuration errors +// on a single group do not fail the RPC — other groups plus the seed list +// still produce useful output. func (s *AdminServer) snapshotMembers(ctx context.Context) []*pb.NodeIdentity { - seen := make(map[string]struct{}) - out := make([]*pb.NodeIdentity, 0, len(s.members)) - add := func(id, addr string) { - if id == "" || id == s.self.NodeID { - return - } - if _, dup := seen[id]; dup { - return - } - seen[id] = struct{}{} - out = append(out, &pb.NodeIdentity{NodeId: id, GrpcAddress: addr}) - } + groups := s.cloneGroups() + addrByID, order := collectLiveMembers(ctx, groups, s.self.NodeID) + mergeSeedMembers(s.members, s.self.NodeID, addrByID, &order) - // Seed members first — stable order when a group Configuration call - // errors or returns an empty list. 
- for _, m := range s.members { - add(m.NodeID, m.GRPCAddress) + out := make([]*pb.NodeIdentity, 0, len(order)) + for _, id := range order { + out = append(out, &pb.NodeIdentity{NodeId: id, GrpcAddress: addrByID[id]}) } + return out +} +// cloneGroups snapshots the registered groups under the read lock so the +// caller can iterate without holding groupsMu while invoking Configuration +// (which may block on Raft). +func (s *AdminServer) cloneGroups() []AdminGroup { s.groupsMu.RLock() - groups := make([]AdminGroup, 0, len(s.groups)) + defer s.groupsMu.RUnlock() + out := make([]AdminGroup, 0, len(s.groups)) for _, g := range s.groups { - groups = append(groups, g) + out = append(out, g) } - s.groupsMu.RUnlock() + return out +} +// collectLiveMembers polls Configuration for each group and returns the union +// of server IDs (excluding self) with their live addresses. The order slice +// preserves first-seen iteration order for stable output. +func collectLiveMembers( + ctx context.Context, + groups []AdminGroup, + selfID string, +) (addrByID map[string]string, order []string) { + addrByID = make(map[string]string) + order = make([]string, 0) for _, g := range groups { cfg, err := g.Configuration(ctx) if err != nil { - // A single group failing to report its ConfState does not fail - // the RPC; the seed list and other groups still produce useful - // output. continue } for _, srv := range cfg.Servers { - add(srv.ID, srv.Address) + if srv.ID == "" || srv.ID == selfID { + continue + } + if _, dup := addrByID[srv.ID]; dup { + continue + } + addrByID[srv.ID] = srv.Address + order = append(order, srv.ID) } } - return out + return addrByID, order +} + +// mergeSeedMembers fills in seed entries for IDs no live Configuration +// reported. Seeds never overwrite a live address. 
+func mergeSeedMembers( + seeds []NodeIdentity, + selfID string, + addrByID map[string]string, + order *[]string, +) { + for _, m := range seeds { + if m.NodeID == "" || m.NodeID == selfID { + continue + } + if _, known := addrByID[m.NodeID]; known { + continue + } + addrByID[m.NodeID] = m.GRPCAddress + *order = append(*order, m.NodeID) + } } // GetRaftGroups returns per-group state snapshots. Phase 0 wires commit/applied diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index 5b435820..f8158232 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -127,6 +127,40 @@ func TestGetClusterOverviewUnionsSeedsAndLiveConfig(t *testing.T) { } } +// TestGetClusterOverviewLiveConfigWinsOverStaleSeed asserts that when a node +// is readdressed (same NodeID, new GRPCAddress), the live Raft Configuration +// wins over the stale bootstrap seed so fan-out dials the current endpoint. +// Codex P2 on e1f0e532: previously seed was added first and later entries +// with the same ID were ignored, silently pinning the old address. +func TestGetClusterOverviewLiveConfigWinsOverStaleSeed(t *testing.T) { + t.Parallel() + srv := NewAdminServer( + NodeIdentity{NodeID: "n1", GRPCAddress: "10.0.0.11:50051"}, + // Bootstrap: n2 lived at 10.0.0.12. + []NodeIdentity{{NodeID: "n2", GRPCAddress: "10.0.0.12:50051"}}, + ) + // Raft config reports n2 moved to 10.0.0.22. 
+ srv.RegisterGroup(1, fakeGroup{ + leaderID: "n1", term: 1, + servers: []raftengine.Server{ + {ID: "n1", Address: "10.0.0.11:50051"}, + {ID: "n2", Address: "10.0.0.22:50051"}, + }, + }) + + resp, err := srv.GetClusterOverview(context.Background(), &pb.GetClusterOverviewRequest{}) + if err != nil { + t.Fatal(err) + } + if len(resp.Members) != 1 { + t.Fatalf("members = %d, want 1", len(resp.Members)) + } + got := resp.Members[0] + if got.NodeId != "n2" || got.GrpcAddress != "10.0.0.22:50051" { + t.Fatalf("members[0] = %+v, want n2 @ 10.0.0.22:50051 (live wins over seed)", got) + } +} + // TestGetClusterOverviewSurvivesConfigurationError asserts that a group that // errors on Configuration() does NOT fail the RPC — seed members are still // returned. From 5efc65555783124b3337433f29a1f0b6fe6a0964 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 06:21:43 +0900 Subject: [PATCH 26/30] fix(admin): deterministic self-addr + duplicate-ID tie-break MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P2 ×2 on 37a04352: 1. Self-address: setupAdminService was advertising *myAddr as Self.GRPCAddress, but in multi-group mode the process actually listens on each group's rt.spec.address (*myAddr can be unrelated or default). Fan-out would then include a non-listening endpoint and keep failing against it. canonicalSelfAddress now picks the lowest-group-ID listener deterministically (with *myAddr fallback for single-node dev runs). 2. Duplicate live NodeIDs: when two groups report the same server ID with different addresses during readdress convergence, collectLiveMembers kept first-seen — but Go map iteration is non-deterministic, so fan-out could flap between addresses. Sort groups by ID (ascending) before iterating so the lowest-ID group's address is the stable winner. Runs through GetClusterOverview 5 times in the regression test to pin stability. 
adapter/admin_grpc.go: cloneGroups → cloneGroupsSorted returning []groupEntry{id, group}; collectLiveMembers now takes sorted groups. --- adapter/admin_grpc.go | 42 +++++++++++++++++++++++++------------- adapter/admin_grpc_test.go | 40 ++++++++++++++++++++++++++++++++++++ main.go | 34 +++++++++++++++++++++++++++++- main_admin_test.go | 25 +++++++++++++++++++++++ 4 files changed, 126 insertions(+), 15 deletions(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index 5d007104..9670b789 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -124,7 +124,7 @@ func (s *AdminServer) GetClusterOverview( // on a single group do not fail the RPC — other groups plus the seed list // still produce useful output. func (s *AdminServer) snapshotMembers(ctx context.Context) []*pb.NodeIdentity { - groups := s.cloneGroups() + groups := s.cloneGroupsSorted() addrByID, order := collectLiveMembers(ctx, groups, s.self.NodeID) mergeSeedMembers(s.members, s.self.NodeID, addrByID, &order) @@ -135,31 +135,45 @@ func (s *AdminServer) snapshotMembers(ctx context.Context) []*pb.NodeIdentity { return out } -// cloneGroups snapshots the registered groups under the read lock so the -// caller can iterate without holding groupsMu while invoking Configuration -// (which may block on Raft). -func (s *AdminServer) cloneGroups() []AdminGroup { +// groupEntry pairs a Raft group ID with its AdminGroup so callers can iterate +// in a deterministic (ID-ascending) order. Sorting matters for +// collectLiveMembers: when two groups report the same NodeID with different +// addresses (e.g., mid-readdress), the iteration order picks which address +// wins, and a Go map's range order is unspecified. +type groupEntry struct { + id uint64 + group AdminGroup +} + +// cloneGroupsSorted snapshots the registered groups under the read lock and +// returns them sorted by group ID so iteration and tie-break decisions are +// stable across calls. 
+func (s *AdminServer) cloneGroupsSorted() []groupEntry { s.groupsMu.RLock() defer s.groupsMu.RUnlock() - out := make([]AdminGroup, 0, len(s.groups)) - for _, g := range s.groups { - out = append(out, g) + out := make([]groupEntry, 0, len(s.groups)) + for id, g := range s.groups { + out = append(out, groupEntry{id: id, group: g}) } + sort.Slice(out, func(i, j int) bool { return out[i].id < out[j].id }) return out } -// collectLiveMembers polls Configuration for each group and returns the union -// of server IDs (excluding self) with their live addresses. The order slice -// preserves first-seen iteration order for stable output. +// collectLiveMembers polls Configuration for each group (in ascending group +// ID order supplied by the caller) and returns the union of server IDs +// (excluding self) with their live addresses. When two groups report the +// same server ID with different addresses — e.g. mid-readdress before every +// group has converged — the lowest-ID group wins, which is stable across +// calls and matches "trust the primary group" intuition. func collectLiveMembers( ctx context.Context, - groups []AdminGroup, + groups []groupEntry, selfID string, ) (addrByID map[string]string, order []string) { addrByID = make(map[string]string) order = make([]string, 0) - for _, g := range groups { - cfg, err := g.Configuration(ctx) + for _, entry := range groups { + cfg, err := entry.group.Configuration(ctx) if err != nil { continue } diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index f8158232..1fd89289 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -127,6 +127,46 @@ func TestGetClusterOverviewUnionsSeedsAndLiveConfig(t *testing.T) { } } +// TestGetClusterOverviewDuplicateMemberIDsDeterministic pins the tie-break +// when two Raft groups disagree on a server's address (e.g. 
mid-readdress, +// before every group has converged): the group with the smallest ID wins and +// the result is stable across calls, so fan-out doesn't flap between stale +// and current addresses. +func TestGetClusterOverviewDuplicateMemberIDsDeterministic(t *testing.T) { + t.Parallel() + srv := NewAdminServer(NodeIdentity{NodeID: "n1"}, nil) + // Group 1 (lower ID): n2 already moved to new address. + srv.RegisterGroup(1, fakeGroup{ + leaderID: "n1", term: 1, + servers: []raftengine.Server{ + {ID: "n1", Address: "10.0.0.11:50051"}, + {ID: "n2", Address: "10.0.0.22:50051"}, + }, + }) + // Group 7 (higher ID): still reports n2 at the stale address. + srv.RegisterGroup(7, fakeGroup{ + leaderID: "n1", term: 1, + servers: []raftengine.Server{ + {ID: "n1", Address: "10.0.0.11:50051"}, + {ID: "n2", Address: "10.0.0.12:50051"}, + }, + }) + + // Run overview 5 times. All must return the low-ID group's n2 address. + for i := 0; i < 5; i++ { + resp, err := srv.GetClusterOverview(context.Background(), &pb.GetClusterOverviewRequest{}) + if err != nil { + t.Fatal(err) + } + if len(resp.Members) != 1 { + t.Fatalf("iter %d: members=%d want 1", i, len(resp.Members)) + } + if resp.Members[0].GrpcAddress != "10.0.0.22:50051" { + t.Fatalf("iter %d: got %s, want low-ID group's n2 @ 10.0.0.22:50051", i, resp.Members[0].GrpcAddress) + } + } +} + // TestGetClusterOverviewLiveConfigWinsOverStaleSeed asserts that when a node // is readdressed (same NodeID, new GRPCAddress), the live Raft Configuration // wins over the stale bootstrap seed so fan-out dials the current endpoint. diff --git a/main.go b/main.go index 5dd2ba2c..48d0e3c8 100644 --- a/main.go +++ b/main.go @@ -501,10 +501,16 @@ func setupAdminService( bootstrapServers []raftengine.Server, ) (*adapter.AdminServer, adminGRPCInterceptors, error) { members := adminMembersFromBootstrap(nodeID, bootstrapServers) + // In multi-group mode the process does not listen on *myAddr — each group + // has its own rt.spec.address. 
Use the lowest-group-ID listener as the + // canonical self address so GetClusterOverview.Self advertises an + // endpoint the fan-out can actually dial. Falls back to the flag value + // when no runtimes are registered (single-node dev runs). + selfAddr := canonicalSelfAddress(grpcAddress, runtimes) srv, icept, err := configureAdminService( *adminTokenFile, *adminInsecureNoAuth, - adapter.NodeIdentity{NodeID: nodeID, GRPCAddress: grpcAddress}, + adapter.NodeIdentity{NodeID: nodeID, GRPCAddress: selfAddr}, members, ) if err != nil { @@ -522,6 +528,32 @@ func setupAdminService( return srv, icept, nil } +// canonicalSelfAddress picks the listener address AdminServer should advertise +// as Self.GRPCAddress. The Admin gRPC service is registered on every Raft +// group's listener in startRaftServers, so any runtime's address is reachable; +// we pick the lowest group ID to make the choice deterministic across +// restarts. Returns the supplied fallback when no runtimes exist (e.g., a +// single-node dev invocation without --raftGroups). +func canonicalSelfAddress(fallback string, runtimes []*raftGroupRuntime) string { + var ( + bestID uint64 + bestAddr string + found bool + ) + for _, rt := range runtimes { + if rt == nil { + continue + } + if !found || rt.spec.id < bestID { + bestID, bestAddr, found = rt.spec.id, rt.spec.address, true + } + } + if !found { + return fallback + } + return bestAddr +} + // adminMembersFromBootstrap extracts the peer list (everyone except self) from // the Raft bootstrap configuration so GetClusterOverview returns a populated // members list. 
Without this the admin binary's membersFrom cache collapses to diff --git a/main_admin_test.go b/main_admin_test.go index d4f9c8bc..6cd8f60f 100644 --- a/main_admin_test.go +++ b/main_admin_test.go @@ -97,6 +97,31 @@ func TestAdminMembersFromBootstrapEmpty(t *testing.T) { } } +// TestCanonicalSelfAddressPicksLowestGroup pins the deterministic choice of +// Self.GRPCAddress when --raftGroups is set — the fan-out path has to dial an +// endpoint that this process actually listens on, so --address (which may be +// unrelated) must not win over the real group listeners. +func TestCanonicalSelfAddressPicksLowestGroup(t *testing.T) { + t.Parallel() + runtimes := []*raftGroupRuntime{ + {spec: groupSpec{id: 5, address: "10.0.0.1:50055"}}, + {spec: groupSpec{id: 2, address: "10.0.0.1:50052"}}, + {spec: groupSpec{id: 9, address: "10.0.0.1:50059"}}, + } + got := canonicalSelfAddress("localhost:50051", runtimes) + if got != "10.0.0.1:50052" { + t.Fatalf("got %q, want lowest-group address 10.0.0.1:50052", got) + } +} + +func TestCanonicalSelfAddressFallsBackWithoutRuntimes(t *testing.T) { + t.Parallel() + got := canonicalSelfAddress("localhost:50051", nil) + if got != "localhost:50051" { + t.Fatalf("got %q, want fallback localhost:50051", got) + } +} + func TestLoadAdminTokenFileRejectsOversize(t *testing.T) { t.Parallel() dir := t.TempDir() From fb813af0c9b7e7c67be498e8b25b6eebd72b976f Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 17:04:06 +0900 Subject: [PATCH 27/30] fix(admin): skip blank live addresses so seeds can backfill Codex P2 on 5efc6555: collectLiveMembers stored empty `srv.Address` values returned by the etcd engine (peer metadata missing during membership convergence), and the entry then shadowed a usable seed for the same NodeID. GetClusterOverview returned an empty `grpc_address` and the admin binary's membersFrom skipped the peer entirely until live Configuration caught up. 
Skip live entries with empty Address; let mergeSeedMembers backfill from the seed list. Regression test pins the behavior with a group that reports n2 without an address but the bootstrap seed knows it. --- adapter/admin_grpc.go | 8 +++++++- adapter/admin_grpc_test.go | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index 9670b789..b102d19b 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -165,6 +165,12 @@ func (s *AdminServer) cloneGroupsSorted() []groupEntry { // same server ID with different addresses — e.g. mid-readdress before every // group has converged — the lowest-ID group wins, which is stable across // calls and matches "trust the primary group" intuition. +// +// Entries with an empty `srv.Address` (the etcd engine can emit those when +// peer metadata is still missing) are skipped: storing a blank address would +// shadow a usable seed entry for the same NodeID and cause GetClusterOverview +// to drop the peer from fan-out altogether. Letting the seed list backfill +// keeps the peer reachable until the live Configuration converges. 
func collectLiveMembers( ctx context.Context, groups []groupEntry, @@ -178,7 +184,7 @@ func collectLiveMembers( continue } for _, srv := range cfg.Servers { - if srv.ID == "" || srv.ID == selfID { + if srv.ID == "" || srv.ID == selfID || srv.Address == "" { continue } if _, dup := addrByID[srv.ID]; dup { diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index 1fd89289..5d44511e 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -167,6 +167,40 @@ func TestGetClusterOverviewDuplicateMemberIDsDeterministic(t *testing.T) { } } +// TestGetClusterOverviewSeedBackfillsBlankLiveAddress asserts that when a +// Raft group reports a server with NodeID set but Address="" (the etcd +// engine emits these mid-membership-update), the seed list still gets to +// backfill that ID instead of being shadowed by a blank live entry. Without +// this, GetClusterOverview would drop the peer from fan-out entirely until +// the live Configuration converged. +func TestGetClusterOverviewSeedBackfillsBlankLiveAddress(t *testing.T) { + t.Parallel() + srv := NewAdminServer( + NodeIdentity{NodeID: "n1"}, + []NodeIdentity{{NodeID: "n2", GRPCAddress: "10.0.0.12:50051"}}, + ) + // Live config knows n2 exists but has no address yet. 
+ srv.RegisterGroup(1, fakeGroup{ + leaderID: "n1", term: 1, + servers: []raftengine.Server{ + {ID: "n1", Address: "10.0.0.11:50051"}, + {ID: "n2", Address: ""}, + }, + }) + + resp, err := srv.GetClusterOverview(context.Background(), &pb.GetClusterOverviewRequest{}) + if err != nil { + t.Fatal(err) + } + if len(resp.Members) != 1 { + t.Fatalf("members = %d, want 1", len(resp.Members)) + } + got := resp.Members[0] + if got.NodeId != "n2" || got.GrpcAddress != "10.0.0.12:50051" { + t.Fatalf("members[0] = %+v, want seed n2 @ 10.0.0.12:50051 (blank live skipped)", got) + } +} + // TestGetClusterOverviewLiveConfigWinsOverStaleSeed asserts that when a node // is readdressed (same NodeID, new GRPCAddress), the live Raft Configuration // wins over the stale bootstrap seed so fan-out dials the current endpoint. From dc03dda12a0893050e194718bc20f1a5b3f69011 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 17:16:17 +0900 Subject: [PATCH 28/30] fix(etcd): hoist ".wal" literal to walFileExt constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-existing goconst flagged by the lint hook on the merge commit: ".wal" appeared 3× across wal_purge.go and tests. Mirrors the earlier fsm_snapshot_file.go snapFileExt cleanup from round 8. 
--- internal/raftengine/etcd/wal_purge.go | 2 +- internal/raftengine/etcd/wal_store.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/raftengine/etcd/wal_purge.go b/internal/raftengine/etcd/wal_purge.go index b2c16304..346be746 100644 --- a/internal/raftengine/etcd/wal_purge.go +++ b/internal/raftengine/etcd/wal_purge.go @@ -137,7 +137,7 @@ func collectWALNames(entries []os.DirEntry) []string { if e.IsDir() { continue } - if filepath.Ext(e.Name()) != ".wal" { + if filepath.Ext(e.Name()) != walFileExt { continue } names = append(names, e.Name()) diff --git a/internal/raftengine/etcd/wal_store.go b/internal/raftengine/etcd/wal_store.go index 91f8edd0..35e6c3f8 100644 --- a/internal/raftengine/etcd/wal_store.go +++ b/internal/raftengine/etcd/wal_store.go @@ -19,6 +19,7 @@ import ( const ( walDirName = "wal" snapDirName = "snap" + walFileExt = ".wal" ) type diskState struct { From 3b7c9350abc94aab8e985d38757b97c287f1e7b1 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 17:25:44 +0900 Subject: [PATCH 29/30] fix(admin): cap and pool the writeJSON encode buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini medium on dc03dda1: writeJSON buffered the entire response body in memory before writing, and `handleOverview` aggregates state from up to maxDiscoveredNodes=512 nodes — so a misbehaving downstream returning an oversized payload could push admin process memory unbounded. - Pool the encode buffer via sync.Pool so steady-state requests reuse allocations instead of churning the heap. Buffers larger than 1 MiB are dropped on return so retained pool objects stay bounded. - Wrap the encoder target in a cappedWriter that refuses writes past maxResponseBodyBytes=16 MiB (well above the bounded fan-out × proto-size worst case). On overflow writeJSON emits the same 500 fallback that encode-failure already uses. 
Regression test feeds writeJSON ~19 MiB of ASCII strings and asserts the response is 500 (cap enforced), not a 200 with truncated body. --- cmd/elastickv-admin/main.go | 76 ++++++++++++++++++++++++++++---- cmd/elastickv-admin/main_test.go | 24 ++++++++++ 2 files changed, 92 insertions(+), 8 deletions(-) diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index 2310bed9..d1a9f205 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -686,15 +686,48 @@ func (f *fanout) handleOverview(w http.ResponseWriter, r *http.Request) { writeJSON(w, http.StatusOK, map[string]any{"nodes": results}) } -// writeJSON marshals body into a buffer first, so an encoding failure can -// still surface as a 500 instead of a truncated body under a committed 2xx -// header. The admin API response bodies are small (bounded by rows/routes -// caps in later phases), so buffering is safe. +// maxResponseBodyBytes caps writeJSON's encode buffer. With fan-out across at +// most maxDiscoveredNodes=512 nodes returning a small GetClusterOverview proto +// each, real responses sit in the low MiBs; this cap (16 MiB) leaves plenty +// of headroom while preventing a misbehaving node from forcing unbounded +// memory growth in the admin process. +const maxResponseBodyBytes = 16 << 20 + +// writeJSONBufferPool reuses encode buffers across requests so a steady stream +// of /api/* calls doesn't churn the heap with per-request allocations. The +// pool stores *bytes.Buffer; each user resets and bounds the buffer. +var writeJSONBufferPool = sync.Pool{ + New: func() any { return new(bytes.Buffer) }, +} + +// writeJSON marshals body into a pooled, size-capped buffer first, so an +// encoding failure can still surface as a 500 instead of a truncated body +// under a committed 2xx header. The cap (maxResponseBodyBytes) bounds memory +// even if a misbehaving downstream returns an oversized payload. 
func writeJSON(w http.ResponseWriter, code int, body any) { - var buf bytes.Buffer - enc := json.NewEncoder(&buf) - if err := enc.Encode(body); err != nil { - log.Printf("elastickv-admin: encode JSON response: %v", err) + buf, ok := writeJSONBufferPool.Get().(*bytes.Buffer) + if !ok { + buf = new(bytes.Buffer) + } + defer func() { + // Drop very large buffers rather than retaining them in the pool — + // keeps steady-state memory close to the typical response size. + const maxRetainBytes = 1 << 20 + if buf.Cap() > maxRetainBytes { + return + } + buf.Reset() + writeJSONBufferPool.Put(buf) + }() + buf.Reset() + + limited := &cappedWriter{w: buf, max: maxResponseBodyBytes} + if err := json.NewEncoder(limited).Encode(body); err != nil || limited.exceeded { + if limited.exceeded { + log.Printf("elastickv-admin: response exceeded %d-byte cap; returning 500", maxResponseBodyBytes) + } else { + log.Printf("elastickv-admin: encode JSON response: %v", err) + } w.Header().Set("Content-Type", "application/json; charset=utf-8") w.WriteHeader(http.StatusInternalServerError) const fallback = `{"code":500,"message":"internal server error"}` + "\n" @@ -710,6 +743,33 @@ func writeJSON(w http.ResponseWriter, code int, body any) { } } +// cappedWriter wraps an io.Writer and refuses writes once `written` would +// exceed `max`. Used by writeJSON so json.Encoder stops streaming bytes into +// the buffer past the cap; the encoder reports the short-write and writeJSON +// returns a 500 instead of an oversized body. 
+type cappedWriter struct { + w *bytes.Buffer + max int + written int + exceeded bool +} + +func (c *cappedWriter) Write(p []byte) (int, error) { + if c.exceeded { + return 0, errors.New("response body cap exceeded") + } + if c.written+len(p) > c.max { + c.exceeded = true + return 0, fmt.Errorf("response body would exceed %d bytes", c.max) + } + n, err := c.w.Write(p) + c.written += n + if err != nil { + return n, errors.Wrap(err, "buffer write") + } + return n, nil +} + func writeJSONError(w http.ResponseWriter, code int, msg string) { writeJSON(w, code, map[string]any{"code": code, "message": msg}) } diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index 506b3439..ad43b953 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -478,6 +478,30 @@ func TestHandleOverviewRejectsNonGET(t *testing.T) { } } +// TestWriteJSONCapsResponseBody asserts that an oversized body is rejected +// with 500 instead of streaming MiBs of bytes into the response. Caps memory +// usage in the admin process when fan-out hits a misbehaving downstream that +// returns an enormous payload. +func TestWriteJSONCapsResponseBody(t *testing.T) { + t.Parallel() + rec := httptest.NewRecorder() + // ~19 MiB ASCII payload (each entry is a 16-byte string + 3 bytes JSON + // punctuation × 1<<20 entries ≈ 19 MiB encoded), well past + // maxResponseBodyBytes=16 MiB. 
+ const elems = 1 << 20 + huge := make([]string, elems) + for i := range huge { + huge[i] = "0123456789abcdef" + } + writeJSON(rec, http.StatusOK, huge) + if rec.Code != http.StatusInternalServerError { + t.Fatalf("code = %d, want %d (cap exceeded)", rec.Code, http.StatusInternalServerError) + } + if !strings.Contains(rec.Body.String(), "internal server error") { + t.Fatalf("body = %q", rec.Body.String()) + } +} + func TestWriteJSONSurfacesEncodeFailure(t *testing.T) { t.Parallel() rec := httptest.NewRecorder() From e1642f293b6f44d2ce696c460e453215e899c389 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 17:51:07 +0900 Subject: [PATCH 30/30] fix(admin): unbound mutex on conn close + dial, ctx-aware live members, msg-size cap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Gemini medium x3 + Codex P2 on PR #545. Each fix ships with a matching test (run with -race). cmd/elastickv-admin/main.go (Gemini medium x2 + Codex P2): - fanout.Close: collect cached *grpc.ClientConn into a local slice under f.mu (marking each entry closed so the deferred-release path does not double-close), drop the lock, then call conn.Close on each. gRPC Close performs network I/O and waits for the client transport to drain; holding the global fanout mutex for that wall-clock span blocked any concurrent clientFor / invalidateClient / RPC waiting on f.mu for the entire shutdown window. - fanout.clientFor: dial step (grpc.NewClient) now runs outside f.mu. After the dial we re-take the lock and re-check whether another goroutine already inserted a client for the same addr; the loser of that race closes its just-dialed conn so we never leak a duplicate. The dial-time DNS / target-parse work no longer serializes concurrent clientFor calls for distinct addrs. - fanout.clientFor: pass internal.GRPCCallOptions() so the admin fanout's per-call recv/send caps match the node servers' 64 MiB configuration. 
Without this, gRPC-Go's default ~4 MiB recv cap would silently fail RPCs once aggregated cluster-overview / matrix admin payloads exceed 4 MiB. internal/grpc.go: - New GRPCCallOptions() helper for callers that supply their own transport credentials (admin TLS path); reuses the existing GRPCMaxMessageBytes constant so the dial cap matches the server cap. adapter/admin_grpc.go (Gemini medium): - collectLiveMembers no longer blocks on wg.Wait(). The fan-out Configuration() goroutines drop their results onto a buffered channel; the merge loop reads from the channel with a select on ctx.Done() so a single hung Configuration call cannot wedge the whole GetClusterOverview RPC. Stuck goroutines unwind asynchronously when their per-RPC ctx fires. - The fan-out helper is split out as fanoutConfigurationCalls so collectLiveMembers stays under the cyclop budget. - Results are sorted by group ID before the merge so the lowest-ID-wins tie-break for duplicate NodeIDs stays deterministic even when results land out of completion order. Tests: - TestFanoutClientForRaceDeduplicates (race-mode): 32 goroutines race for the same addr; all must converge on a single cached *nodeClient and the cache must end at size 1. Pre-fix the race was impossible by construction; post-fix it is bounded. - TestFanoutCloseDoesNotHoldLockDuringConnClose: starts Close on a goroutine and asserts a concurrent f.mu.TryLock() succeeds inside 2 seconds — proving the conn.Close calls run outside the lock. - TestCollectLiveMembersHonoursCtxCancel: hangingGroup never returns from Configuration until ctx.Done; collectLiveMembers must return within 1 s of an already-cancelled ctx. Codex P1/P2 from earlier rounds (live-vs-seed reconciliation, empty-address dedup) already addressed; no change required. Build / vet / lint / -race tests clean. 
--- adapter/admin_grpc.go | 58 +++++++++++++------ adapter/admin_grpc_test.go | 49 ++++++++++++++++ cmd/elastickv-admin/main.go | 76 ++++++++++++++++++++----- cmd/elastickv-admin/main_test.go | 97 ++++++++++++++++++++++++++++++++ internal/grpc.go | 13 +++++ 5 files changed, 264 insertions(+), 29 deletions(-) diff --git a/adapter/admin_grpc.go b/adapter/admin_grpc.go index c1234414..b3a1d0dc 100644 --- a/adapter/admin_grpc.go +++ b/adapter/admin_grpc.go @@ -177,30 +177,56 @@ func (s *AdminServer) cloneGroupsSorted() []groupEntry { // into a pre-allocated slice indexed by the sorted-order position so the // merge step still walks groups in ascending-ID order and preserves the // deterministic tie-break. -func collectLiveMembers( - ctx context.Context, - groups []groupEntry, - selfID string, -) (addrByID map[string]string, order []string) { - type configResult struct { - cfg raftengine.Configuration - err error - } - results := make([]configResult, len(groups)) - var wg sync.WaitGroup +// configResult bundles a Configuration RPC outcome with its position in the +// caller-supplied groups slice so the merge step can re-sort by group-ID +// even when results land out of completion order. +type configResult struct { + i int + cfg raftengine.Configuration + err error +} + +// fanoutConfigurationCalls launches a Configuration(ctx) goroutine per +// group and collects results. Returns whatever has landed by the time ctx +// fires; remaining goroutines drain into the (buffered) channel and exit +// asynchronously when their per-RPC ctx unwinds. The early-return is the +// reason this lives in its own function: reading a shared []configResult +// slice across the cancel boundary would race the still-running goroutines. 
+func fanoutConfigurationCalls(ctx context.Context, groups []groupEntry) []configResult { + resultsCh := make(chan configResult, len(groups)) for i, entry := range groups { - wg.Add(1) go func(i int, entry groupEntry) { - defer wg.Done() cfg, err := entry.group.Configuration(ctx) - results[i] = configResult{cfg: cfg, err: err} + resultsCh <- configResult{i: i, cfg: cfg, err: err} }(i, entry) } - wg.Wait() + got := make([]configResult, 0, len(groups)) + for range groups { + select { + case res := <-resultsCh: + got = append(got, res) + case <-ctx.Done(): + return got + } + } + return got +} + +func collectLiveMembers( + ctx context.Context, + groups []groupEntry, + selfID string, +) (addrByID map[string]string, order []string) { + got := fanoutConfigurationCalls(ctx, groups) + + // Merge in the original group-ID order so the lowest-ID-wins tie-break + // stays deterministic. (Completion order would otherwise depend on + // which Configuration() returned first.) + sort.Slice(got, func(a, b int) bool { return got[a].i < got[b].i }) addrByID = make(map[string]string) order = make([]string, 0) - for _, res := range results { + for _, res := range got { if res.err != nil { continue } diff --git a/adapter/admin_grpc_test.go b/adapter/admin_grpc_test.go index 5d44511e..f95f4d79 100644 --- a/adapter/admin_grpc_test.go +++ b/adapter/admin_grpc_test.go @@ -414,3 +414,52 @@ func TestAdminTokenAuthEmptyTokenDisabled(t *testing.T) { t.Fatal("empty token should disable interceptors") } } + +// hangingGroup never returns from Configuration until ctx fires. Used to +// prove collectLiveMembers stops blocking the merge phase as soon as the +// caller cancels, even if one Configuration call is stuck. 
+type hangingGroup struct{ fakeGroup } + +func (h hangingGroup) Configuration(ctx context.Context) (raftengine.Configuration, error) { + <-ctx.Done() + return raftengine.Configuration{}, ctx.Err() +} + +// TestCollectLiveMembersHonoursCtxCancel asserts that collectLiveMembers +// returns promptly when ctx is cancelled, even if one Configuration call +// is stuck. Pre-fix, the wg.Wait() inside collectLiveMembers would block +// the merge phase (and the entire GetClusterOverview RPC) on the slowest +// group regardless of ctx state. Post-fix, the merge runs over whatever +// landed before the cancel; the stuck Configuration goroutine unwinds +// asynchronously when its ctx.Done fires. +func TestCollectLiveMembersHonoursCtxCancel(t *testing.T) { + t.Parallel() + + groups := []groupEntry{ + {id: 1, group: hangingGroup{}}, + {id: 2, group: hangingGroup{}}, + } + + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel immediately + + type result struct { + addrByID map[string]string + order []string + } + resCh := make(chan result, 1) + go func() { + addrByID, order := collectLiveMembers(ctx, groups, "self") + resCh <- result{addrByID: addrByID, order: order} + }() + + select { + case r := <-resCh: + // With ctx already cancelled, no live config landed; expect empty maps. 
+ if len(r.addrByID) != 0 || len(r.order) != 0 { + t.Fatalf("expected empty results on early cancel, got addrByID=%v order=%v", r.addrByID, r.order) + } + case <-time.After(1 * time.Second): + t.Fatal("collectLiveMembers blocked past 1s despite cancelled ctx — wg.Wait() regression?") + } +} diff --git a/cmd/elastickv-admin/main.go b/cmd/elastickv-admin/main.go index d1a9f205..06bd7d88 100644 --- a/cmd/elastickv-admin/main.go +++ b/cmd/elastickv-admin/main.go @@ -347,29 +347,45 @@ func newFanout( func (f *fanout) Close() { f.mu.Lock() - defer f.mu.Unlock() if f.closed { + f.mu.Unlock() return } f.closed = true // Shutdown is an intentional cancellation of any in-flight RPCs; close // connections eagerly and let borrowers see the cancel. Borrowers that // still hold leases will observe the conn as closed on their next call. - // Mark each client closed so the deferred release path does not attempt - // a double-close. + // Mark each client closed inside the lock so the deferred release path + // does not attempt a double-close, then collect the *grpc.ClientConn + // references and run conn.Close() outside the lock — Close() can do + // network I/O and waits for the gRPC client transport to drain, which + // would block any concurrent clientFor / invalidateClient / RPC waiting + // on f.mu for the entire shutdown window. + conns := make([]struct { + addr string + conn *grpc.ClientConn + }, 0, len(f.clients)) for _, c := range f.clients { if c.closed { continue } c.closed = true - if err := c.conn.Close(); err != nil { - log.Printf("elastickv-admin: close gRPC connection to %s: %v", c.addr, err) - } + conns = append(conns, struct { + addr string + conn *grpc.ClientConn + }{addr: c.addr, conn: c.conn}) } // Replace with an empty map rather than nil so the remaining // closed-guarded accessors can still iterate or lookup without panicking // while still releasing the client references for GC. 
f.clients = map[string]*nodeClient{} + f.mu.Unlock() + + for _, e := range conns { + if err := e.conn.Close(); err != nil { + log.Printf("elastickv-admin: close gRPC connection to %s: %v", e.addr, err) + } + } } // clientFor returns a leased nodeClient that callers must release once they @@ -380,26 +396,60 @@ func (f *fanout) Close() { // but their underlying *grpc.ClientConn is kept alive until every outstanding // borrower has released; this prevents an eviction from canceling a healthy // concurrent GetClusterOverview. +// +// The dial step (grpc.NewClient) runs outside f.mu. Although NewClient is +// non-blocking, it parses the target and may run synchronous DNS resolution +// depending on resolver configuration; holding the global fanout mutex for +// that wall-clock time would serialize concurrent clientFor calls for +// distinct addrs. After the dial we re-take the lock and re-check whether +// another goroutine raced us to insert a client for the same addr — the +// loser closes its just-dialed conn so we never leak a duplicate. func (f *fanout) clientFor(addr string) (*nodeClient, func(), error) { f.mu.Lock() - defer f.mu.Unlock() if f.closed { + f.mu.Unlock() return nil, func() {}, errFanoutClosed } if c, ok := f.clients[addr]; ok { c.refcount++ - return c, f.releaseFunc(c), nil - } - if len(f.clients) >= maxCachedClients { - f.evictOneLocked() + release := f.releaseFunc(c) + f.mu.Unlock() + return c, release, nil } - conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(f.creds)) + f.mu.Unlock() + + conn, err := grpc.NewClient( + addr, + grpc.WithTransportCredentials(f.creds), + internalutil.GRPCCallOptions(), + ) if err != nil { return nil, func() {}, errors.Wrapf(err, "dial %s", addr) } + + f.mu.Lock() + if f.closed { + f.mu.Unlock() + _ = conn.Close() + return nil, func() {}, errFanoutClosed + } + // Race: another goroutine inserted while we were dialing. Close the loser + // conn outside the lock and return the cached entry instead. 
+ if c, ok := f.clients[addr]; ok { + c.refcount++ + release := f.releaseFunc(c) + f.mu.Unlock() + _ = conn.Close() + return c, release, nil + } + if len(f.clients) >= maxCachedClients { + f.evictOneLocked() + } c := &nodeClient{addr: addr, conn: conn, client: pb.NewAdminClient(conn), refcount: 1} f.clients[addr] = c - return c, f.releaseFunc(c), nil + release := f.releaseFunc(c) + f.mu.Unlock() + return c, release, nil } // releaseFunc returns the closer used to drop a lease. On the last release diff --git a/cmd/elastickv-admin/main_test.go b/cmd/elastickv-admin/main_test.go index ad43b953..5d787aa1 100644 --- a/cmd/elastickv-admin/main_test.go +++ b/cmd/elastickv-admin/main_test.go @@ -17,6 +17,7 @@ import ( "os" "path/filepath" "strings" + "sync" "sync/atomic" "testing" "time" @@ -636,3 +637,99 @@ func TestHandleOverviewUsesProtojson(t *testing.T) { t.Fatalf("response missing protojson camelCase field; body=%q", body) } } + +// TestFanoutClientForRaceDeduplicates exercises the dial-outside-the-lock +// path in clientFor: many goroutines racing for the same addr must all +// converge on a single cached *grpc.ClientConn (the loser of each race +// closes its just-dialed conn). Pre-fix, the dial happened under the +// lock so the race was impossible by construction; post-fix, the race +// is intentional but bounded. 
+func TestFanoutClientForRaceDeduplicates(t *testing.T) { + t.Parallel() + peer := &fakeAdminServer{members: []string{"m:1"}} + addr := startFakeAdmin(t, peer) + f := newFanout([]string{addr}, "", time.Second, insecure.NewCredentials()) + defer f.Close() + + const racers = 32 + var wg sync.WaitGroup + wg.Add(racers) + clients := make([]*nodeClient, racers) + releases := make([]func(), racers) + for i := 0; i < racers; i++ { + go func(i int) { + defer wg.Done() + c, release, err := f.clientFor(addr) + if err != nil { + t.Errorf("racer %d clientFor: %v", i, err) + return + } + clients[i] = c + releases[i] = release + }(i) + } + wg.Wait() + + for _, release := range releases { + if release != nil { + release() + } + } + + first := clients[0] + for i, c := range clients { + if c != first { + t.Fatalf("racer %d got distinct nodeClient %p, want %p — clientFor de-duplication broke", i, c, first) + } + } + f.mu.Lock() + size := len(f.clients) + f.mu.Unlock() + if size != 1 { + t.Fatalf("cache size after race = %d, want 1 (race created %d duplicates)", size, size-1) + } +} + +// TestFanoutCloseDoesNotHoldLockDuringConnClose pins the round-5 fix: +// fanout.Close must release f.mu before invoking conn.Close on each +// cached connection. The test populates the cache, takes the lock from +// another goroutine *after* the Close goroutine has started, and +// asserts the lock is acquirable before Close returns — proving Close +// runs the conn.Close calls outside the lock. Pre-fix, the inverted +// timing would have wedged the test goroutine. 
+func TestFanoutCloseDoesNotHoldLockDuringConnClose(t *testing.T) { + t.Parallel() + peer := &fakeAdminServer{members: []string{"m:1"}} + addr := startFakeAdmin(t, peer) + f := newFanout([]string{addr}, "", time.Second, insecure.NewCredentials()) + + if _, release, err := f.clientFor(addr); err != nil { + t.Fatal(err) + } else { + release() + } + + closeDone := make(chan struct{}) + go func() { + defer close(closeDone) + f.Close() + }() + + // Race the Close goroutine: by the time we get the lock, Close must + // already have transferred the cached conns into a local slice and + // dropped the lock. A 2-second budget accounts for slow CI runners. + deadline := time.After(2 * time.Second) + for { + select { + case <-deadline: + t.Fatal("could not acquire f.mu while Close was running — Close is holding the lock during conn.Close") + default: + } + if f.mu.TryLock() { + f.mu.Unlock() + break + } + time.Sleep(time.Millisecond) + } + <-closeDone +} diff --git a/internal/grpc.go b/internal/grpc.go index 59092615..c3658622 100644 --- a/internal/grpc.go +++ b/internal/grpc.go @@ -27,3 +27,16 @@ func GRPCDialOptions() []grpc.DialOption { ), } } + +// GRPCCallOptions returns the per-call message-size cap dial option used by +// callers that supply their own transport credentials (e.g. the admin +// binary's TLS-aware fanout). Without this, gRPC-Go's default ~4 MiB recv +// cap would silently fail RPCs once aggregated cluster-overview / matrix +// admin payloads exceed 4 MiB even though node servers (GRPCServerOptions) +// are configured for 64 MiB. +func GRPCCallOptions() grpc.DialOption { + return grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(GRPCMaxMessageBytes), + grpc.MaxCallSendMsgSize(GRPCMaxMessageBytes), + ) +}