From 050b4d93a2ee0529803f2351e23e99e706736f65 Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Fri, 24 Apr 2026 02:29:12 +0900 Subject: [PATCH 1/3] obs(grafana): pair stat + timeseries per Pebble gauge with write-stall thresholds Restructures the Pebble Internals dashboard so every gauge-style signal is represented by two panels: a stat (current per-node value with colour thresholds) and a timeseries (per-node trend, with red threshold lines at the Pebble write-stall points where applicable). Signals paired: - L0 sublevels (stat thresholds 10/15, line at 20) - L0 num files (stat thresholds 500/1000, line at 1000) - Compaction estimated debt (stat thresholds 100/500 MiB) - Compactions in progress (stat thresholds 1/3) - Memtable count (stat thresholds 3/5, line at 5) - Memtable size bytes - Memtable zombie count Rate-of-counter metrics (compaction rate, cache hits/misses rate, store write conflicts by prefix) and the FSM apply sync mode indicator stay single-panel: an instantaneous rate has no meaningful stat companion, and the boolean sync-mode indicator has no meaningful trend companion. Panels are grouped into rows: Block Cache, LSM Level State (collapsed), Compaction, FSM Apply, Store Write Conflicts. Queries use `max by (node_id)` to coalesce the group / node_address dimensions the Pebble collector emits. Paired signals are laid out as two 12-wide panels per visual row at height 8; the single-panel signals span the full 24-unit width.
--- .../elastickv-pebble-internals.json | 1141 ++++++++++++++--- 1 file changed, 938 insertions(+), 203 deletions(-) diff --git a/monitoring/grafana/dashboards/elastickv-pebble-internals.json b/monitoring/grafana/dashboards/elastickv-pebble-internals.json index 31d642bf..8e544d50 100644 --- a/monitoring/grafana/dashboards/elastickv-pebble-internals.json +++ b/monitoring/grafana/dashboards/elastickv-pebble-internals.json @@ -15,13 +15,26 @@ } ] }, - "description": "Pebble storage engine internals for elastickv: block cache, L0 pressure, compactions, memtables, FSM apply sync mode, and store write conflicts.", + "description": "Pebble storage engine internals for elastickv: block cache, LSM level state (L0 + memtables), compaction backlog, FSM apply sync mode, and store write conflicts. Each gauge-style signal is presented as a stat (current value per node) + a timeseries (trend over the selected range), so operators can read both 'what is it now' and 'where is it heading'.", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, "id": null, "links": [], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Block Cache", + "type": "row" + }, { "datasource": "$datasource", "description": "Block cache hit rate = hits / (hits + misses). 
Greater than 95% is healthy; below 80% means the working set no longer fits in the configured block cache and every read falls through to the filesystem, which usually shows up as a step change in GET p99 and disk read IOPS.", @@ -33,9 +46,18 @@ "thresholds": { "mode": "absolute", "steps": [ - {"color": "red", "value": 0}, - {"color": "yellow", "value": 0.80}, - {"color": "green", "value": 0.95} + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 0.8 + }, + { + "color": "green", + "value": 0.95 + } ] }, "min": 0, @@ -44,13 +66,21 @@ }, "overrides": [] }, - "gridPos": {"h": 6, "w": 8, "x": 0, "y": 0}, - "id": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, "options": { "colorMode": "value", "graphMode": "area", + "orientation": "horizontal", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false } @@ -65,7 +95,7 @@ "instant": true } ], - "title": "Block Cache Hit Rate", + "title": "Block Cache Hit Rate (current)", "type": "stat" }, { @@ -73,24 +103,38 @@ "description": "Block cache hit rate over time. 
Watch for sudden drops after a deploy, a compaction burst, or a workload shift: these indicate the hot set no longer fits.", "fieldConfig": { "defaults": { - "color": {"mode": "palette-classic"}, + "color": { + "mode": "palette-classic" + }, "custom": { "axisPlacement": "auto", "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "auto" }, + "unit": "percentunit", "min": 0, - "max": 1, - "unit": "percentunit" + "max": 1 }, "overrides": [] }, - "gridPos": {"h": 6, "w": 16, "x": 8, "y": 0}, - "id": 2, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } }, "targets": [ { @@ -110,7 +154,9 @@ "description": "Current block cache size per node versus the configured capacity. If 'size' sits pegged at 'capacity' and hit rate is falling, the cache is full and evicting useful blocks; consider raising the capacity or shrinking the working set.", "fieldConfig": { "defaults": { - "color": {"mode": "palette-classic"}, + "color": { + "mode": "palette-classic" + }, "custom": { "axisPlacement": "auto", "lineInterpolation": "linear", @@ -121,11 +167,23 @@ }, "overrides": [] }, - "gridPos": {"h": 6, "w": 12, "x": 0, "y": 6}, - "id": 3, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 4, "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } }, "targets": [ { @@ -150,10 +208,12 @@ }, { "datasource": "$datasource", - "description": "Block cache hit and miss rates side-by-side. 
A climbing miss rate is the earliest warning that the cache is under-sized; the hit-rate panel lags it because hit rate is a ratio.", + "description": "Block cache hit and miss rates side-by-side. A climbing miss rate is the earliest warning that the cache is under-sized; the hit-rate panel lags it because hit rate is a ratio. Rate-of-counter only; no current-value stat companion (instantaneous rates are not meaningful as a snapshot).", "fieldConfig": { "defaults": { - "color": {"mode": "palette-classic"}, + "color": { + "mode": "palette-classic" + }, "custom": { "axisPlacement": "auto", "lineInterpolation": "linear", @@ -164,11 +224,23 @@ }, "overrides": [] }, - "gridPos": {"h": 6, "w": 12, "x": 12, "y": 6}, - "id": 4, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 5, "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } }, "targets": [ { @@ -192,74 +264,674 @@ "type": "timeseries" }, { - "datasource": "$datasource", - "description": "Number of L0 sublevels per node. Pebble begins throttling writes around 20 sublevels and will hard-stall the FSM if compaction cannot keep up; sustained red here usually means ingest outran compaction.", - "fieldConfig": { - "defaults": { - "color": {"mode": "thresholds"}, - "thresholds": { - "mode": "absolute", - "steps": [ - {"color": "green", "value": 0}, - {"color": "orange", "value": 10}, - {"color": "red", "value": 20} - ] + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 16, + "panels": [ + { + "datasource": "$datasource", + "description": "Current count of L0 sublevels per node. Pebble begins throttling writes around 20 sublevels and will hard-stall the FSM if compaction cannot keep up; sustained red here means ingest outran compaction. 
`max by (node_id)` coalesces the group / node_address dimensions emitted by the Pebble collector.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 15 + } + ] + }, + "unit": "short" + }, + "overrides": [] }, - "unit": "short" + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_l0_sublevels{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "refId": "A", + "instant": true + } + ], + "title": "L0 Sublevels (current)", + "type": "stat" }, - "overrides": [] - }, - "gridPos": {"h": 6, "w": 8, "x": 0, "y": 12}, - "id": 5, - "options": { - "colorMode": "value", - "graphMode": "area", - "orientation": "horizontal", - "reduceOptions": { - "calcs": ["lastNotNull"], - "fields": "", - "values": false - } - }, - "targets": [ { "datasource": "$datasource", - "editorMode": "code", - "expr": "elastickv_pebble_l0_sublevels{job=\"elastickv\",node_id=~\"$node_id\"}", - "legendFormat": "{{node_id}}", - "refId": "A", - "instant": true + "description": "L0 sublevels trend per node. 
The red threshold line at 20 marks where Pebble begins its write-stall heuristic; approaching this value is the early warning that compaction is losing the race.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto", + "thresholdsStyle": { + "mode": "line" + } + }, + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 20 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 7, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_l0_sublevels{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "L0 Sublevels Over Time", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Current number of files in L0 per node. Use alongside the sublevel panel to distinguish 'many small files' (compaction falling behind on count) from 'many overlapping files' (write-stall risk). 
Thresholds tuned to deploy-size defaults: green <500, yellow 500-1000, red >=1000.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_l0_num_files{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "refId": "A", + "instant": true + } + ], + "title": "L0 Num Files (current)", + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "L0 file count trend per node. 
The red threshold line at 1000 marks the practical soft limit for this deploy before L0->Lbase compaction falls terminally behind.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto", + "thresholdsStyle": { + "mode": "line" + } + }, + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 1000 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 9, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_l0_num_files{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "L0 Num Files Over Time", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Active memtable count per node. Normally 1-2; sustained growth means flushes are not keeping up with writes, which cascades into L0 file growth and eventually a write stall. 
Default Pebble stall kicks in at 5 memtables.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 3 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_memtable_count{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "refId": "A", + "instant": true + } + ], + "title": "Memtable Count (current)", + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "Memtable count trend per node. 
The red threshold line at 5 marks Pebble's default write-stall trigger for memtable count.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto", + "thresholdsStyle": { + "mode": "line" + } + }, + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 5 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 11, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_memtable_count{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Memtable Count Over Time", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Current bytes held in active memtables per node. Caps at ~MemTableSize * MaxMemtableCount; steady readings near that cap mean flush is the bottleneck. 
Thresholds intentionally coarse because the meaningful ceiling is deploy-specific.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 268435456 + }, + { + "color": "orange", + "value": 536870912 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_memtable_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "refId": "A", + "instant": true + } + ], + "title": "Memtable Size (current)", + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "Memtable bytes trend per node. 
Pair with Memtable Count to distinguish 'few large memtables queued for flush' from 'many tiny memtables piling up'.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 13, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_memtable_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Memtable Size Over Time", + "type": "timeseries" + }, + { + "datasource": "$datasource", + "description": "Current count of memtables that have been flushed but cannot yet be freed because an iterator or snapshot still references them. 
Non-zero for long periods indicates leaked iterators or long-lived snapshots pinning memory.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 6 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_memtable_zombie_count{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "refId": "A", + "instant": true + } + ], + "title": "Memtable Zombies (current)", + "type": "stat" + }, + { + "datasource": "$datasource", + "description": "Memtable zombie trend per node. 
A rising baseline is the signature of an iterator leak: step-ups at deploys or long-running scans are normal, but the line should return to zero.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "auto" + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "id": 15, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": "$datasource", + "editorMode": "code", + "expr": "max by (node_id) (elastickv_pebble_memtable_zombie_count{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Memtable Zombies Over Time", + "type": "timeseries" } ], - "title": "L0 Sublevels", - "type": "stat" + "title": "LSM Level State", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 17, + "panels": [], + "title": "Compaction", + "type": "row" }, { "datasource": "$datasource", - "description": "Number of files currently in L0 per node. More than ~20 files and Pebble's write-stall heuristic will start holding up new writes. Use alongside the sublevel panel to distinguish 'many small files' from 'many overlapping files'.", + "description": "Pebble's own estimate of outstanding compaction work per node, in bytes. Growing unboundedly means compaction is losing the race. Thresholds: green <100 MiB, yellow 100-500 MiB, red >=500 MiB. 
Upper bound is context-dependent so no stall-line is drawn on the trend.", "fieldConfig": { "defaults": { - "color": {"mode": "thresholds"}, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - {"color": "green", "value": 0}, - {"color": "orange", "value": 10}, - {"color": "red", "value": 20} + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 104857600 + }, + { + "color": "red", + "value": 524288000 + } ] }, - "unit": "short" + "unit": "bytes" }, "overrides": [] }, - "gridPos": {"h": 6, "w": 8, "x": 8, "y": 12}, - "id": 6, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 18, "options": { "colorMode": "value", "graphMode": "area", "orientation": "horizontal", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false } @@ -268,21 +940,23 @@ { "datasource": "$datasource", "editorMode": "code", - "expr": "elastickv_pebble_l0_num_files{job=\"elastickv\",node_id=~\"$node_id\"}", + "expr": "max by (node_id) (elastickv_pebble_compact_estimated_debt_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "refId": "A", "instant": true } ], - "title": "L0 Num Files", + "title": "Compaction Estimated Debt (current)", "type": "stat" }, { "datasource": "$datasource", - "description": "Pebble's own estimate of outstanding compaction work, in bytes. Growing unboundedly means compaction is losing the race; compare against the compaction rate below.", + "description": "Compaction debt trend per node. 
Compare against the Compaction Rate panel: healthy engines oscillate; a monotonically rising line is the ingest-vs-compaction gap.", "fieldConfig": { "defaults": { - "color": {"mode": "palette-classic"}, + "color": { + "mode": "palette-classic" + }, "custom": { "axisPlacement": "auto", "lineInterpolation": "linear", @@ -293,123 +967,81 @@ }, "overrides": [] }, - "gridPos": {"h": 6, "w": 8, "x": 16, "y": 12}, - "id": 7, - "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "elastickv_pebble_compact_estimated_debt_bytes{job=\"elastickv\",node_id=~\"$node_id\"}", - "legendFormat": "{{node_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Compaction Estimated Debt", - "type": "timeseries" - }, - { - "datasource": "$datasource", - "description": "Number of compactions currently running per node. Should move in lockstep with the configured max concurrent compactions; a flat-zero line while debt climbs indicates compactions are being blocked (e.g. 
by a flush pipeline stall).", - "fieldConfig": { - "defaults": { - "color": {"mode": "palette-classic"}, - "custom": { - "axisPlacement": "auto", - "lineInterpolation": "linear", - "lineWidth": 1, - "showPoints": "auto" - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": {"h": 6, "w": 8, "x": 0, "y": 18}, - "id": 8, + "id": 19, "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} - }, - "targets": [ - { - "datasource": "$datasource", - "editorMode": "code", - "expr": "elastickv_pebble_compact_in_progress{job=\"elastickv\",node_id=~\"$node_id\"}", - "legendFormat": "{{node_id}}", - "range": true, - "refId": "A" - } - ], - "title": "Compactions In Progress", - "type": "timeseries" - }, - { - "datasource": "$datasource", - "description": "Rate of completed compactions per node. Correlate with the debt panel: a healthy engine shows debt oscillating while this rate is non-zero.", - "fieldConfig": { - "defaults": { - "color": {"mode": "palette-classic"}, - "custom": { - "axisPlacement": "auto", - "lineInterpolation": "linear", - "lineWidth": 1, - "showPoints": "auto" - }, - "unit": "ops" + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "overrides": [] - }, - "gridPos": {"h": 6, "w": 8, "x": 8, "y": 18}, - "id": 9, - "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "tooltip": { + "mode": "multi", + "sort": "desc" + } }, "targets": [ { "datasource": "$datasource", "editorMode": "code", - "expr": "rate(elastickv_pebble_compact_count_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])", + "expr": "max by (node_id) (elastickv_pebble_compact_estimated_debt_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "range": true, "refId": "A" } ], - "title": "Compaction Rate", + "title": "Compaction Estimated Debt 
Over Time", "type": "timeseries" }, { "datasource": "$datasource", - "description": "FSM apply sync mode gauge: the label `mode` is `sync` or `nosync`, and the value is 1 for the active mode. Operators care because nosync gives the fastest Raft apply but loses unflushed writes on a crash; sync pays fsync on every batch. Green = nosync in this deploy's convention. **Requires `elastickv_pebble_fsm_apply_sync_mode` (PR #592). Empty until that PR merges — this is not a scrape failure.**", + "description": "Current number of compactions running per node. Should sit near the configured max concurrent compactions when there is work to do; a flat-zero reading while debt climbs means compactions are being blocked (for example, by a flush pipeline stall).", "fieldConfig": { "defaults": { - "color": {"mode": "thresholds"}, + "color": { + "mode": "thresholds" + }, "thresholds": { "mode": "absolute", "steps": [ - {"color": "red", "value": 0}, - {"color": "green", "value": 1} + { + "color": "green", + "value": 0 + }, + { + "color": "blue", + "value": 1 + }, + { + "color": "yellow", + "value": 3 + } ] }, - "mappings": [ - {"type": "value", "options": {"0": {"text": "off"}, "1": {"text": "on"}}} - ], "unit": "short" }, "overrides": [] }, - "gridPos": {"h": 6, "w": 8, "x": 16, "y": 18}, - "id": 10, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 20, "options": { "colorMode": "value", - "graphMode": "none", + "graphMode": "area", "orientation": "horizontal", "reduceOptions": { - "calcs": ["lastNotNull"], + "calcs": [ + "lastNotNull" + ], "fields": "", "values": false } @@ -418,21 +1050,23 @@ { "datasource": "$datasource", "editorMode": "code", - "expr": "elastickv_pebble_fsm_apply_sync_mode{job=\"elastickv\",node_id=~\"$node_id\",mode=\"nosync\"}", - "legendFormat": "{{node_id}} nosync", + "expr": "max by (node_id) (elastickv_pebble_compact_in_progress{job=\"elastickv\",node_id=~\"$node_id\"})", + "legendFormat": "{{node_id}}", "refId": "A", "instant": true } ], 
- "title": "FSM Apply Sync Mode (nosync = 1 is desired)", + "title": "Compactions In Progress (current)", "type": "stat" }, { "datasource": "$datasource", - "description": "Active memtable count per node. Normally 1-2; sustained growth here means flushes are not keeping up with writes, which cascades into L0 file growth and eventually a write stall.", + "description": "Compactions-in-progress trend per node. Pair with the debt trend: sustained ceiling at max concurrency during rising debt is the compaction-saturation signature.", "fieldConfig": { "defaults": { - "color": {"mode": "palette-classic"}, + "color": { + "mode": "palette-classic" + }, "custom": { "axisPlacement": "auto", "lineInterpolation": "linear", @@ -443,101 +1077,190 @@ }, "overrides": [] }, - "gridPos": {"h": 6, "w": 8, "x": 0, "y": 24}, - "id": 11, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 21, "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } }, "targets": [ { "datasource": "$datasource", "editorMode": "code", - "expr": "elastickv_pebble_memtable_count{job=\"elastickv\",node_id=~\"$node_id\"}", + "expr": "max by (node_id) (elastickv_pebble_compact_in_progress{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "range": true, "refId": "A" } ], - "title": "Memtable Count", + "title": "Compactions In Progress Over Time", "type": "timeseries" }, { "datasource": "$datasource", - "description": "Total bytes held in active memtables per node. Caps at ~MemTableSize * MaxMemtableCount; steady readings near that cap mean flush is the bottleneck.", + "description": "Rate of completed compactions per node. Correlate with the debt panel: a healthy engine shows debt oscillating while this rate is non-zero. 
Rate-of-counter metric, so no current-value stat companion.", "fieldConfig": { "defaults": { - "color": {"mode": "palette-classic"}, + "color": { + "mode": "palette-classic" + }, "custom": { "axisPlacement": "auto", "lineInterpolation": "linear", "lineWidth": 1, "showPoints": "auto" }, - "unit": "bytes" + "unit": "ops" }, "overrides": [] }, - "gridPos": {"h": 6, "w": 8, "x": 8, "y": 24}, - "id": 12, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 22, "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } }, "targets": [ { "datasource": "$datasource", "editorMode": "code", - "expr": "elastickv_pebble_memtable_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"}", + "expr": "rate(elastickv_pebble_compact_count_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__rate_interval])", "legendFormat": "{{node_id}}", "range": true, "refId": "A" } ], - "title": "Memtable Size", + "title": "Compaction Rate", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 43 + }, + "id": 23, + "panels": [], + "title": "FSM Apply", + "type": "row" + }, { "datasource": "$datasource", - "description": "Memtables that have been flushed but cannot yet be freed because an iterator or snapshot still references them. Climbing zombie counts indicate leaked iterators or long-lived snapshots pinning memory.", + "description": "FSM apply sync mode gauge: the label `mode` is `sync` or `nosync`, and the value is 1 for the active mode. Operators care because nosync gives the fastest Raft apply but loses unflushed writes on a crash; sync pays fsync on every batch. Green = nosync in this deploy's convention. **Requires `elastickv_pebble_fsm_apply_sync_mode` (PR #592). 
Empty until that PR merges - this is not a scrape failure.**", "fieldConfig": { "defaults": { - "color": {"mode": "palette-classic"}, - "custom": { - "axisPlacement": "auto", - "lineInterpolation": "linear", - "lineWidth": 1, - "showPoints": "auto" + "color": { + "mode": "thresholds" }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "mappings": [ + { + "type": "value", + "options": { + "0": { + "text": "off" + }, + "1": { + "text": "on" + } + } + } + ], "unit": "short" }, "overrides": [] }, - "gridPos": {"h": 6, "w": 8, "x": 16, "y": 24}, - "id": 13, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 24, "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "colorMode": "value", + "graphMode": "none", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } }, "targets": [ { "datasource": "$datasource", "editorMode": "code", - "expr": "elastickv_pebble_memtable_zombie_count{job=\"elastickv\",node_id=~\"$node_id\"}", - "legendFormat": "{{node_id}}", - "range": true, - "refId": "A" + "expr": "elastickv_pebble_fsm_apply_sync_mode{job=\"elastickv\",node_id=~\"$node_id\",mode=\"nosync\"}", + "legendFormat": "{{node_id}} nosync", + "refId": "A", + "instant": true } ], - "title": "Memtable Zombies", - "type": "timeseries" + "title": "FSM Apply Sync Mode (nosync = 1 is desired)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 25, + "panels": [], + "title": "Store Write Conflicts", + "type": "row" }, { "datasource": "$datasource", - "description": "Rate of OCC-style store write conflicts, grouped by key prefix. 
Hot prefixes light up here before they show up as Lua retry pressure or client timeouts; use this to find the hotspot key family.", + "description": "Rate of OCC-style store write conflicts, grouped by key prefix. Hot prefixes light up here before they show up as Lua retry pressure or client timeouts; use this to find the hotspot key family. Keyed by `key_prefix` rather than `node_id`, so no per-node stat companion applies.", "fieldConfig": { "defaults": { - "color": {"mode": "palette-classic"}, + "color": { + "mode": "palette-classic" + }, "custom": { "axisPlacement": "auto", "lineInterpolation": "linear", @@ -548,11 +1271,23 @@ }, "overrides": [] }, - "gridPos": {"h": 6, "w": 24, "x": 0, "y": 30}, - "id": 14, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 51 + }, + "id": 26, "options": { - "legend": {"displayMode": "list", "placement": "bottom", "showLegend": true}, - "tooltip": {"mode": "multi", "sort": "desc"} + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } }, "targets": [ { @@ -629,5 +1364,5 @@ "timezone": "browser", "title": "Elastickv Pebble Internals", "uid": "elastickv-pebble-internals", - "version": 1 + "version": 2 } From db6115feb4090eb24d0820262b6af796bdf3170d Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 06:14:50 +0900 Subject: [PATCH 2/3] fix(grafana): sum memtable bytes across raft groups per node Codex P2: the Memtable Size panels used max by (node_id), but elastickv_pebble_memtable_size_bytes is emitted per raft group. On nodes that host more than one group max would pick only the largest group memtable and drop the rest, so two 200 MiB groups still displayed 200 MiB instead of 400 MiB. That under-reports real per-node memtable pressure and could hide exactly the "too many memtables queued for flush" signal the paired trend panel exists to surface. 
Switched both the "Memtable Size (current)" stat and the "Memtable Size Over Time" timeseries to sum by (node_id). Updated the two panel descriptions to spell out the aggregation choice so a future reader can tell it is intentional (max is correct for L0 sublevels because sublevels is per-group stall risk, but wrong for memory totals). --- .../grafana/dashboards/elastickv-pebble-internals.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/monitoring/grafana/dashboards/elastickv-pebble-internals.json b/monitoring/grafana/dashboards/elastickv-pebble-internals.json index 8e544d50..2b2bbb09 100644 --- a/monitoring/grafana/dashboards/elastickv-pebble-internals.json +++ b/monitoring/grafana/dashboards/elastickv-pebble-internals.json @@ -653,7 +653,7 @@ }, { "datasource": "$datasource", - "description": "Current bytes held in active memtables per node. Caps at ~MemTableSize * MaxMemtableCount; steady readings near that cap mean flush is the bottleneck. Thresholds intentionally coarse because the meaningful ceiling is deploy-specific.", + "description": "Current bytes held in active memtables per node, summed across all raft groups on that node. `sum by (node_id)` — not `max` — because each group owns its own Pebble instance and real per-node memtable memory is the sum of every group's memtables; `max by (node_id)` would pick only the largest group and under-report pressure on nodes hosting more than one group. 
Thresholds intentionally coarse because the meaningful ceiling is deploy-specific.", "fieldConfig": { "defaults": { "color": { @@ -703,7 +703,7 @@ { "datasource": "$datasource", "editorMode": "code", - "expr": "max by (node_id) (elastickv_pebble_memtable_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", + "expr": "sum by (node_id) (elastickv_pebble_memtable_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "refId": "A", "instant": true @@ -714,7 +714,7 @@ }, { "datasource": "$datasource", - "description": "Memtable bytes trend per node. Pair with Memtable Count to distinguish 'few large memtables queued for flush' from 'many tiny memtables piling up'.", + "description": "Memtable bytes trend per node (summed across all raft groups on that node, matching the paired stat panel). Pair with Memtable Count to distinguish 'few large memtables queued for flush' from 'many tiny memtables piling up'.", "fieldConfig": { "defaults": { "color": { @@ -752,7 +752,7 @@ { "datasource": "$datasource", "editorMode": "code", - "expr": "max by (node_id) (elastickv_pebble_memtable_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", + "expr": "sum by (node_id) (elastickv_pebble_memtable_size_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "range": true, "refId": "A" From b16137e8442be0895db68ddaace74a2cb5ffd7bd Mon Sep 17 00:00:00 2001 From: "Yoshiaki Ueda (bootjp)" Date: Sat, 25 Apr 2026 06:34:04 +0900 Subject: [PATCH 3/3] fix(grafana): sum compaction debt + in-progress across groups per node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex P2 follow-up on db6115fe: same aggregation bug applied to the Compaction section. elastickv_pebble_compact_estimated_debt_bytes and elastickv_pebble_compact_in_progress are both emitted per raft group, but the panels reduced with max by (node_id). 
On multi-group nodes: - Two groups each holding 400 MiB of compaction debt rendered as 400 MiB instead of 800 MiB, hiding the real per-node backlog. - Two groups each running 2 compactions rendered as 2 instead of 4, hiding node-level compaction saturation. Switched both stat and timeseries panels for both metrics to sum by (node_id). Updated all four descriptions to spell out the aggregation choice and the (per-group max × group-count) ceiling operators should expect, so a future reader can tell the change is intentional. The L0 sublevels / num_files panels remain on max by (node_id) — those are per-group stall-risk signals where the worst group is what matters; summing would mask backpressure on the specific group being throttled. --- .../dashboards/elastickv-pebble-internals.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/monitoring/grafana/dashboards/elastickv-pebble-internals.json b/monitoring/grafana/dashboards/elastickv-pebble-internals.json index 2b2bbb09..ff82bdc1 100644 --- a/monitoring/grafana/dashboards/elastickv-pebble-internals.json +++ b/monitoring/grafana/dashboards/elastickv-pebble-internals.json @@ -890,7 +890,7 @@ }, { "datasource": "$datasource", - "description": "Pebble's own estimate of outstanding compaction work per node, in bytes. Growing unboundedly means compaction is losing the race. Thresholds: green <100 MiB, yellow 100-500 MiB, red >=500 MiB. Upper bound is context-dependent so no stall-line is drawn on the trend.", + "description": "Pebble estimate of outstanding compaction work per node, summed across all raft groups on that node, in bytes. `sum by (node_id)` — not `max` — because each group keeps its own compaction queue and real node-level backlog is the sum; `max` would drop all but the busiest group and under-report compaction pressure on nodes hosting more than one group. Growing unboundedly means compaction is losing the race. 
Thresholds: green <100 MiB, yellow 100-500 MiB, red >=500 MiB. Upper bound is context-dependent so no stall-line is drawn on the trend.", "fieldConfig": { "defaults": { "color": { @@ -940,7 +940,7 @@ { "datasource": "$datasource", "editorMode": "code", - "expr": "max by (node_id) (elastickv_pebble_compact_estimated_debt_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", + "expr": "sum by (node_id) (elastickv_pebble_compact_estimated_debt_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "refId": "A", "instant": true @@ -951,7 +951,7 @@ }, { "datasource": "$datasource", - "description": "Compaction debt trend per node. Compare against the Compaction Rate panel: healthy engines oscillate; a monotonically rising line is the ingest-vs-compaction gap.", + "description": "Compaction debt trend per node, summed across all raft groups (matching the paired stat panel). Compare against the Compaction Rate panel: healthy engines oscillate; a monotonically rising line is the ingest-vs-compaction gap.", "fieldConfig": { "defaults": { "color": { @@ -989,7 +989,7 @@ { "datasource": "$datasource", "editorMode": "code", - "expr": "max by (node_id) (elastickv_pebble_compact_estimated_debt_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", + "expr": "sum by (node_id) (elastickv_pebble_compact_estimated_debt_bytes{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "range": true, "refId": "A" @@ -1000,7 +1000,7 @@ }, { "datasource": "$datasource", - "description": "Current number of compactions running per node. Should sit near the configured max concurrent compactions when there is work to do; a flat-zero reading while debt climbs means compactions are being blocked (for example, by a flush pipeline stall).", + "description": "Current number of compactions running per node, summed across all raft groups on that node. 
`sum by (node_id)` — not `max` — because compaction concurrency is per-group, so two groups each running two compactions is four concurrent compactions on the node; `max` would report only two and hide node-level compaction saturation. Should sit near the (per-group max × group-count) ceiling when there is work to do; a flat-zero reading while debt climbs means compactions are being blocked (for example, by a flush pipeline stall).", "fieldConfig": { "defaults": { "color": { @@ -1050,7 +1050,7 @@ { "datasource": "$datasource", "editorMode": "code", - "expr": "max by (node_id) (elastickv_pebble_compact_in_progress{job=\"elastickv\",node_id=~\"$node_id\"})", + "expr": "sum by (node_id) (elastickv_pebble_compact_in_progress{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "refId": "A", "instant": true @@ -1061,7 +1061,7 @@ }, { "datasource": "$datasource", - "description": "Compactions-in-progress trend per node. Pair with the debt trend: sustained ceiling at max concurrency during rising debt is the compaction-saturation signature.", + "description": "Compactions-in-progress trend per node, summed across all raft groups (matching the paired stat panel). Pair with the debt trend: sustained ceiling at (per-group max × group-count) during rising debt is the compaction-saturation signature.", "fieldConfig": { "defaults": { "color": { @@ -1099,7 +1099,7 @@ { "datasource": "$datasource", "editorMode": "code", - "expr": "max by (node_id) (elastickv_pebble_compact_in_progress{job=\"elastickv\",node_id=~\"$node_id\"})", + "expr": "sum by (node_id) (elastickv_pebble_compact_in_progress{job=\"elastickv\",node_id=~\"$node_id\"})", "legendFormat": "{{node_id}}", "range": true, "refId": "A"