From 70bbdda4c8f5ffb635597fe5f428686bbdc69ec6 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 22 Apr 2026 14:32:41 -0700 Subject: [PATCH 01/31] Add CockroachDB schema for historical offload consumer Schema covers versions ingested from Kafka, per-key state mutations (store_name, key, version DESC for point lookups), and tree name upgrades mirroring ChangelogEntry.Upgrades. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/consumer/schema/schema.sql | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 sei-db/state_db/ss/offload/consumer/schema/schema.sql diff --git a/sei-db/state_db/ss/offload/consumer/schema/schema.sql b/sei-db/state_db/ss/offload/consumer/schema/schema.sql new file mode 100644 index 0000000000..33f540e12b --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/schema/schema.sql @@ -0,0 +1,29 @@ +-- CockroachDB schema for the historical offload consumer. +-- Applied once per cluster before starting the consumer. + +CREATE TABLE IF NOT EXISTS state_versions ( + version INT8 PRIMARY KEY, + kafka_topic STRING NOT NULL, + kafka_offset INT8 NOT NULL, + ingested_at TIMESTAMPTZ NOT NULL DEFAULT now() +); + +CREATE TABLE IF NOT EXISTS state_mutations ( + store_name STRING NOT NULL, + key BYTES NOT NULL, + version INT8 NOT NULL, + value BYTES NULL, + deleted BOOL NOT NULL DEFAULT false, + PRIMARY KEY (store_name, key, version DESC) +); + +CREATE INDEX IF NOT EXISTS state_mutations_by_version_idx + ON state_mutations (version); + +CREATE TABLE IF NOT EXISTS state_tree_upgrades ( + version INT8 NOT NULL, + name STRING NOT NULL, + rename_from STRING NOT NULL DEFAULT '', + delete BOOL NOT NULL DEFAULT false, + PRIMARY KEY (version, name) +); From 619f3a933f8bd0f83771ead38680892d9755857b Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 22 Apr 2026 14:34:06 -0700 Subject: [PATCH 02/31] Add Sink interface for historical offload consumer Defines the contract between the Kafka reader and any downstream persistent store. Record bundles the decoded ChangelogEntry with (topic, partition, offset) so sinks can enforce idempotency. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/consumer/sink.go | 25 +++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 sei-db/state_db/ss/offload/consumer/sink.go diff --git a/sei-db/state_db/ss/offload/consumer/sink.go b/sei-db/state_db/ss/offload/consumer/sink.go new file mode 100644 index 0000000000..1d30b9b2ed --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/sink.go @@ -0,0 +1,25 @@ +package consumer + +import ( + "context" + + dbproto "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +// Record is one Kafka message handed to a Sink, carrying the decoded +// ChangelogEntry plus the Kafka coordinates needed for idempotent writes. +type Record struct { + Topic string + Partition int + Offset int64 + Entry *dbproto.ChangelogEntry +} + +// Sink persists decoded changelog entries to a downstream store. +// Implementations must be safe to call sequentially from a single reader +// loop and should be idempotent on (topic, offset). +type Sink interface { + Write(ctx context.Context, rec Record) error + LastVersion(ctx context.Context) (int64, error) + Close() error +} From 71a267225bb628ce028efd142a827127a13934e2 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 22 Apr 2026 14:35:45 -0700 Subject: [PATCH 03/31] Add CockroachDB sink for historical offload consumer Writes each ChangelogEntry in a single transaction: version row, batched per-key mutations, and tree upgrades. 
Uses UPSERT so replay from a committed Kafka offset is a no-op. Uses the lib/pq driver already present in go.mod; DSN carries all tunables. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../state_db/ss/offload/consumer/cockroach.go | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 sei-db/state_db/ss/offload/consumer/cockroach.go diff --git a/sei-db/state_db/ss/offload/consumer/cockroach.go b/sei-db/state_db/ss/offload/consumer/cockroach.go new file mode 100644 index 0000000000..8e8837fafd --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/cockroach.go @@ -0,0 +1,192 @@ +package consumer + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + _ "github.com/lib/pq" +) + +// CockroachConfig configures the CockroachDB sink. DSN follows the standard +// libpq/pgx format (e.g. postgresql://user@host:26257/db?sslmode=verify-full). +// Use DSN params for knobs like statement_timeout rather than adding fields. +type CockroachConfig struct { + DSN string + MaxOpenConns int + MaxIdleConns int + ConnMaxLifetime time.Duration +} + +func (c *CockroachConfig) ApplyDefaults() { + if c.MaxOpenConns == 0 { + c.MaxOpenConns = 8 + } + if c.MaxIdleConns == 0 { + c.MaxIdleConns = c.MaxOpenConns + } + if c.ConnMaxLifetime == 0 { + c.ConnMaxLifetime = 30 * time.Minute + } +} + +func (c *CockroachConfig) Validate() error { + if strings.TrimSpace(c.DSN) == "" { + return fmt.Errorf("cockroach dsn is required") + } + if c.MaxOpenConns < 0 { + return fmt.Errorf("cockroach max open conns must be non-negative") + } + if c.MaxIdleConns < 0 { + return fmt.Errorf("cockroach max idle conns must be non-negative") + } + return nil +} + +type cockroachSink struct { + db *sql.DB +} + +var _ Sink = (*cockroachSink)(nil) + +// NewCockroachSink opens a pooled connection to CockroachDB. The caller is +// responsible for applying schema.sql beforehand. 
+func NewCockroachSink(cfg CockroachConfig) (Sink, error) { + cfg.ApplyDefaults() + if err := cfg.Validate(); err != nil { + return nil, err + } + + db, err := sql.Open("postgres", cfg.DSN) + if err != nil { + return nil, fmt.Errorf("open cockroach: %w", err) + } + db.SetMaxOpenConns(cfg.MaxOpenConns) + db.SetMaxIdleConns(cfg.MaxIdleConns) + db.SetConnMaxLifetime(cfg.ConnMaxLifetime) + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := db.PingContext(ctx); err != nil { + _ = db.Close() + return nil, fmt.Errorf("ping cockroach: %w", err) + } + + return &cockroachSink{db: db}, nil +} + +func (s *cockroachSink) Close() error { + return s.db.Close() +} + +func (s *cockroachSink) LastVersion(ctx context.Context) (int64, error) { + var v sql.NullInt64 + err := s.db.QueryRowContext(ctx, `SELECT max(version) FROM state_versions`).Scan(&v) + if err != nil { + return 0, fmt.Errorf("read last version: %w", err) + } + if !v.Valid { + return 0, nil + } + return v.Int64, nil +} + +func (s *cockroachSink) Write(ctx context.Context, rec Record) error { + if rec.Entry == nil { + return nil + } + + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("begin tx: %w", err) + } + defer func() { _ = tx.Rollback() }() + + if err := insertVersion(ctx, tx, rec); err != nil { + return err + } + if err := insertMutations(ctx, tx, rec); err != nil { + return err + } + if err := insertUpgrades(ctx, tx, rec); err != nil { + return err + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("commit tx: %w", err) + } + return nil +} + +func insertVersion(ctx context.Context, tx *sql.Tx, rec Record) error { + _, err := tx.ExecContext(ctx, ` + INSERT INTO state_versions (version, kafka_topic, kafka_offset) + VALUES ($1, $2, $3) + ON CONFLICT (version) DO NOTHING + `, rec.Entry.Version, rec.Topic, rec.Offset) + if err != nil { + return fmt.Errorf("insert version: %w", err) + } + return nil +} + +// mutationBatchRows caps rows per INSERT; CockroachDB handles large batches +// but smaller batches keep transaction retries cheap under contention. 
+const mutationBatchRows = 500 + +func insertMutations(ctx context.Context, tx *sql.Tx, rec Record) error { + version := rec.Entry.Version + var ( + args []interface{} + parts []string + ) + flush := func() error { + if len(parts) == 0 { + return nil + } + stmt := `INSERT INTO state_mutations (store_name, key, version, value, deleted) VALUES ` + + strings.Join(parts, ",") + + ` ON CONFLICT (store_name, key, version) DO UPDATE SET value = excluded.value, deleted = excluded.deleted` + if _, err := tx.ExecContext(ctx, stmt, args...); err != nil { + return fmt.Errorf("insert mutations: %w", err) + } + args = args[:0] + parts = parts[:0] + return nil + } + + for _, ncs := range rec.Entry.Changesets { + name := ncs.Name + for _, p := range ncs.Changeset.Pairs { + idx := len(args) + parts = append(parts, fmt.Sprintf("($%d,$%d,$%d,$%d,$%d)", idx+1, idx+2, idx+3, idx+4, idx+5)) + args = append(args, name, p.Key, version, p.Value, p.Delete) + if len(parts) >= mutationBatchRows { + if err := flush(); err != nil { + return err + } + } + } + } + return flush() +} + +func insertUpgrades(ctx context.Context, tx *sql.Tx, rec Record) error { + if len(rec.Entry.Upgrades) == 0 { + return nil + } + for _, up := range rec.Entry.Upgrades { + _, err := tx.ExecContext(ctx, ` + INSERT INTO state_tree_upgrades (version, name, rename_from, delete) + VALUES ($1, $2, $3, $4) + ON CONFLICT (version, name) DO UPDATE + SET rename_from = excluded.rename_from, delete = excluded.delete + `, rec.Entry.Version, up.Name, up.RenameFrom, up.Delete) + if err != nil { + return fmt.Errorf("insert upgrade: %w", err) + } + } + return nil +} From e0a0371f4a54c5f8fb987aa03dfd9c06704271cb Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 22 Apr 2026 14:37:22 -0700 Subject: [PATCH 04/31] Export SASL mechanism helper from offload package Lets the consumer reuse the producer's AWS MSK IAM / SASL code path without duplicating the AWS signing machinery. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/kafka.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sei-db/state_db/ss/offload/kafka.go b/sei-db/state_db/ss/offload/kafka.go index edbe366818..17ae565979 100644 --- a/sei-db/state_db/ss/offload/kafka.go +++ b/sei-db/state_db/ss/offload/kafka.go @@ -212,6 +212,12 @@ func kafkaCompression(name string) compress.Compression { } func kafkaSASLMechanism(cfg KafkaConfig) (sasl.Mechanism, error) { + return NewSASLMechanism(cfg) +} + +// NewSASLMechanism builds a SASL mechanism from a KafkaConfig, so consumers +// that live outside this package can share the same auth path as the producer. +func NewSASLMechanism(cfg KafkaConfig) (sasl.Mechanism, error) { switch strings.ToLower(cfg.SASLMechanism) { case "", kafkaOptionNone: return nil, nil From 8d7f6be46fc9379f5e8edbb83fd50573c7f8ac97 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 22 Apr 2026 14:38:18 -0700 Subject: [PATCH 05/31] Add Kafka reader + ChangelogEntry decoder for consumer KafkaReaderConfig mirrors the producer-side TLS/SASL knobs so AWS MSK IAM clusters work out of the box. NewKafkaReader returns a segmentio kafka.Reader configured for consumer-group reads; DecodeEntry unmarshals the protobuf payload. 
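Illustrative call pattern (a sketch for reviewers, not part of the
diff; broker, topic, and group values are placeholders):

    reader, err := NewKafkaReader(KafkaReaderConfig{
        Brokers:       []string{"b-1.example.kafka.amazonaws.com:9098"},
        Topic:         "historical-offload",
        GroupID:       "historical-offload-consumer",
        Region:        "us-east-1",
        TLSEnabled:    true,
        SASLMechanism: "aws-msk-iam",
    })
    // then, in the read loop:
    msg, err := reader.FetchMessage(ctx)
    entry, err := DecodeEntry(msg.Value)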
Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/consumer/kafka.go | 119 +++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 sei-db/state_db/ss/offload/consumer/kafka.go diff --git a/sei-db/state_db/ss/offload/consumer/kafka.go b/sei-db/state_db/ss/offload/consumer/kafka.go new file mode 100644 index 0000000000..f8d812511d --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/kafka.go @@ -0,0 +1,119 @@ +package consumer + +import ( + "crypto/tls" + "fmt" + "strings" + "time" + + gogoproto "github.com/gogo/protobuf/proto" + "github.com/segmentio/kafka-go" + + dbproto "github.com/sei-protocol/sei-chain/sei-db/proto" + "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload" +) + +// KafkaReaderConfig mirrors the fields of the producer-side KafkaConfig that +// matter to a consumer. TLS/SASL settings must match the producer cluster. +type KafkaReaderConfig struct { + Brokers []string + Topic string + GroupID string + ClientID string + Region string + StartOffset string // "first" or "last"; defaults to "first" + MinBytes int + MaxBytes int + MaxWait time.Duration + CommitInterval time.Duration + TLSEnabled bool + SASLMechanism string +} + +func (c *KafkaReaderConfig) ApplyDefaults() { + if c.ClientID == "" { + c.ClientID = "cryptosim-historical-offload-consumer" + } + if c.StartOffset == "" { + c.StartOffset = "first" + } + if c.MinBytes == 0 { + c.MinBytes = 1 + } + if c.MaxBytes == 0 { + c.MaxBytes = 10 << 20 + } + if c.MaxWait == 0 { + c.MaxWait = 500 * time.Millisecond + } +} + +func (c *KafkaReaderConfig) Validate() error { + if len(c.Brokers) == 0 { + return fmt.Errorf("kafka brokers are required") + } + if c.Topic == "" { + return fmt.Errorf("kafka topic is required") + } + if c.GroupID == "" { + return fmt.Errorf("kafka group id is required") + } + switch strings.ToLower(c.StartOffset) { + case "", "first", "last": + default: + return fmt.Errorf("unsupported kafka start offset %q", c.StartOffset) + } + return nil +} + +// NewKafkaReader builds a kafka.Reader configured for consumer-group reads. +// The reader uses the same TLS/SASL stack as the producer via the offload pkg. +func NewKafkaReader(cfg KafkaReaderConfig) (*kafka.Reader, error) { + cfg.ApplyDefaults() + if err := cfg.Validate(); err != nil { + return nil, err + } + + dialer := &kafka.Dialer{ + ClientID: cfg.ClientID, + Timeout: 10 * time.Second, + } + if cfg.TLSEnabled { + dialer.TLS = &tls.Config{MinVersion: tls.VersionTLS12} + } + mech, err := offload.NewSASLMechanism(offload.KafkaConfig{ + Region: cfg.Region, + TLSEnabled: cfg.TLSEnabled, + SASLMechanism: cfg.SASLMechanism, + }) + if err != nil { + return nil, err + } + dialer.SASLMechanism = mech + + start := kafka.FirstOffset + if strings.EqualFold(cfg.StartOffset, "last") { + start = kafka.LastOffset + } + + return kafka.NewReader(kafka.ReaderConfig{ + Brokers: cfg.Brokers, + Topic: cfg.Topic, + GroupID: cfg.GroupID, + Dialer: dialer, + MinBytes: cfg.MinBytes, + MaxBytes: cfg.MaxBytes, + MaxWait: cfg.MaxWait, + StartOffset: start, + CommitInterval: cfg.CommitInterval, + }), nil +} + +// DecodeEntry unmarshals a Kafka message payload into a ChangelogEntry. 
+func DecodeEntry(payload []byte) (*dbproto.ChangelogEntry, error) { + entry := &dbproto.ChangelogEntry{} + if err := gogoproto.Unmarshal(payload, entry); err != nil { + return nil, fmt.Errorf("decode changelog entry: %w", err) + } + return entry, nil +} From 537cac4c001a7ecfce1287706bf9d98df1d233aa Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 22 Apr 2026 14:38:57 -0700 Subject: [PATCH 06/31] Add consumer loop: Kafka fetch -> decode -> sink -> commit Single-threaded per reader to preserve per-partition ordering so the CockroachDB PK (store_name, key, version DESC) mirrors producer order. Offsets commit only after the sink has persisted, giving at-least-once delivery across restarts (sink is idempotent). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../state_db/ss/offload/consumer/consumer.go | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 sei-db/state_db/ss/offload/consumer/consumer.go diff --git a/sei-db/state_db/ss/offload/consumer/consumer.go b/sei-db/state_db/ss/offload/consumer/consumer.go new file mode 100644 index 0000000000..b42db5416c --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/consumer.go @@ -0,0 +1,70 @@ +package consumer + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/segmentio/kafka-go" +) + +// Consumer pulls messages from a Kafka reader, decodes them, writes to a Sink, +// and commits offsets. It is single-threaded by design: ordering per partition +// is required so the CockroachDB primary key (store_name, key, version DESC) +// reflects producer order. +type Consumer struct { + reader *kafka.Reader + sink Sink + logf func(format string, args ...interface{}) +} + +// Options are optional hooks for the consumer loop. +type Options struct { + Logf func(format string, args ...interface{}) +} + +func New(reader *kafka.Reader, sink Sink, opts Options) *Consumer { + logf := opts.Logf + if logf == nil { + logf = func(string, ...interface{}) {} + } + return &Consumer{reader: reader, sink: sink, logf: logf} +} + +// Run blocks until ctx is cancelled or an unrecoverable error occurs. +// It commits offsets only after the sink has persisted each message, so +// at-least-once delivery is preserved across restarts. +func (c *Consumer) Run(ctx context.Context) error { + for { + msg, err := c.reader.FetchMessage(ctx) + if err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return nil + } + return fmt.Errorf("fetch kafka message: %w", err) + } + + entry, err := DecodeEntry(msg.Value) + if err != nil { + return fmt.Errorf("decode message at offset %d: %w", msg.Offset, err) + } + + rec := Record{ + Topic: msg.Topic, + Partition: msg.Partition, + Offset: msg.Offset, + Entry: entry, + } + start := time.Now() + if err := c.sink.Write(ctx, rec); err != nil { + return fmt.Errorf("sink write version %d: %w", entry.Version, err) + } + c.logf("wrote version=%d partition=%d offset=%d in %s", + entry.Version, msg.Partition, msg.Offset, time.Since(start)) + + if err := c.reader.CommitMessages(ctx, msg); err != nil { + return fmt.Errorf("commit kafka offset %d: %w", msg.Offset, err) + } + } +} From 4311da6c182e30150e282c906f85982eb47f3047 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 22 Apr 2026 14:40:13 -0700 Subject: [PATCH 07/31] Add historical-offload-consumer binary and example config The cmd wires Kafka reader, CockroachDB sink, and consumer loop behind signal-triggered cancellation. Config is JSON, validated at load time. 
Example config documents the AWS MSK IAM + CockroachDB Cloud deployment shape. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cmd/historical-offload-consumer/main.go | 46 +++++++++++++++++++ sei-db/state_db/ss/offload/consumer/config.go | 39 ++++++++++++++++ .../ss/offload/consumer/config/example.json | 19 ++++++++ 3 files changed, 104 insertions(+) create mode 100644 sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go create mode 100644 sei-db/state_db/ss/offload/consumer/config.go create mode 100644 sei-db/state_db/ss/offload/consumer/config/example.json diff --git a/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go b/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go new file mode 100644 index 0000000000..963f84643a --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go @@ -0,0 +1,46 @@ +package main + +import ( + "context" + "fmt" + "log" + "os" + "os/signal" + "syscall" + + "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/consumer" +) + +func main() { + if len(os.Args) != 2 { + fmt.Fprintf(os.Stderr, "usage: %s \n", os.Args[0]) + os.Exit(2) + } + + cfg, err := consumer.LoadConfig(os.Args[1]) + if err != nil { + log.Fatalf("load config: %v", err) + } + + sink, err := consumer.NewCockroachSink(cfg.Cockroach) + if err != nil { + log.Fatalf("open cockroach sink: %v", err) + } + defer sink.Close() + + reader, err := consumer.NewKafkaReader(cfg.Kafka) + if err != nil { + log.Fatalf("open kafka reader: %v", err) + } + defer reader.Close() + + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + c := consumer.New(reader, sink, consumer.Options{ + Logf: func(format string, args ...interface{}) { log.Printf(format, args...) }, + }) + if err := c.Run(ctx); err != nil { + log.Fatalf("consumer: %v", err) + } +} diff --git a/sei-db/state_db/ss/offload/consumer/config.go b/sei-db/state_db/ss/offload/consumer/config.go new file mode 100644 index 0000000000..0d049d27df --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/config.go @@ -0,0 +1,39 @@ +package consumer + +import ( + "encoding/json" + "fmt" + "os" +) + +// Config is the top-level JSON config for the consumer binary. +type Config struct { + Kafka KafkaReaderConfig + Cockroach CockroachConfig +} + +func (c *Config) Validate() error { + if err := c.Kafka.Validate(); err != nil { + return fmt.Errorf("kafka: %w", err) + } + if err := c.Cockroach.Validate(); err != nil { + return fmt.Errorf("cockroach: %w", err) + } + return nil +} + +// LoadConfig reads a JSON config file from path and validates it. +func LoadConfig(path string) (*Config, error) { + raw, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("read config: %w", err) + } + cfg := &Config{} + if err := json.Unmarshal(raw, cfg); err != nil { + return nil, fmt.Errorf("parse config: %w", err) + } + if err := cfg.Validate(); err != nil { + return nil, err + } + return cfg, nil +} diff --git a/sei-db/state_db/ss/offload/consumer/config/example.json b/sei-db/state_db/ss/offload/consumer/config/example.json new file mode 100644 index 0000000000..2b0d7c3782 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/config/example.json @@ -0,0 +1,19 @@ +{ + "Comment": "Sample historical-offload-consumer config. 
Replace Kafka and Cockroach values for your environment.", + "Kafka": { + "Brokers": [ + "b-1.example.kafka.amazonaws.com:9098", + "b-2.example.kafka.amazonaws.com:9098" + ], + "Topic": "historical-offload", + "GroupID": "historical-offload-consumer", + "Region": "us-east-1", + "TLSEnabled": true, + "SASLMechanism": "aws-msk-iam", + "StartOffset": "first" + }, + "Cockroach": { + "DSN": "postgresql://offload_user@crdb.example.internal:26257/offload?sslmode=verify-full&statement_timeout=60000", + "MaxOpenConns": 16 + } +} From 2d2d1e39d393f3199c49224bb3a366b4ec33c509 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Wed, 22 Apr 2026 14:41:27 -0700 Subject: [PATCH 08/31] Add validation tests for consumer configs Covers Kafka reader config (required fields, StartOffset whitelist, defaults), CockroachDB config (DSN required, non-negative conn counts, defaults), and composed top-level Config. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/consumer/config_test.go | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 sei-db/state_db/ss/offload/consumer/config_test.go diff --git a/sei-db/state_db/ss/offload/consumer/config_test.go b/sei-db/state_db/ss/offload/consumer/config_test.go new file mode 100644 index 0000000000..3cb9b5e196 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/config_test.go @@ -0,0 +1,123 @@ +package consumer + +import ( + "strings" + "testing" + "time" +) + +func TestKafkaReaderConfigValidate(t *testing.T) { + tests := []struct { + name string + cfg KafkaReaderConfig + wantErr string + }{ + { + name: "missing brokers", + cfg: KafkaReaderConfig{Topic: "t", GroupID: "g"}, + wantErr: "brokers", + }, + { + name: "missing topic", + cfg: KafkaReaderConfig{Brokers: []string{"b:9092"}, GroupID: "g"}, + wantErr: "topic", + }, + { + name: "missing group id", + cfg: KafkaReaderConfig{Brokers: []string{"b:9092"}, Topic: "t"}, + wantErr: "group id", + }, + { + name: "bad start offset", + cfg: KafkaReaderConfig{Brokers: []string{"b:9092"}, Topic: "t", GroupID: "g", StartOffset: "middle"}, + wantErr: "start offset", + }, + { + name: "valid minimal", + cfg: KafkaReaderConfig{Brokers: []string{"b:9092"}, Topic: "t", GroupID: "g"}, + wantErr: "", + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + err := tc.cfg.Validate() + if tc.wantErr == "" { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + return + } + if err == nil || !strings.Contains(err.Error(), tc.wantErr) { + t.Fatalf("expected error containing %q, got %v", tc.wantErr, err) + } + }) + } +} + +func TestKafkaReaderConfigApplyDefaults(t *testing.T) { + cfg := KafkaReaderConfig{} + cfg.ApplyDefaults() + if cfg.ClientID == "" { + t.Fatal("client id should default") + } + if cfg.StartOffset != "first" { + t.Fatalf("start offset default = %q, want first", cfg.StartOffset) + } + if cfg.MaxBytes == 0 || cfg.MinBytes == 0 || cfg.MaxWait == 0 { + t.Fatalf("min/max bytes and max wait should default, got %+v", cfg) + } +} + +func TestCockroachConfigValidate(t *testing.T) { + tests := []struct { + name string + cfg CockroachConfig + wantErr string + }{ + {"missing dsn", CockroachConfig{}, "dsn"}, + {"blank dsn", CockroachConfig{DSN: " "}, "dsn"}, + {"negative open", CockroachConfig{DSN: "x", MaxOpenConns: -1}, "max open"}, + {"negative idle", CockroachConfig{DSN: "x", MaxIdleConns: -1}, "max idle"}, + {"valid", CockroachConfig{DSN: "postgresql://host/db"}, ""}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + err := tc.cfg.Validate() + if 
tc.wantErr == "" { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + return + } + if err == nil || !strings.Contains(err.Error(), tc.wantErr) { + t.Fatalf("expected error containing %q, got %v", tc.wantErr, err) + } + }) + } +} + +func TestCockroachConfigApplyDefaults(t *testing.T) { + cfg := CockroachConfig{DSN: "x"} + cfg.ApplyDefaults() + if cfg.MaxOpenConns == 0 || cfg.MaxIdleConns == 0 { + t.Fatalf("conn counts should default, got %+v", cfg) + } + if cfg.ConnMaxLifetime == 0 || cfg.ConnMaxLifetime > 24*time.Hour { + t.Fatalf("conn max lifetime default unreasonable: %v", cfg.ConnMaxLifetime) + } +} + +func TestConfigValidateComposes(t *testing.T) { + cfg := &Config{} + if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "kafka") { + t.Fatalf("expected kafka error, got %v", err) + } + cfg.Kafka = KafkaReaderConfig{Brokers: []string{"b:9092"}, Topic: "t", GroupID: "g"} + if err := cfg.Validate(); err == nil || !strings.Contains(err.Error(), "cockroach") { + t.Fatalf("expected cockroach error, got %v", err) + } + cfg.Cockroach = CockroachConfig{DSN: "postgresql://host/db"} + if err := cfg.Validate(); err != nil { + t.Fatalf("expected no error, got %v", err) + } +} From 08062508714315d25775711b87400f904942338c Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 23 Apr 2026 10:06:32 -0400 Subject: [PATCH 09/31] Add deploy.sh for historical-offload consumer setup Takes MSK and CockroachDB coordinates as env vars and does the rest: applies schema.sql (via cockroach sql or psql), writes the JSON config with 0600 perms, builds the binary, and optionally execs it. Cloud-side resources (MSK cluster + topic + IAM role, CockroachDB cluster + db + user) remain the operator's job. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/consumer/deploy.sh | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100755 sei-db/state_db/ss/offload/consumer/deploy.sh diff --git a/sei-db/state_db/ss/offload/consumer/deploy.sh b/sei-db/state_db/ss/offload/consumer/deploy.sh new file mode 100755 index 0000000000..96a397ef33 --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/deploy.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash +# Provisions the historical-offload consumer against a pre-existing MSK cluster +# and CockroachDB cluster. The cloud-side resources (MSK cluster, topic, IAM +# role, CockroachDB cluster + database + user) must already exist. +# +# Required env: +# KAFKA_BROKERS comma-separated broker endpoints (e.g. b-1.x.kafka.amazonaws.com:9098,b-2.x...) +# KAFKA_TOPIC topic cryptosim is publishing to +# KAFKA_GROUP_ID consumer group id +# AWS_REGION region for AWS MSK IAM signing (also exported for the binary at runtime) +# COCKROACH_DSN full postgresql:// DSN (include sslmode, statement_timeout, etc.) +# +# Optional env: +# KAFKA_TLS_ENABLED default true +# KAFKA_SASL_MECHANISM default aws-msk-iam ("" or "none" disables) +# KAFKA_START_OFFSET default first (first|last) +# COCKROACH_MAX_CONNS default 16 +# CONFIG_OUT default ./historical-offload-consumer.json +# BIN_OUT default ./bin/historical-offload-consumer +# SKIP_SCHEMA=1 skip applying schema.sql +# SKIP_BUILD=1 skip go build +# RUN=1 exec the binary at the end + +set -euo pipefail + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../../../../.." 
&>/dev/null && pwd)"
+
+: "${KAFKA_BROKERS:?set KAFKA_BROKERS}"
+: "${KAFKA_TOPIC:?set KAFKA_TOPIC}"
+: "${KAFKA_GROUP_ID:?set KAFKA_GROUP_ID}"
+: "${AWS_REGION:?set AWS_REGION}"
+: "${COCKROACH_DSN:?set COCKROACH_DSN}"
+
+KAFKA_TLS_ENABLED="${KAFKA_TLS_ENABLED:-true}"
+KAFKA_SASL_MECHANISM="${KAFKA_SASL_MECHANISM:-aws-msk-iam}"
+KAFKA_START_OFFSET="${KAFKA_START_OFFSET:-first}"
+COCKROACH_MAX_CONNS="${COCKROACH_MAX_CONNS:-16}"
+CONFIG_OUT="${CONFIG_OUT:-./historical-offload-consumer.json}"
+BIN_OUT="${BIN_OUT:-./bin/historical-offload-consumer}"
+
+log() { printf '[%s] %s\n' "$(date -u +%FT%TZ)" "$*"; }
+
+apply_schema() {
+  local schema="${SCRIPT_DIR}/schema/schema.sql"
+  [[ -f "$schema" ]] || { echo "schema file missing: $schema" >&2; exit 1; }
+
+  if command -v cockroach &>/dev/null; then
+    log "applying schema with cockroach sql"
+    cockroach sql --url="${COCKROACH_DSN}" <"$schema"
+  elif command -v psql &>/dev/null; then
+    log "applying schema with psql"
+    psql "${COCKROACH_DSN}" -v ON_ERROR_STOP=1 -f "$schema"
+  else
+    echo "need 'cockroach' or 'psql' on PATH to apply schema; set SKIP_SCHEMA=1 to bypass" >&2
+    exit 1
+  fi
+}
+
+write_config() {
+  log "writing config to ${CONFIG_OUT}"
+  mkdir -p "$(dirname "${CONFIG_OUT}")"
+
+  python3 - "$CONFIG_OUT" \
+    "$KAFKA_BROKERS" "$KAFKA_TOPIC" "$KAFKA_GROUP_ID" "$AWS_REGION" \
+    "$KAFKA_TLS_ENABLED" "$KAFKA_SASL_MECHANISM" "$KAFKA_START_OFFSET" \
+    "$COCKROACH_DSN" "$COCKROACH_MAX_CONNS" <<'PY'
+import json, sys
+
+(out, brokers, topic, group, region, tls, sasl, start, dsn, conns) = sys.argv[1:11]
+cfg = {
+    "Kafka": {
+        "Brokers": brokers.split(","),
+        "Topic": topic,
+        "GroupID": group,
+        "Region": region,
+        "TLSEnabled": tls.lower() == "true",
+        "SASLMechanism": sasl,
+        "StartOffset": start,
+    },
+    "Cockroach": {"DSN": dsn, "MaxOpenConns": int(conns)},
+}
+with open(out, "w") as f:
+    json.dump(cfg, f, indent=2)
+PY
+  chmod 0600 "${CONFIG_OUT}"
+}
+
+build_binary() {
+  log "building ${BIN_OUT}"
+  mkdir -p "$(dirname "${BIN_OUT}")"
+  BIN_ABS="$(cd "$(dirname "${BIN_OUT}")" && pwd)/$(basename "${BIN_OUT}")"
+  (cd "${REPO_ROOT}" && go build -o "${BIN_ABS}" ./sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer)
+}
+
+[[ "${SKIP_SCHEMA:-0}" == "1" ]] || apply_schema
+write_config
+[[ "${SKIP_BUILD:-0}" == "1" ]] || build_binary
+
+if [[ "${RUN:-0}" == "1" ]]; then
+  log "starting consumer"
+  exec env AWS_REGION="${AWS_REGION}" "${BIN_OUT}" "${CONFIG_OUT}"
+fi
+log "done; run with: ${BIN_OUT} ${CONFIG_OUT}"

From: kbhat1
Date: Thu, 23 Apr 2026 13:28:59 -0400
Subject: [PATCH 10/31] Add README for historical-offload-consumer

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 sei-db/state_db/ss/offload/consumer/README.md | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 sei-db/state_db/ss/offload/consumer/README.md

diff --git a/sei-db/state_db/ss/offload/consumer/README.md b/sei-db/state_db/ss/offload/consumer/README.md
new file mode 100644
index 0000000000..1a3394203d
--- /dev/null
+++ b/sei-db/state_db/ss/offload/consumer/README.md
@@ -0,0 +1,36 @@
+# historical-offload-consumer
+
+Reads `ChangelogEntry` messages from the Kafka topic cryptosim publishes to and writes them to CockroachDB.
+
+## Layout
+
+- `schema/schema.sql` — CockroachDB DDL (idempotent)
+- `cmd/historical-offload-consumer/` — CLI binary
+- `config/example.json` — sample config
+- `deploy.sh` — one-shot setup helper
+
+## Cloud prerequisites (manual)
+
+- MSK cluster + topic + IAM role with `kafka-cluster:Connect` and read on the topic
+- CockroachDB cluster + database + user
+- AWS credentials available to the process (env or IAM role)
+
+## Run
+
+```bash
+export KAFKA_BROKERS="b-1...:9098,b-2...:9098"
+export KAFKA_TOPIC="historical-offload"
+export KAFKA_GROUP_ID="historical-offload-consumer"
+export AWS_REGION="us-east-1"
+export COCKROACH_DSN="postgresql://user@host:26257/db?sslmode=verify-full"
+
+RUN=1 ./deploy.sh
+```
+
+`deploy.sh` applies the schema, writes the config, builds the binary, and (with `RUN=1`) starts it. Flags: `SKIP_SCHEMA=1`, `SKIP_BUILD=1`.
+
+## Guarantees
+
+- At-least-once delivery. Sink UPSERTs on `(store_name, key, version)` so replay is a no-op.
+- Per-partition ordering preserved (single-threaded loop per reader).
+- Offsets commit only after the sink persists the entry.

From ac42c35386d40efb9ee9dbef1400e5a25d6c9aa1 Mon Sep 17 00:00:00 2001
From: kbhat1
Date: Thu, 23 Apr 2026 16:06:02 -0400
Subject: [PATCH 11/31] Test DecodeEntry roundtrip and LoadConfig parsing

Covers protobuf encode/decode symmetry including the Delete flag,
rejection of malformed payloads, happy-path JSON config load, and
failure cases for invalid + missing config files.
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/consumer/config_test.go | 35 ++++++++++++++++ .../ss/offload/consumer/kafka_test.go | 42 +++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 sei-db/state_db/ss/offload/consumer/kafka_test.go diff --git a/sei-db/state_db/ss/offload/consumer/config_test.go b/sei-db/state_db/ss/offload/consumer/config_test.go index 3cb9b5e196..27e4a9ee43 100644 --- a/sei-db/state_db/ss/offload/consumer/config_test.go +++ b/sei-db/state_db/ss/offload/consumer/config_test.go @@ -1,9 +1,13 @@ package consumer import ( + "os" + "path/filepath" "strings" "testing" "time" + + "github.com/stretchr/testify/require" ) func TestKafkaReaderConfigValidate(t *testing.T) { @@ -121,3 +125,34 @@ func TestConfigValidateComposes(t *testing.T) { t.Fatalf("expected no error, got %v", err) } } + +func TestLoadConfig(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "cfg.json") + body := `{ + "Kafka": {"Brokers":["b:9092"], "Topic":"t", "GroupID":"g"}, + "Cockroach": {"DSN":"postgresql://host/db", "MaxOpenConns": 4} + }` + require.NoError(t, os.WriteFile(path, []byte(body), 0o600)) + + cfg, err := LoadConfig(path) + require.NoError(t, err) + require.Equal(t, []string{"b:9092"}, cfg.Kafka.Brokers) + require.Equal(t, "t", cfg.Kafka.Topic) + require.Equal(t, "postgresql://host/db", cfg.Cockroach.DSN) + require.Equal(t, 4, cfg.Cockroach.MaxOpenConns) +} + +func TestLoadConfigRejectsInvalid(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "cfg.json") + require.NoError(t, os.WriteFile(path, []byte(`{"Kafka":{}}`), 0o600)) + + _, err := LoadConfig(path) + require.Error(t, err) +} + +func TestLoadConfigMissingFile(t *testing.T) { + _, err := LoadConfig(filepath.Join(t.TempDir(), "nope.json")) + require.Error(t, err) +} diff --git a/sei-db/state_db/ss/offload/consumer/kafka_test.go b/sei-db/state_db/ss/offload/consumer/kafka_test.go new file mode 100644 index 0000000000..40e2d4866c --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/kafka_test.go @@ -0,0 +1,42 @@ +package consumer + +import ( + "testing" + + gogoproto "github.com/gogo/protobuf/proto" + "github.com/stretchr/testify/require" + + dbproto "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +func TestDecodeEntryRoundtrip(t *testing.T) { + entry := &dbproto.ChangelogEntry{ + Version: 42, + Changesets: []*dbproto.NamedChangeSet{{ + Name: "evm", + Changeset: dbproto.ChangeSet{ + Pairs: []*dbproto.KVPair{ + {Key: []byte("k1"), Value: []byte("v1")}, + {Key: []byte("k2"), Delete: true}, + }, + }, + }}, + } + + payload, err := gogoproto.Marshal(entry) + require.NoError(t, err) + + got, err := DecodeEntry(payload) + require.NoError(t, err) + require.Equal(t, entry.Version, got.Version) + require.Len(t, got.Changesets, 1) + require.Equal(t, "evm", got.Changesets[0].Name) + require.Len(t, got.Changesets[0].Changeset.Pairs, 2) + require.Equal(t, []byte("v1"), got.Changesets[0].Changeset.Pairs[0].Value) + require.True(t, got.Changesets[0].Changeset.Pairs[1].Delete) +} + +func TestDecodeEntryRejectsGarbage(t *testing.T) { + _, err := DecodeEntry([]byte{0xff, 0xff, 0xff}) + require.Error(t, err) +} From 5bb21e1c54ea311eebdaf3e478e9c43a830df683 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 23 Apr 2026 16:07:16 -0400 Subject: [PATCH 12/31] Test consumer loop end-to-end with fake source + sink Extracts a MessageSource interface (satisfied by *kafka.Reader) so the loop can be driven without a running cluster. 
Tests cover happy-path fetch/decode/write/commit ordering, that offsets are not committed when the sink errors, that decode errors stop the loop cleanly, and that context cancel returns nil. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../state_db/ss/offload/consumer/consumer.go | 13 +- .../ss/offload/consumer/consumer_test.go | 131 ++++++++++++++++++ 2 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 sei-db/state_db/ss/offload/consumer/consumer_test.go diff --git a/sei-db/state_db/ss/offload/consumer/consumer.go b/sei-db/state_db/ss/offload/consumer/consumer.go index b42db5416c..56fa4fac5b 100644 --- a/sei-db/state_db/ss/offload/consumer/consumer.go +++ b/sei-db/state_db/ss/offload/consumer/consumer.go @@ -9,12 +9,19 @@ import ( "github.com/segmentio/kafka-go" ) -// Consumer pulls messages from a Kafka reader, decodes them, writes to a Sink, +// MessageSource is the subset of *kafka.Reader the consumer uses. Extracting +// it lets tests drive the loop with a fake without a running Kafka. +type MessageSource interface { + FetchMessage(ctx context.Context) (kafka.Message, error) + CommitMessages(ctx context.Context, msgs ...kafka.Message) error +} + +// Consumer pulls messages from a MessageSource, decodes them, writes to a Sink, // and commits offsets. It is single-threaded by design: ordering per partition // is required so the CockroachDB primary key (store_name, key, version DESC) // reflects producer order. type Consumer struct { - reader *kafka.Reader + reader MessageSource sink Sink logf func(format string, args ...interface{}) } @@ -24,7 +31,7 @@ type Options struct { Logf func(format string, args ...interface{}) } -func New(reader *kafka.Reader, sink Sink, opts Options) *Consumer { +func New(reader MessageSource, sink Sink, opts Options) *Consumer { logf := opts.Logf if logf == nil { logf = func(string, ...interface{}) {} diff --git a/sei-db/state_db/ss/offload/consumer/consumer_test.go b/sei-db/state_db/ss/offload/consumer/consumer_test.go new file mode 100644 index 0000000000..48ab01fc5f --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/consumer_test.go @@ -0,0 +1,131 @@ +package consumer + +import ( + "context" + "errors" + "sync" + "testing" + "time" + + gogoproto "github.com/gogo/protobuf/proto" + "github.com/segmentio/kafka-go" + "github.com/stretchr/testify/require" + + dbproto "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +type fakeSource struct { + msgs []kafka.Message + fetchIdx int + committed []kafka.Message + fetchErr error + mu sync.Mutex +} + +func (f *fakeSource) FetchMessage(ctx context.Context) (kafka.Message, error) { + f.mu.Lock() + if f.fetchErr != nil { + err := f.fetchErr + f.mu.Unlock() + return kafka.Message{}, err + } + if f.fetchIdx < len(f.msgs) { + m := f.msgs[f.fetchIdx] + f.fetchIdx++ + f.mu.Unlock() + return m, nil + } + f.mu.Unlock() + <-ctx.Done() + return kafka.Message{}, ctx.Err() +} + +func (f *fakeSource) CommitMessages(ctx context.Context, msgs ...kafka.Message) error { + f.mu.Lock() + defer f.mu.Unlock() + f.committed = append(f.committed, msgs...) 
+ return nil +} + +type recordingSink struct { + records []Record + err error +} + +func (s *recordingSink) Write(ctx context.Context, rec Record) error { + if s.err != nil { + return s.err + } + s.records = append(s.records, rec) + return nil +} +func (s *recordingSink) LastVersion(ctx context.Context) (int64, error) { return 0, nil } +func (s *recordingSink) Close() error { return nil } + +func marshalEntry(t *testing.T, version int64, pairs ...*dbproto.KVPair) []byte { + t.Helper() + entry := &dbproto.ChangelogEntry{ + Version: version, + Changesets: []*dbproto.NamedChangeSet{{ + Name: "evm", + Changeset: dbproto.ChangeSet{Pairs: pairs}, + }}, + } + payload, err := gogoproto.Marshal(entry) + require.NoError(t, err) + return payload +} + +func TestConsumerRunWritesAndCommits(t *testing.T) { + src := &fakeSource{msgs: []kafka.Message{ + {Topic: "t", Partition: 0, Offset: 10, Value: marshalEntry(t, 1, &dbproto.KVPair{Key: []byte("a"), Value: []byte("1")})}, + {Topic: "t", Partition: 0, Offset: 11, Value: marshalEntry(t, 2, &dbproto.KVPair{Key: []byte("b"), Delete: true})}, + }} + sink := &recordingSink{} + + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + c := New(src, sink, Options{}) + err := c.Run(ctx) + require.NoError(t, err) + + require.Len(t, sink.records, 2) + require.Equal(t, int64(1), sink.records[0].Entry.Version) + require.Equal(t, int64(11), sink.records[1].Offset) + + require.Len(t, src.committed, 2) + require.Equal(t, int64(10), src.committed[0].Offset) + require.Equal(t, int64(11), src.committed[1].Offset) +} + +func TestConsumerRunSinkErrorStopsBeforeCommit(t *testing.T) { + src := &fakeSource{msgs: []kafka.Message{ + {Topic: "t", Offset: 1, Value: marshalEntry(t, 1)}, + }} + sink := &recordingSink{err: errors.New("sink boom")} + + c := New(src, sink, Options{}) + err := c.Run(context.Background()) + require.Error(t, err) + require.Contains(t, err.Error(), "sink boom") + require.Empty(t, src.committed, "offset must not be committed when sink fails") +} + +func TestConsumerRunDecodeErrorStops(t *testing.T) { + src := &fakeSource{msgs: []kafka.Message{ + {Topic: "t", Offset: 1, Value: []byte{0xff, 0xff}}, + }} + sink := &recordingSink{} + + err := New(src, sink, Options{}).Run(context.Background()) + require.Error(t, err) + require.Contains(t, err.Error(), "decode message") + require.Empty(t, sink.records) + require.Empty(t, src.committed) +} + +func TestConsumerRunCancelReturnsNil(t *testing.T) { + src := &fakeSource{fetchErr: context.Canceled} + err := New(src, &recordingSink{}, Options{}).Run(context.Background()) + require.NoError(t, err) +} From ff433d05fb2dfe2f3f933ba61ac2084a208bca6e Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Thu, 23 Apr 2026 16:08:29 -0400 Subject: [PATCH 13/31] Test mutation-batch SQL builder Extracts the SQL/args generation for state_mutations into a pure buildMutationBatches function. Tests cover empty entries, single batch composition (statement shape, placeholder numbering, arg order, delete flag), batch splitting at the row cap, multi-store changesets, and the default cap fallback. 
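For reference, a two-row entry produces a single batch whose generated
statement has this shape (wrapped here for readability):

    INSERT INTO state_mutations (store_name, key, version, value, deleted)
    VALUES ($1,$2,$3,$4,$5),($6,$7,$8,$9,$10)
    ON CONFLICT (store_name, key, version)
    DO UPDATE SET value = excluded.value, deleted = excluded.deleted

with Args ordered (store_name, key, version, value, deleted) per row.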
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../state_db/ss/offload/consumer/cockroach.go | 52 +++++---- .../ss/offload/consumer/cockroach_test.go | 101 ++++++++++++++++++ 2 files changed, 135 insertions(+), 18 deletions(-) create mode 100644 sei-db/state_db/ss/offload/consumer/cockroach_test.go diff --git a/sei-db/state_db/ss/offload/consumer/cockroach.go b/sei-db/state_db/ss/offload/consumer/cockroach.go index 8e8837fafd..3d83559cb1 100644 --- a/sei-db/state_db/ss/offload/consumer/cockroach.go +++ b/sei-db/state_db/ss/offload/consumer/cockroach.go @@ -136,41 +136,57 @@ func insertVersion(ctx context.Context, tx *sql.Tx, rec Record) error { // but smaller batches keep transaction retries cheap under contention. const mutationBatchRows = 500 -func insertMutations(ctx context.Context, tx *sql.Tx, rec Record) error { +// mutationBatch is one ready-to-execute INSERT with its parameter list. +type mutationBatch struct { + Stmt string + Args []interface{} +} + +// buildMutationBatches turns a ChangelogEntry into one or more parameterized +// INSERT statements. Pure function — no DB access — so it is unit-testable. +func buildMutationBatches(rec Record, maxRows int) []mutationBatch { + if maxRows <= 0 { + maxRows = mutationBatchRows + } version := rec.Entry.Version var ( - args []interface{} - parts []string + batches []mutationBatch + args []interface{} + parts []string ) - flush := func() error { + flush := func() { if len(parts) == 0 { - return nil + return } stmt := `INSERT INTO state_mutations (store_name, key, version, value, deleted) VALUES ` + strings.Join(parts, ",") + ` ON CONFLICT (store_name, key, version) DO UPDATE SET value = excluded.value, deleted = excluded.deleted` - if _, err := tx.ExecContext(ctx, stmt, args...); err != nil { - return fmt.Errorf("insert mutations: %w", err) - } - args = args[:0] - parts = parts[:0] - return nil + batches = append(batches, mutationBatch{Stmt: stmt, Args: args}) + args = nil + parts = nil } for _, ncs := range rec.Entry.Changesets { - name := ncs.Name for _, p := range ncs.Changeset.Pairs { idx := len(args) parts = append(parts, fmt.Sprintf("($%d,$%d,$%d,$%d,$%d)", idx+1, idx+2, idx+3, idx+4, idx+5)) - args = append(args, name, p.Key, version, p.Value, p.Delete) - if len(parts) >= mutationBatchRows { - if err := flush(); err != nil { - return err - } + args = append(args, ncs.Name, p.Key, version, p.Value, p.Delete) + if len(parts) >= maxRows { + flush() } } } - return flush() + flush() + return batches +} + +func insertMutations(ctx context.Context, tx *sql.Tx, rec Record) error { + for _, b := range buildMutationBatches(rec, mutationBatchRows) { + if _, err := tx.ExecContext(ctx, b.Stmt, b.Args...); err != nil { + return fmt.Errorf("insert mutations: %w", err) + } + } + return nil } func insertUpgrades(ctx context.Context, tx *sql.Tx, rec Record) error { diff --git a/sei-db/state_db/ss/offload/consumer/cockroach_test.go b/sei-db/state_db/ss/offload/consumer/cockroach_test.go new file mode 100644 index 0000000000..a5159507cb --- /dev/null +++ b/sei-db/state_db/ss/offload/consumer/cockroach_test.go @@ -0,0 +1,101 @@ +package consumer + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" + + dbproto "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +func makeRecord(version int64, changesets ...*dbproto.NamedChangeSet) Record { + return Record{ + Topic: "historical-offload", + Offset: version, + Entry: &dbproto.ChangelogEntry{ + Version: version, + Changesets: changesets, + }, + } +} + +func 
TestBuildMutationBatchesEmpty(t *testing.T) { + require.Empty(t, buildMutationBatches(makeRecord(1), 500)) +} + +func TestBuildMutationBatchesSingleBatch(t *testing.T) { + rec := makeRecord(7, &dbproto.NamedChangeSet{ + Name: "evm", + Changeset: dbproto.ChangeSet{Pairs: []*dbproto.KVPair{ + {Key: []byte("k1"), Value: []byte("v1")}, + {Key: []byte("k2"), Delete: true}, + }}, + }) + batches := buildMutationBatches(rec, 500) + require.Len(t, batches, 1) + + b := batches[0] + require.Contains(t, b.Stmt, "INSERT INTO state_mutations") + require.Contains(t, b.Stmt, "ON CONFLICT (store_name, key, version) DO UPDATE") + require.Contains(t, b.Stmt, "($1,$2,$3,$4,$5)") + require.Contains(t, b.Stmt, "($6,$7,$8,$9,$10)") + require.Equal(t, 2, strings.Count(b.Stmt, "($")) + require.Len(t, b.Args, 10) + + // First row: name, key, version, value, deleted. + require.Equal(t, "evm", b.Args[0]) + require.Equal(t, []byte("k1"), b.Args[1]) + require.Equal(t, int64(7), b.Args[2]) + require.Equal(t, []byte("v1"), b.Args[3]) + require.Equal(t, false, b.Args[4]) + // Second row: delete=true. + require.Equal(t, true, b.Args[9]) +} + +func TestBuildMutationBatchesSplits(t *testing.T) { + pairs := make([]*dbproto.KVPair, 250) + for i := range pairs { + pairs[i] = &dbproto.KVPair{Key: []byte{byte(i)}, Value: []byte{0x1}} + } + rec := makeRecord(9, &dbproto.NamedChangeSet{ + Name: "bank", + Changeset: dbproto.ChangeSet{Pairs: pairs}, + }) + + batches := buildMutationBatches(rec, 100) + require.Len(t, batches, 3) // 100 + 100 + 50 + require.Len(t, batches[0].Args, 500) + require.Len(t, batches[1].Args, 500) + require.Len(t, batches[2].Args, 250) +} + +func TestBuildMutationBatchesAcrossStores(t *testing.T) { + rec := makeRecord(3, + &dbproto.NamedChangeSet{ + Name: "evm", + Changeset: dbproto.ChangeSet{Pairs: []*dbproto.KVPair{{Key: []byte("a"), Value: []byte("1")}}}, + }, + &dbproto.NamedChangeSet{ + Name: "bank", + Changeset: dbproto.ChangeSet{Pairs: []*dbproto.KVPair{{Key: []byte("b"), Value: []byte("2")}}}, + }, + ) + batches := buildMutationBatches(rec, 500) + require.Len(t, batches, 1) + require.Equal(t, "evm", batches[0].Args[0]) + require.Equal(t, "bank", batches[0].Args[5]) +} + +func TestBuildMutationBatchesDefaultCap(t *testing.T) { + pairs := make([]*dbproto.KVPair, mutationBatchRows+1) + for i := range pairs { + pairs[i] = &dbproto.KVPair{Key: []byte{byte(i)}} + } + rec := makeRecord(1, &dbproto.NamedChangeSet{ + Name: "x", + Changeset: dbproto.ChangeSet{Pairs: pairs}, + }) + batches := buildMutationBatches(rec, 0) + require.Len(t, batches, 2) +} From a5b577abcd7294247b1f052a6b93b5726c9bd578 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 27 Apr 2026 13:22:16 -0400 Subject: [PATCH 14/31] Retry sink writes with bounded exponential backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps sink.Write with a small retry loop (defaults: 5 attempts, 1s→30s) so transient CockroachDB errors no longer crash the consumer on the first failure. Sleeps are ctx-aware; on giving up, the wrapped error propagates and the process exits non-zero — Kafka offsets remain uncommitted, so the supervisor restart replays from the last committed offset. Retry knobs are exposed on Options with zero-value defaults so tests can use ms-scale backoffs. 
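At the defaults the per-message schedule is: attempt 1, sleep 1s,
attempt 2, sleep 2s, attempt 3, sleep 4s, attempt 4, sleep 8s,
attempt 5, then the wrapped error is returned. Tests shrink the knobs
to keep runtime flat, e.g.:

    c := New(src, sink, Options{
        SinkMaxAttempts: 5,
        SinkBaseBackoff: time.Millisecond,
        SinkMaxBackoff:  2 * time.Millisecond,
    })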
Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/consumer/README.md | 3 + .../state_db/ss/offload/consumer/consumer.go | 84 +++++++++++++++++-- .../ss/offload/consumer/consumer_test.go | 43 +++++++++- 3 files changed, 123 insertions(+), 7 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/README.md b/sei-db/state_db/ss/offload/consumer/README.md index 1a3394203d..52354f3645 100644 --- a/sei-db/state_db/ss/offload/consumer/README.md +++ b/sei-db/state_db/ss/offload/consumer/README.md @@ -34,3 +34,6 @@ RUN=1 ./deploy.sh - At-least-once delivery. Sink UPSERTs on `(store_name, key, version)` so replay is a no-op. - Per-partition ordering preserved (single-threaded loop per reader). - Offsets commit only after the sink persists the entry. +- Sink writes use bounded exponential backoff (5 attempts, 1s→30s) before + giving up. On give-up the process exits non-zero so the supervisor restarts; + Kafka offsets stay uncommitted, so the next run replays from the last commit. diff --git a/sei-db/state_db/ss/offload/consumer/consumer.go b/sei-db/state_db/ss/offload/consumer/consumer.go index 56fa4fac5b..cd3e60c88e 100644 --- a/sei-db/state_db/ss/offload/consumer/consumer.go +++ b/sei-db/state_db/ss/offload/consumer/consumer.go @@ -21,14 +21,31 @@ type MessageSource interface { // is required so the CockroachDB primary key (store_name, key, version DESC) // reflects producer order. type Consumer struct { - reader MessageSource - sink Sink - logf func(format string, args ...interface{}) + reader MessageSource + sink Sink + logf func(format string, args ...interface{}) + maxAttempts int + baseBackoff time.Duration + maxBackoff time.Duration } -// Options are optional hooks for the consumer loop. +// Default retry knobs for sink writes. Total wait at defaults ≈ 1+2+4+8 = 15s +// before giving up and letting the process supervisor restart us. +const ( + defaultSinkMaxAttempts = 5 + defaultSinkBaseBackoff = 1 * time.Second + defaultSinkMaxBackoff = 30 * time.Second +) + +// Options are optional hooks for the consumer loop. Zero values pick defaults. type Options struct { Logf func(format string, args ...interface{}) + // SinkMaxAttempts caps total sink.Write attempts per message (>=1). + SinkMaxAttempts int + // SinkBaseBackoff is the initial backoff between retries; doubles each retry. + SinkBaseBackoff time.Duration + // SinkMaxBackoff caps the per-retry backoff. + SinkMaxBackoff time.Duration } func New(reader MessageSource, sink Sink, opts Options) *Consumer { @@ -36,7 +53,26 @@ func New(reader MessageSource, sink Sink, opts Options) *Consumer { if logf == nil { logf = func(string, ...interface{}) {} } - return &Consumer{reader: reader, sink: sink, logf: logf} + maxAttempts := opts.SinkMaxAttempts + if maxAttempts <= 0 { + maxAttempts = defaultSinkMaxAttempts + } + base := opts.SinkBaseBackoff + if base <= 0 { + base = defaultSinkBaseBackoff + } + maxBackoff := opts.SinkMaxBackoff + if maxBackoff <= 0 { + maxBackoff = defaultSinkMaxBackoff + } + return &Consumer{ + reader: reader, + sink: sink, + logf: logf, + maxAttempts: maxAttempts, + baseBackoff: base, + maxBackoff: maxBackoff, + } } // Run blocks until ctx is cancelled or an unrecoverable error occurs. 
@@ -64,7 +100,10 @@ func (c *Consumer) Run(ctx context.Context) error { Entry: entry, } start := time.Now() - if err := c.sink.Write(ctx, rec); err != nil { + if err := c.writeWithRetry(ctx, rec); err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return nil + } return fmt.Errorf("sink write version %d: %w", entry.Version, err) } c.logf("wrote version=%d partition=%d offset=%d in %s", @@ -75,3 +114,36 @@ func (c *Consumer) Run(ctx context.Context) error { } } } + +// writeWithRetry calls sink.Write with bounded exponential backoff. It returns +// the underlying error after the final attempt, or ctx.Err() if cancelled +// while sleeping between retries. +func (c *Consumer) writeWithRetry(ctx context.Context, rec Record) error { + backoff := c.baseBackoff + var lastErr error + for attempt := 1; attempt <= c.maxAttempts; attempt++ { + err := c.sink.Write(ctx, rec) + if err == nil { + return nil + } + lastErr = err + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return err + } + if attempt == c.maxAttempts { + break + } + c.logf("sink write attempt %d/%d failed: %v; retrying in %s", + attempt, c.maxAttempts, err, backoff) + select { + case <-time.After(backoff): + case <-ctx.Done(): + return ctx.Err() + } + backoff *= 2 + if backoff > c.maxBackoff { + backoff = c.maxBackoff + } + } + return fmt.Errorf("sink write failed after %d attempts: %w", c.maxAttempts, lastErr) +} diff --git a/sei-db/state_db/ss/offload/consumer/consumer_test.go b/sei-db/state_db/ss/offload/consumer/consumer_test.go index 48ab01fc5f..349349e3e3 100644 --- a/sei-db/state_db/ss/offload/consumer/consumer_test.go +++ b/sei-db/state_db/ss/offload/consumer/consumer_test.go @@ -62,6 +62,23 @@ func (s *recordingSink) Write(ctx context.Context, rec Record) error { func (s *recordingSink) LastVersion(ctx context.Context) (int64, error) { return 0, nil } func (s *recordingSink) Close() error { return nil } +// flakySink fails the first failuresLeft Write calls, then succeeds. 
+type flakySink struct { + failuresLeft int + attempts int +} + +func (s *flakySink) Write(ctx context.Context, rec Record) error { + s.attempts++ + if s.failuresLeft > 0 { + s.failuresLeft-- + return errors.New("transient") + } + return nil +} +func (s *flakySink) LastVersion(ctx context.Context) (int64, error) { return 0, nil } +func (s *flakySink) Close() error { return nil } + func marshalEntry(t *testing.T, version int64, pairs ...*dbproto.KVPair) []byte { t.Helper() entry := &dbproto.ChangelogEntry{ @@ -104,13 +121,37 @@ func TestConsumerRunSinkErrorStopsBeforeCommit(t *testing.T) { }} sink := &recordingSink{err: errors.New("sink boom")} - c := New(src, sink, Options{}) + c := New(src, sink, Options{ + SinkMaxAttempts: 2, + SinkBaseBackoff: time.Millisecond, + SinkMaxBackoff: time.Millisecond, + }) err := c.Run(context.Background()) require.Error(t, err) require.Contains(t, err.Error(), "sink boom") require.Empty(t, src.committed, "offset must not be committed when sink fails") } +func TestConsumerRunRetriesSinkUntilSuccess(t *testing.T) { + src := &fakeSource{msgs: []kafka.Message{ + {Topic: "t", Partition: 0, Offset: 5, Value: marshalEntry(t, 1, &dbproto.KVPair{Key: []byte("a"), Value: []byte("1")})}, + }} + sink := &flakySink{failuresLeft: 2} + + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + c := New(src, sink, Options{ + SinkMaxAttempts: 5, + SinkBaseBackoff: time.Millisecond, + SinkMaxBackoff: 2 * time.Millisecond, + }) + require.NoError(t, c.Run(ctx)) + + require.Equal(t, 3, sink.attempts, "sink should be retried until it succeeds") + require.Len(t, src.committed, 1) + require.Equal(t, int64(5), src.committed[0].Offset) +} + func TestConsumerRunDecodeErrorStops(t *testing.T) { src := &fakeSource{msgs: []kafka.Message{ {Topic: "t", Offset: 1, Value: []byte{0xff, 0xff}}, From 0f7ee48fb1a4039ba5dcd3b29393afc6aa172211 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 27 Apr 2026 13:22:56 -0400 Subject: [PATCH 15/31] Fix golangci-lint findings in consumer package - cockroach.go: preallocate args/parts slices in buildMutationBatches with capacities derived from maxRows (prealloc). - config.go: annotate os.ReadFile with #nosec G304; the path is an operator-supplied CLI arg, not user-controlled at runtime. - main.go: wrap deferred sink.Close / reader.Close so errcheck does not flag the discarded return. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../consumer/cmd/historical-offload-consumer/main.go | 4 ++-- sei-db/state_db/ss/offload/consumer/cockroach.go | 9 +++++---- sei-db/state_db/ss/offload/consumer/config.go | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go b/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go index 963f84643a..40294f1263 100644 --- a/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go +++ b/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go @@ -26,13 +26,13 @@ func main() { if err != nil { log.Fatalf("open cockroach sink: %v", err) } - defer sink.Close() + defer func() { _ = sink.Close() }() reader, err := consumer.NewKafkaReader(cfg.Kafka) if err != nil { log.Fatalf("open kafka reader: %v", err) } - defer reader.Close() + defer func() { _ = reader.Close() }() ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer cancel() diff --git a/sei-db/state_db/ss/offload/consumer/cockroach.go b/sei-db/state_db/ss/offload/consumer/cockroach.go index 3d83559cb1..06ece5f737 100644 --- a/sei-db/state_db/ss/offload/consumer/cockroach.go +++ b/sei-db/state_db/ss/offload/consumer/cockroach.go @@ -149,10 +149,11 @@ func buildMutationBatches(rec Record, maxRows int) []mutationBatch { maxRows = mutationBatchRows } version := rec.Entry.Version + const colsPerRow = 5 var ( batches []mutationBatch - args []interface{} - parts []string + args = make([]interface{}, 0, maxRows*colsPerRow) + parts = make([]string, 0, maxRows) ) flush := func() { if len(parts) == 0 { @@ -162,8 +163,8 @@ func buildMutationBatches(rec Record, maxRows int) []mutationBatch { strings.Join(parts, ",") + ` ON CONFLICT (store_name, key, version) DO UPDATE SET value = excluded.value, deleted = excluded.deleted` batches = append(batches, mutationBatch{Stmt: stmt, Args: args}) - args = nil - parts = nil + args = make([]interface{}, 0, maxRows*colsPerRow) + parts = make([]string, 0, maxRows) } for _, ncs := range rec.Entry.Changesets { diff --git a/sei-db/state_db/ss/offload/consumer/config.go b/sei-db/state_db/ss/offload/consumer/config.go index 0d049d27df..ac8c84bfe2 100644 --- a/sei-db/state_db/ss/offload/consumer/config.go +++ b/sei-db/state_db/ss/offload/consumer/config.go @@ -24,6 +24,7 @@ func (c *Config) Validate() error { // LoadConfig reads a JSON config file from path and validates it. func LoadConfig(path string) (*Config, error) { + // #nosec G304 -- config path is supplied by the operator on the command line. raw, err := os.ReadFile(path) if err != nil { return nil, fmt.Errorf("read config: %w", err) From 37efd20254e35d08377f1104b57a920807365377 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 27 Apr 2026 17:05:24 -0400 Subject: [PATCH 16/31] Drop CommitInterval from consumer KafkaReaderConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The consumer's at-least-once guarantee depends on synchronous offset commits (commit only after the sink persists). Exposing CommitInterval let operators silently switch to async commits and weaken that, so remove the knob — kafka-go's zero value is the sync behavior we want. 
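The behavior we rely on is kafka-go's documented default: with
CommitInterval at zero, CommitMessages blocks until the commit is
acknowledged, so the loop's post-persist commit is durable by the
time it returns:

    if err := c.reader.CommitMessages(ctx, msg); err != nil {
        return fmt.Errorf("commit kafka offset %d: %w", msg.Offset, err)
    }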
Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/consumer/kafka.go | 44 ++++++++++---------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/kafka.go b/sei-db/state_db/ss/offload/consumer/kafka.go index f8d812511d..80ff5d2118 100644 --- a/sei-db/state_db/ss/offload/consumer/kafka.go +++ b/sei-db/state_db/ss/offload/consumer/kafka.go @@ -15,19 +15,22 @@ import ( // KafkaReaderConfig mirrors the fields of the producer-side KafkaConfig that // matter to a consumer. TLS/SASL settings must match the producer cluster. +// +// Commits are intentionally synchronous (kafka-go's zero CommitInterval): the +// consumer relies on offsets advancing only after the sink has persisted each +// entry, so we don't expose a knob that could silently weaken that guarantee. type KafkaReaderConfig struct { - Brokers []string - Topic string - GroupID string - ClientID string - Region string - StartOffset string // "first" or "last"; defaults to "first" - MinBytes int - MaxBytes int - MaxWait time.Duration - CommitInterval time.Duration - TLSEnabled bool - SASLMechanism string + Brokers []string + Topic string + GroupID string + ClientID string + Region string + StartOffset string // "first" or "last"; defaults to "first" + MinBytes int + MaxBytes int + MaxWait time.Duration + TLSEnabled bool + SASLMechanism string } func (c *KafkaReaderConfig) ApplyDefaults() { @@ -97,15 +100,14 @@ func NewKafkaReader(cfg KafkaReaderConfig) (*kafka.Reader, error) { } return kafka.NewReader(kafka.ReaderConfig{ - Brokers: cfg.Brokers, - Topic: cfg.Topic, - GroupID: cfg.GroupID, - Dialer: dialer, - MinBytes: cfg.MinBytes, - MaxBytes: cfg.MaxBytes, - MaxWait: cfg.MaxWait, - StartOffset: start, - CommitInterval: cfg.CommitInterval, + Brokers: cfg.Brokers, + Topic: cfg.Topic, + GroupID: cfg.GroupID, + Dialer: dialer, + MinBytes: cfg.MinBytes, + MaxBytes: cfg.MaxBytes, + MaxWait: cfg.MaxWait, + StartOffset: start, }), nil } From a00f03a697a8945bc75036a59ae1327665e7aa33 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 27 Apr 2026 17:59:45 -0400 Subject: [PATCH 17/31] Hash-shard monotonic offload schema indexes state_versions(version) and state_mutations_by_version_idx both have strictly-increasing keys (block height). Without sharding, every write lands on the same range at the head of the keyspace, capping ingest throughput on a single leaseholder. USING HASH WITH (bucket_count = 16) spreads the writes across 16 ranges; the by-version index is only used for pruning/diagnostics so the small fan-out cost on range scans is irrelevant. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/consumer/schema/schema.sql | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/schema/schema.sql b/sei-db/state_db/ss/offload/consumer/schema/schema.sql index 33f540e12b..8644c885c9 100644 --- a/sei-db/state_db/ss/offload/consumer/schema/schema.sql +++ b/sei-db/state_db/ss/offload/consumer/schema/schema.sql @@ -1,11 +1,17 @@ -- CockroachDB schema for the historical offload consumer. -- Applied once per cluster before starting the consumer. +-- state_versions and the by-version index on state_mutations both have a +-- strictly-increasing key (block height), which without sharding turns the +-- head of the keyspace into a single-range write hotspot. Hash-sharding +-- spreads writes across 16 ranges. 
Range scans on `version` still work, with +-- a small per-range fan-out cost that is irrelevant for our query mix. CREATE TABLE IF NOT EXISTS state_versions ( - version INT8 PRIMARY KEY, - kafka_topic STRING NOT NULL, - kafka_offset INT8 NOT NULL, - ingested_at TIMESTAMPTZ NOT NULL DEFAULT now() + version INT8 NOT NULL, + kafka_topic STRING NOT NULL, + kafka_offset INT8 NOT NULL, + ingested_at TIMESTAMPTZ NOT NULL DEFAULT now(), + PRIMARY KEY (version) USING HASH WITH (bucket_count = 16) ); CREATE TABLE IF NOT EXISTS state_mutations ( @@ -18,7 +24,7 @@ CREATE TABLE IF NOT EXISTS state_mutations ( ); CREATE INDEX IF NOT EXISTS state_mutations_by_version_idx - ON state_mutations (version); + ON state_mutations (version) USING HASH WITH (bucket_count = 16); CREATE TABLE IF NOT EXISTS state_tree_upgrades ( version INT8 NOT NULL, From b5714d29cf43eda89902494d1e7070b32afb40df Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 27 Apr 2026 18:00:14 -0400 Subject: [PATCH 18/31] Add Reader interface for historical state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defines the read counterpart to consumer.Sink: a Reader serving point-in-time queries against the offload store. The interface is deliberately minimal — Get, BatchGet, LastVersion, Close — because the hot path for debug_trace is the batch form, which lets a single round trip resolve every (store, key) pair a trace needs at one version. Tombstones collapse to absence at the API boundary so callers don't have to special-case them. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../state_db/ss/offload/historical/reader.go | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 sei-db/state_db/ss/offload/historical/reader.go diff --git a/sei-db/state_db/ss/offload/historical/reader.go b/sei-db/state_db/ss/offload/historical/reader.go new file mode 100644 index 0000000000..63b65ceede --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/reader.go @@ -0,0 +1,54 @@ +// Package historical serves point-in-time historical state queries from the +// CockroachDB-backed offload store written by the consumer package. It is +// intended for read-heavy workloads such as debug_traceTransaction, which +// resolves thousands of (store, key, version) tuples per request. +package historical + +import ( + "context" + "errors" +) + +// ErrNotFound is returned by Get when no row exists for (storeName, key) at +// or before the target version, or when the latest such row is a tombstone. +var ErrNotFound = errors.New("historical state not found") + +// Lookup names a (store, key) pair the caller wants resolved at a target +// version. Key uses string for byte content so Lookup can be a map key — the +// usual []byte-as-string idiom is fine because the bytes are immutable. +type Lookup struct { + StoreName string + Key string +} + +// Value is the resolved row at or before the requested target version. +// Version reports the actual MVCC version that satisfied the lookup, which +// may be older than the requested target. Tombstones are not surfaced via +// Value: the reader collapses them to absence at the API boundary. +type Value struct { + Bytes []byte + Version int64 +} + +// Reader serves historical state queries from the offload store. Reads are +// snapshot-consistent and may be served by follower replicas when configured +// (see CockroachConfig.FollowerReadStaleness). +type Reader interface { + // Get returns the row at the latest version <= targetVersion. 
Returns + // ErrNotFound if no such row exists or if the latest row is a tombstone. + Get(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error) + + // BatchGet resolves many (store, key) pairs at the same target version in + // a single round-trip. Pairs that don't exist at or before the target + // version, or whose latest row is a tombstone, are absent from the + // returned map. This is the primary API for trace-style workloads where + // per-key round-trips would dominate latency. + BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error) + + // LastVersion returns the largest version successfully ingested by the + // offload consumer. Trace clients should clamp targetVersion to this so + // they don't query versions still in flight on the Kafka side. + LastVersion(ctx context.Context) (int64, error) + + Close() error +} From 47cda76a27fd991da9ad28e5dbaa07c0d4c73549 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 27 Apr 2026 18:01:16 -0400 Subject: [PATCH 19/31] Add CockroachDB historical Reader implementation Implements Reader on top of the offload schema with two debug_trace-targeted optimizations: 1. Bounded-stale follower reads. When CockroachConfig.FollowerReadStaleness is set, every read is wrapped in a read-only transaction pinned via `SET TRANSACTION AS OF SYSTEM TIME with_max_staleness(...)`. Trace replays only need committed historical state, so any replica can serve and the read avoids a leaseholder hop. 2. Batched LATERAL lookups. BatchGet packs an entire request's (store, key) set into one query that uses unnest() + LATERAL LIMIT 1 against the descending PK, so each pair resolves with a single index seek and the whole batch costs one round-trip. This is the SQL analogue of the per-request cache + reusable iterator tricks used on the pebble side. Tombstones collapse to absence in BatchGet/Get so callers don't have to special-case deleted rows. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/historical/cockroach.go | 226 ++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 sei-db/state_db/ss/offload/historical/cockroach.go diff --git a/sei-db/state_db/ss/offload/historical/cockroach.go b/sei-db/state_db/ss/offload/historical/cockroach.go new file mode 100644 index 0000000000..4294514a4c --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/cockroach.go @@ -0,0 +1,226 @@ +package historical + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/lib/pq" +) + +// CockroachConfig configures the historical-state Reader. DSN follows the +// standard libpq/pgx format (e.g. postgresql://user@host:26257/db?sslmode=verify-full). +// +// FollowerReadStaleness, when non-zero, switches reads to +// +// AS OF SYSTEM TIME with_max_staleness('') +// +// so any replica can serve the request and the read avoids a leaseholder +// hop. This is the single biggest read-latency win for trace-style workloads, +// which only read committed historical state and tolerate a few seconds of +// replication lag. A value of 0 selects strongly-consistent reads. 
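+//
+// For example, FollowerReadStaleness = 10 * time.Second lets any replica
+// serve reads that are at most ten seconds stale.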
+type CockroachConfig struct { + DSN string + MaxOpenConns int + MaxIdleConns int + ConnMaxLifetime time.Duration + FollowerReadStaleness time.Duration +} + +func (c *CockroachConfig) ApplyDefaults() { + if c.MaxOpenConns == 0 { + c.MaxOpenConns = 16 + } + if c.MaxIdleConns == 0 { + c.MaxIdleConns = c.MaxOpenConns + } + if c.ConnMaxLifetime == 0 { + c.ConnMaxLifetime = 30 * time.Minute + } +} + +func (c *CockroachConfig) Validate() error { + if strings.TrimSpace(c.DSN) == "" { + return fmt.Errorf("cockroach dsn is required") + } + if c.MaxOpenConns < 0 { + return fmt.Errorf("cockroach max open conns must be non-negative") + } + if c.MaxIdleConns < 0 { + return fmt.Errorf("cockroach max idle conns must be non-negative") + } + if c.FollowerReadStaleness < 0 { + return fmt.Errorf("follower read staleness must be non-negative") + } + return nil +} + +type cockroachReader struct { + db *sql.DB + staleness time.Duration +} + +var _ Reader = (*cockroachReader)(nil) + +// NewCockroachReader opens a pooled connection to CockroachDB for historical +// state reads. The caller is responsible for ensuring schema.sql has been +// applied to the cluster. +func NewCockroachReader(cfg CockroachConfig) (Reader, error) { + cfg.ApplyDefaults() + if err := cfg.Validate(); err != nil { + return nil, err + } + + db, err := sql.Open("postgres", cfg.DSN) + if err != nil { + return nil, fmt.Errorf("open cockroach: %w", err) + } + db.SetMaxOpenConns(cfg.MaxOpenConns) + db.SetMaxIdleConns(cfg.MaxIdleConns) + db.SetConnMaxLifetime(cfg.ConnMaxLifetime) + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := db.PingContext(ctx); err != nil { + _ = db.Close() + return nil, fmt.Errorf("ping cockroach: %w", err) + } + + return &cockroachReader{db: db, staleness: cfg.FollowerReadStaleness}, nil +} + +func (r *cockroachReader) Close() error { return r.db.Close() } + +func (r *cockroachReader) LastVersion(ctx context.Context) (int64, error) { + var v sql.NullInt64 + err := r.db.QueryRowContext(ctx, `SELECT max(version) FROM state_versions`).Scan(&v) + if err != nil { + return 0, fmt.Errorf("read last version: %w", err) + } + if !v.Valid { + return 0, nil + } + return v.Int64, nil +} + +func (r *cockroachReader) Get(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error) { + lkp := Lookup{StoreName: storeName, Key: string(key)} + res, err := r.BatchGet(ctx, targetVersion, []Lookup{lkp}) + if err != nil { + return Value{}, err + } + v, ok := res[lkp] + if !ok { + return Value{}, ErrNotFound + } + return v, nil +} + +// batchLookupSQL resolves many (store_name, key) pairs at one target version. +// +// Parameters: +// +// $1 STRING[] : store names, parallel to $2 +// $2 BYTES[] : keys, parallel to $1 +// $3 INT8 : target version (inclusive upper bound) +// +// The descending PK on state_mutations(store_name, key, version DESC) makes +// each LATERAL subquery a single index seek + LIMIT 1, so the planner runs +// one PK lookup per pair instead of scanning version history. Replaces what +// would otherwise be N round-trips with one. +const batchLookupSQL = ` +SELECT t.store_name, t.key, m.version, m.value, m.deleted +FROM unnest($1::STRING[], $2::BYTES[]) AS t(store_name, key), + LATERAL ( + SELECT version, value, deleted + FROM state_mutations + WHERE store_name = t.store_name + AND key = t.key + AND version <= $3 + ORDER BY version DESC + LIMIT 1 + ) m` + +// splitLookups peels parallel STRING[] / BYTES[] arrays out of the lookup +// slice. 
Pulled out so it can be unit-tested without a live database. +func splitLookups(lookups []Lookup) (stores []string, keys [][]byte) { + stores = make([]string, len(lookups)) + keys = make([][]byte, len(lookups)) + for i, l := range lookups { + stores[i] = l.StoreName + keys[i] = []byte(l.Key) + } + return stores, keys +} + +// aostStmt builds the per-transaction AS OF SYSTEM TIME clause used to enable +// follower reads. Cockroach's with_max_staleness accepts any interval string +// Postgres would, including Go's time.Duration.String() output. +func aostStmt(staleness time.Duration) string { + return fmt.Sprintf("SET TRANSACTION AS OF SYSTEM TIME with_max_staleness('%s')", staleness) +} + +// withReadTx runs fn inside a read-only transaction. When staleness > 0 the +// transaction is pinned to a bounded-stale timestamp so any replica can serve +// it without a leaseholder hop. +func (r *cockroachReader) withReadTx(ctx context.Context, fn func(*sql.Tx) error) error { + tx, err := r.db.BeginTx(ctx, &sql.TxOptions{ReadOnly: true}) + if err != nil { + return fmt.Errorf("begin read tx: %w", err) + } + if r.staleness > 0 { + if _, err := tx.ExecContext(ctx, aostStmt(r.staleness)); err != nil { + _ = tx.Rollback() + return fmt.Errorf("set follower read: %w", err) + } + } + if err := fn(tx); err != nil { + _ = tx.Rollback() + return err + } + return tx.Commit() +} + +func (r *cockroachReader) BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error) { + if len(lookups) == 0 { + return map[Lookup]Value{}, nil + } + stores, keys := splitLookups(lookups) + out := make(map[Lookup]Value, len(lookups)) + + err := r.withReadTx(ctx, func(tx *sql.Tx) error { + rows, err := tx.QueryContext(ctx, batchLookupSQL, pq.StringArray(stores), pq.ByteaArray(keys), targetVersion) + if err != nil { + return fmt.Errorf("batch lookup: %w", err) + } + defer rows.Close() + for rows.Next() { + var ( + storeName string + key []byte + version int64 + value []byte + deleted bool + ) + if err := rows.Scan(&storeName, &key, &version, &value, &deleted); err != nil { + return fmt.Errorf("scan batch row: %w", err) + } + if deleted { + // Tombstone: collapse to absence at the API boundary so callers + // don't need to special-case deleted rows. + continue + } + out[Lookup{StoreName: storeName, Key: string(key)}] = Value{ + Bytes: value, + Version: version, + } + } + return rows.Err() + }) + if err != nil { + return nil, err + } + return out, nil +} From 2cf0815d4fba0c576a340bb6088dec2fcfd232e3 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Mon, 27 Apr 2026 18:01:52 -0400 Subject: [PATCH 20/31] Test reader helpers and batch-lookup SQL shape Covers what can be exercised without a live CockroachDB: - splitLookups builds parallel STRING[]/BYTES[] arrays in lookup order. - aostStmt formats time.Duration into the with_max_staleness clause. - batchLookupSQL contains the fragments that make the per-pair PK-seek plan work (LATERAL, descending order, LIMIT 1, version filter). An edit that drops any of these would silently regress into a version- history scan, so pin them explicitly. - CockroachConfig validates required and non-negative fields and applies the documented defaults. 
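For orientation, a sketch of how a trace-style caller is expected to drive
the Reader (resolveAll is hypothetical; the clamp rule comes from the
LastVersion docs in reader.go):

    func resolveAll(ctx context.Context, r historical.Reader, target int64,
        lookups []historical.Lookup) (map[historical.Lookup]historical.Value, error) {
        last, err := r.LastVersion(ctx)
        if err != nil {
            return nil, err
        }
        if target > last {
            target = last // don't query versions still in flight on the Kafka side
        }
        // One round trip resolves every pair; a pair absent from the result
        // has no row at or before target, or its latest row is a tombstone.
        return r.BatchGet(ctx, target, lookups)
    }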
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/historical/cockroach_test.go | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 sei-db/state_db/ss/offload/historical/cockroach_test.go diff --git a/sei-db/state_db/ss/offload/historical/cockroach_test.go b/sei-db/state_db/ss/offload/historical/cockroach_test.go new file mode 100644 index 0000000000..96433a777f --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/cockroach_test.go @@ -0,0 +1,84 @@ +package historical + +import ( + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestSplitLookupsParallelArrays(t *testing.T) { + stores, keys := splitLookups([]Lookup{ + {StoreName: "evm", Key: "k1"}, + {StoreName: "bank", Key: "k2"}, + {StoreName: "evm", Key: ""}, + }) + require.Equal(t, []string{"evm", "bank", "evm"}, stores) + require.Equal(t, [][]byte{[]byte("k1"), []byte("k2"), {}}, keys) +} + +func TestSplitLookupsEmpty(t *testing.T) { + stores, keys := splitLookups(nil) + require.Empty(t, stores) + require.Empty(t, keys) +} + +func TestAostStmtFormatsDuration(t *testing.T) { + require.Equal(t, + "SET TRANSACTION AS OF SYSTEM TIME with_max_staleness('10s')", + aostStmt(10*time.Second)) + require.Equal(t, + "SET TRANSACTION AS OF SYSTEM TIME with_max_staleness('1m30s')", + aostStmt(90*time.Second)) +} + +// TestBatchLookupSQLShape pins the salient pieces of the batch query so an +// accidental edit that loses LATERAL, the descending order, or the LIMIT 1 +// (each of which is needed for the per-pair PK-seek plan) breaks loudly +// instead of silently regressing into a full version-history scan. +func TestBatchLookupSQLShape(t *testing.T) { + for _, frag := range []string{ + "unnest($1::STRING[], $2::BYTES[])", + "LATERAL", + "FROM state_mutations", + "version <= $3", + "ORDER BY version DESC", + "LIMIT 1", + } { + require.Containsf(t, batchLookupSQL, frag, + "batchLookupSQL missing required fragment %q", frag) + } +} + +func TestCockroachConfigValidate(t *testing.T) { + cases := []struct { + name string + cfg CockroachConfig + err string + }{ + {"missing dsn", CockroachConfig{}, "dsn"}, + {"blank dsn", CockroachConfig{DSN: " "}, "dsn"}, + {"negative open conns", CockroachConfig{DSN: "x", MaxOpenConns: -1}, "open"}, + {"negative idle conns", CockroachConfig{DSN: "x", MaxIdleConns: -1}, "idle"}, + {"negative staleness", CockroachConfig{DSN: "x", FollowerReadStaleness: -1}, "staleness"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := tc.cfg.Validate() + require.Error(t, err) + require.True(t, strings.Contains(err.Error(), tc.err), + "err %q should contain %q", err, tc.err) + }) + } +} + +func TestCockroachConfigApplyDefaults(t *testing.T) { + c := CockroachConfig{DSN: "x"} + c.ApplyDefaults() + require.Equal(t, 16, c.MaxOpenConns) + require.Equal(t, 16, c.MaxIdleConns) + require.Equal(t, 30*time.Minute, c.ConnMaxLifetime) + require.Equal(t, time.Duration(0), c.FollowerReadStaleness, + "staleness defaults to strongly-consistent reads; operators opt in") +} From 13e315481739acd97b49d3b6db762b5584f537b7 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 10:46:14 -0400 Subject: [PATCH 21/31] Add FallbackStateStore adapter Wraps a primary types.StateStore with a historical Reader so that Get/Has reads at versions older than the primary's earliest retained version are served from the offload store. Everything else (iterators, write methods, import, prune) passes through to the primary unchanged. 
Iterators deliberately stay on the primary: SQL iteration over the MVCC table is expressive but slow, and the trace profile is dominated by point Gets. Operators that need historical iterators on pruned versions can extend this adapter later without touching callers. This is the single integration point that lets a node serve trace replays across pruned heights without changing the StateStore consumers. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../state_db/ss/offload/historical/store.go | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 sei-db/state_db/ss/offload/historical/store.go diff --git a/sei-db/state_db/ss/offload/historical/store.go b/sei-db/state_db/ss/offload/historical/store.go new file mode 100644 index 0000000000..5ef7b5009b --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/store.go @@ -0,0 +1,117 @@ +package historical + +import ( + "context" + "errors" + + "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" + "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +// FallbackStateStore wraps a primary types.StateStore with a historical +// Reader. Get/Has reads at versions older than the primary's earliest +// version are routed to the Reader; everything else passes through +// unchanged. This is the integration point for using the offload-pipeline +// CockroachDB store as a read-fallback for pruned historical state. +// +// Iterators are NOT routed to the Reader: SQL iteration over an MVCC table +// is expressive but slow, and the trace profile is dominated by point Gets. +// A request for an iterator at a pruned version still falls back to the +// primary's behavior (typically empty results). +type FallbackStateStore struct { + primary types.StateStore + reader Reader +} + +var _ types.StateStore = (*FallbackStateStore)(nil) + +// NewFallbackStateStore wraps primary so that Get/Has at versions < +// primary.GetEarliestVersion() consult reader instead. The wrapper takes +// ownership of both the primary and the reader for the purposes of Close. +func NewFallbackStateStore(primary types.StateStore, reader Reader) *FallbackStateStore { + return &FallbackStateStore{primary: primary, reader: reader} +} + +// shouldFallback returns true when version is strictly older than the +// primary's earliest retained version. The primary returns (nil, nil) for +// such versions today, which is indistinguishable from "key never written"; +// using the version watermark gives us a deterministic split. 
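+//
+// For example, with earliest = 100: a read at version 99 routes to the
+// reader, while reads at 100 and above stay on the primary.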
+func (s *FallbackStateStore) shouldFallback(version int64) bool { + earliest := s.primary.GetEarliestVersion() + return earliest > 0 && version < earliest +} + +func (s *FallbackStateStore) Get(storeKey string, version int64, key []byte) ([]byte, error) { + if !s.shouldFallback(version) { + return s.primary.Get(storeKey, version, key) + } + v, err := s.reader.Get(context.Background(), storeKey, key, version) + if err != nil { + if errors.Is(err, ErrNotFound) { + return nil, nil + } + return nil, err + } + return v.Bytes, nil +} + +func (s *FallbackStateStore) Has(storeKey string, version int64, key []byte) (bool, error) { + if !s.shouldFallback(version) { + return s.primary.Has(storeKey, version, key) + } + _, err := s.reader.Get(context.Background(), storeKey, key, version) + if err != nil { + if errors.Is(err, ErrNotFound) { + return false, nil + } + return false, err + } + return true, nil +} + +func (s *FallbackStateStore) Iterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + return s.primary.Iterator(storeKey, version, start, end) +} + +func (s *FallbackStateStore) ReverseIterator(storeKey string, version int64, start, end []byte) (types.DBIterator, error) { + return s.primary.ReverseIterator(storeKey, version, start, end) +} + +func (s *FallbackStateStore) RawIterate(storeKey string, fn func([]byte, []byte, int64) bool) (bool, error) { + return s.primary.RawIterate(storeKey, fn) +} + +func (s *FallbackStateStore) GetLatestVersion() int64 { return s.primary.GetLatestVersion() } + +func (s *FallbackStateStore) SetLatestVersion(version int64) error { + return s.primary.SetLatestVersion(version) +} + +func (s *FallbackStateStore) GetEarliestVersion() int64 { return s.primary.GetEarliestVersion() } + +func (s *FallbackStateStore) SetEarliestVersion(version int64, ignoreVersion bool) error { + return s.primary.SetEarliestVersion(version, ignoreVersion) +} + +func (s *FallbackStateStore) ApplyChangesetSync(version int64, changesets []*proto.NamedChangeSet) error { + return s.primary.ApplyChangesetSync(version, changesets) +} + +func (s *FallbackStateStore) ApplyChangesetAsync(version int64, changesets []*proto.NamedChangeSet) error { + return s.primary.ApplyChangesetAsync(version, changesets) +} + +func (s *FallbackStateStore) Prune(version int64) error { return s.primary.Prune(version) } + +func (s *FallbackStateStore) Import(version int64, ch <-chan types.SnapshotNode) error { + return s.primary.Import(version, ch) +} + +func (s *FallbackStateStore) Close() error { + primaryErr := s.primary.Close() + readerErr := s.reader.Close() + if primaryErr != nil { + return primaryErr + } + return readerErr +} From 656c83e1b2bccec03cf1fe46ab135e6e31d5ee31 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 10:46:57 -0400 Subject: [PATCH 22/31] Test FallbackStateStore routing Covers the full routing matrix: - below earliest -> Reader (primary not consulted) - at or above earliest -> primary (Reader not consulted) - earliest=0 (fresh node) -> primary - Has mirrors Get's routing - non-not-found Reader errors propagate - Close closes both wrapped components - pass-through getters delegate Uses fakes for both types.StateStore and Reader so the tests stay focused on routing logic without standing up a real DB. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/historical/store_test.go | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 sei-db/state_db/ss/offload/historical/store_test.go diff --git a/sei-db/state_db/ss/offload/historical/store_test.go b/sei-db/state_db/ss/offload/historical/store_test.go new file mode 100644 index 0000000000..ee6be042a5 --- /dev/null +++ b/sei-db/state_db/ss/offload/historical/store_test.go @@ -0,0 +1,176 @@ +package historical + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" + "github.com/sei-protocol/sei-chain/sei-db/proto" +) + +// fakePrimary is a minimal types.StateStore implementation for routing tests. +// Only the calls FallbackStateStore actually makes are populated; the rest +// return zero values, which keeps the test file focused on routing logic. +type fakePrimary struct { + earliest int64 + latest int64 + gets map[string][]byte // storeKey|key -> value (nil means not present) + getCalls int + hasCalls int + closed bool +} + +func newFakePrimary(earliest, latest int64) *fakePrimary { + return &fakePrimary{earliest: earliest, latest: latest, gets: map[string][]byte{}} +} + +func k(storeKey string, key []byte) string { return storeKey + "|" + string(key) } + +func (f *fakePrimary) Get(storeKey string, _ int64, key []byte) ([]byte, error) { + f.getCalls++ + return f.gets[k(storeKey, key)], nil +} +func (f *fakePrimary) Has(storeKey string, _ int64, key []byte) (bool, error) { + f.hasCalls++ + return f.gets[k(storeKey, key)] != nil, nil +} +func (f *fakePrimary) Iterator(string, int64, []byte, []byte) (types.DBIterator, error) { + return nil, nil +} +func (f *fakePrimary) ReverseIterator(string, int64, []byte, []byte) (types.DBIterator, error) { + return nil, nil +} +func (f *fakePrimary) RawIterate(string, func([]byte, []byte, int64) bool) (bool, error) { + return false, nil +} +func (f *fakePrimary) GetLatestVersion() int64 { return f.latest } +func (f *fakePrimary) SetLatestVersion(int64) error { return nil } +func (f *fakePrimary) GetEarliestVersion() int64 { return f.earliest } +func (f *fakePrimary) SetEarliestVersion(int64, bool) error { return nil } +func (f *fakePrimary) ApplyChangesetSync(int64, []*proto.NamedChangeSet) error { return nil } +func (f *fakePrimary) ApplyChangesetAsync(int64, []*proto.NamedChangeSet) error { return nil } +func (f *fakePrimary) Prune(int64) error { return nil } +func (f *fakePrimary) Import(int64, <-chan types.SnapshotNode) error { return nil } +func (f *fakePrimary) Close() error { f.closed = true; return nil } + +// fakeReader implements Reader for routing tests. It records call counts so +// each test can assert that fallback (or non-fallback) actually happened. 
+type fakeReader struct { + values map[Lookup]Value + getCalls int + closeCall bool +} + +func newFakeReader() *fakeReader { return &fakeReader{values: map[Lookup]Value{}} } + +func (r *fakeReader) Get(_ context.Context, storeName string, key []byte, _ int64) (Value, error) { + r.getCalls++ + v, ok := r.values[Lookup{StoreName: storeName, Key: string(key)}] + if !ok { + return Value{}, ErrNotFound + } + return v, nil +} +func (r *fakeReader) BatchGet(context.Context, int64, []Lookup) (map[Lookup]Value, error) { + return nil, nil +} +func (r *fakeReader) LastVersion(context.Context) (int64, error) { return 0, nil } +func (r *fakeReader) Close() error { r.closeCall = true; return nil } + +func TestFallbackRoutesBelowEarliest(t *testing.T) { + p := newFakePrimary(100, 200) + r := newFakeReader() + r.values[Lookup{StoreName: "evm", Key: "k"}] = Value{Bytes: []byte("from-cockroach"), Version: 50} + s := NewFallbackStateStore(p, r) + + got, err := s.Get("evm", 50, []byte("k")) + require.NoError(t, err) + require.Equal(t, []byte("from-cockroach"), got) + require.Equal(t, 1, r.getCalls) + require.Equal(t, 0, p.getCalls, "primary should not be consulted below earliest") +} + +func TestFallbackUsesPrimaryAtOrAboveEarliest(t *testing.T) { + p := newFakePrimary(100, 200) + p.gets[k("evm", []byte("k"))] = []byte("from-primary") + r := newFakeReader() + s := NewFallbackStateStore(p, r) + + for _, version := range []int64{100, 150, 200} { + got, err := s.Get("evm", version, []byte("k")) + require.NoError(t, err) + require.Equal(t, []byte("from-primary"), got, "version=%d should hit primary", version) + } + require.Equal(t, 3, p.getCalls) + require.Equal(t, 0, r.getCalls, "reader should not be consulted at or above earliest") +} + +func TestFallbackUsesPrimaryWhenEarliestIsZero(t *testing.T) { + // earliest=0 means the primary has no data yet (or was never pruned). We + // shouldn't fan out to Cockroach in that case — the primary owns it. 
+ p := newFakePrimary(0, 0) + r := newFakeReader() + s := NewFallbackStateStore(p, r) + + _, err := s.Get("evm", 50, []byte("k")) + require.NoError(t, err) + require.Equal(t, 1, p.getCalls) + require.Equal(t, 0, r.getCalls) +} + +func TestFallbackHasMirrorsGetRouting(t *testing.T) { + p := newFakePrimary(100, 200) + r := newFakeReader() + r.values[Lookup{StoreName: "bank", Key: "addr"}] = Value{Bytes: []byte{1}, Version: 50} + s := NewFallbackStateStore(p, r) + + ok, err := s.Has("bank", 50, []byte("addr")) + require.NoError(t, err) + require.True(t, ok) + + ok, err = s.Has("bank", 50, []byte("missing")) + require.NoError(t, err) + require.False(t, ok) +} + +func TestFallbackPropagatesNonNotFoundReaderErrors(t *testing.T) { + p := newFakePrimary(100, 200) + r := &errReader{err: errors.New("boom")} + s := NewFallbackStateStore(p, r) + + _, err := s.Get("evm", 50, []byte("k")) + require.Error(t, err) + require.Contains(t, err.Error(), "boom") +} + +type errReader struct{ err error } + +func (e *errReader) Get(context.Context, string, []byte, int64) (Value, error) { + return Value{}, e.err +} +func (e *errReader) BatchGet(context.Context, int64, []Lookup) (map[Lookup]Value, error) { + return nil, e.err +} +func (e *errReader) LastVersion(context.Context) (int64, error) { return 0, e.err } +func (e *errReader) Close() error { return nil } + +func TestFallbackCloseClosesBoth(t *testing.T) { + p := newFakePrimary(0, 0) + r := newFakeReader() + s := NewFallbackStateStore(p, r) + + require.NoError(t, s.Close()) + require.True(t, p.closed) + require.True(t, r.closeCall) +} + +func TestFallbackPassthroughGettersDelegate(t *testing.T) { + p := newFakePrimary(123, 456) + s := NewFallbackStateStore(p, newFakeReader()) + + require.Equal(t, int64(123), s.GetEarliestVersion()) + require.Equal(t, int64(456), s.GetLatestVersion()) +} From 96ac9a4c7913605f294d6245ce02ea213a2ca273 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 10:48:57 -0400 Subject: [PATCH 23/31] Wire historical fallback into NewStateStore Adds a single HistoricalOffloadDSN config knob and one-line wrap in ss.NewStateStore: if the DSN is set, the composite store is wrapped with a historical.FallbackStateStore so reads of versions older than the primary's earliest retained version are served from the offload CockroachDB cluster. When the DSN is empty (default) the wrapper is skipped and behavior is identical to today. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/config/ss_config.go | 8 ++++++++ sei-db/config/toml.go | 6 ++++++ sei-db/state_db/ss/store.go | 23 ++++++++++++++++++++++- 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/sei-db/config/ss_config.go b/sei-db/config/ss_config.go index 3fe94e750d..17268332df 100644 --- a/sei-db/config/ss_config.go +++ b/sei-db/config/ss_config.go @@ -76,6 +76,14 @@ type StateStoreConfig struct { // When true, data is routed to separate DBs by EVM key family while // preserving the same logical store key and full key encoding inside each DB. SeparateEVMSubDBs bool `mapstructure:"evm-separate-dbs"` + + // HistoricalOffloadDSN, when non-empty, opens a CockroachDB historical + // Reader at startup and wraps the state store so reads of versions older + // than the primary's earliest retained version fall back to Cockroach. + // The DSN follows the standard libpq/pgx format. See + // sei-db/state_db/ss/offload/historical for the read path and + // sei-db/state_db/ss/offload/consumer for the write side. 
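+	// Example DSN: postgresql://user@host:26257/db?sslmode=verify-full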
+ HistoricalOffloadDSN string `mapstructure:"historical-offload-dsn"` } // DefaultStateStoreConfig returns the default StateStoreConfig diff --git a/sei-db/config/toml.go b/sei-db/config/toml.go index eb387cb1d5..aa30cb50a6 100644 --- a/sei-db/config/toml.go +++ b/sei-db/config/toml.go @@ -139,6 +139,12 @@ evm-ss-split = {{ .StateStore.EVMSplit }} # When false, all EVM data stays in one DB using the current unified layout. # When true, data is routed to separate DBs while preserving the same evm key prefix format. evm-ss-separate-dbs = {{ .StateStore.SeparateEVMSubDBs }} + +# HistoricalOffloadDSN, when set, points the state store at a CockroachDB +# cluster populated by the historical-offload consumer. Reads of versions +# older than the local SS's earliest retained version are served from +# CockroachDB instead of returning empty. Leave blank to disable. +historical-offload-dsn = "{{ .StateStore.HistoricalOffloadDSN }}" ` // ReceiptStoreConfigTemplate defines the configuration template for receipt-store diff --git a/sei-db/state_db/ss/store.go b/sei-db/state_db/ss/store.go index 0fb4b5184e..1876af0a95 100644 --- a/sei-db/state_db/ss/store.go +++ b/sei-db/state_db/ss/store.go @@ -1,15 +1,36 @@ package ss import ( + "fmt" + "github.com/sei-protocol/sei-chain/sei-db/config" "github.com/sei-protocol/sei-chain/sei-db/db_engine/types" "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/composite" + "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" ) // NewStateStore creates a CompositeStateStore which handles both Cosmos and EVM data. // The backend (pebbledb or rocksdb) is resolved at compile time via build-tag-gated // files in the backend package. When WriteMode/ReadMode are both cosmos_only (the default), // the EVM stores are not opened and the composite store behaves identically to a plain cosmos state store. +// +// If ssConfig.HistoricalOffloadDSN is set, the composite store is wrapped with +// a historical.FallbackStateStore so reads of pruned versions are served from +// the offload-pipeline CockroachDB cluster. func NewStateStore(homeDir string, ssConfig config.StateStoreConfig) (types.StateStore, error) { - return composite.NewCompositeStateStore(ssConfig, homeDir) + cs, err := composite.NewCompositeStateStore(ssConfig, homeDir) + if err != nil { + return nil, err + } + if ssConfig.HistoricalOffloadDSN == "" { + return cs, nil + } + reader, err := historical.NewCockroachReader(historical.CockroachConfig{ + DSN: ssConfig.HistoricalOffloadDSN, + }) + if err != nil { + _ = cs.Close() + return nil, fmt.Errorf("open historical offload reader: %w", err) + } + return historical.NewFallbackStateStore(cs, reader), nil } From 5a2ff06f3bc90279b6a9f9bb7e217265de4b44b7 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 10:53:57 -0400 Subject: [PATCH 24/31] Parallelize consumer writes across partitions The consumer loop was single-threaded: one Kafka fetch, one Cockroach write, repeat. On a fast chain with multi-partition topics, the per-tx COMMIT round-trip caps ingest throughput on a single goroutine. Add a per-partition worker pool. The fetch loop dispatches each message to shards[partition % workers]; each shard has one worker that writes and commits in arrival order. Per-partition ordering is preserved (a partition always lands on the same worker); cross-partition writes parallelize. Set Workers via Config.Workers / WORKERS env var; default 1 keeps existing behavior. 
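For example (a toy illustration, not code from this patch), an 8-partition
topic with WORKERS=4:

    for p := 0; p < 8; p++ {
        fmt.Printf("partition %d -> worker %d\n", p, p%4) // shardFor(p, 4)
    }

pins partitions {0,4} to worker 0, {1,5} to worker 1, {2,6} to worker 2,
and {3,7} to worker 3: each partition's writes stay serialized on one
worker while distinct partitions proceed in parallel.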
Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/consumer/README.md | 4 +- .../cmd/historical-offload-consumer/main.go | 3 +- sei-db/state_db/ss/offload/consumer/config.go | 2 + .../state_db/ss/offload/consumer/consumer.go | 167 +++++++++++++----- .../ss/offload/consumer/consumer_test.go | 61 +++++++ sei-db/state_db/ss/offload/consumer/deploy.sh | 3 + 6 files changed, 198 insertions(+), 42 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/README.md b/sei-db/state_db/ss/offload/consumer/README.md index 52354f3645..e5c39955d1 100644 --- a/sei-db/state_db/ss/offload/consumer/README.md +++ b/sei-db/state_db/ss/offload/consumer/README.md @@ -32,7 +32,9 @@ RUN=1 ./deploy.sh ## Guarantees - At-least-once delivery. Sink UPSERTs on `(store_name, key, version)` so replay is a no-op. -- Per-partition ordering preserved (single-threaded loop per reader). +- Per-partition ordering preserved. With `WORKERS>1` (recommended for fast + chains) messages are sharded by partition so each partition's writes still + flow through a single worker; cross-partition writes parallelize. - Offsets commit only after the sink persists the entry. - Sink writes use bounded exponential backoff (5 attempts, 1s→30s) before giving up. On give-up the process exits non-zero so the supervisor restarts; diff --git a/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go b/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go index 40294f1263..365a2a24a9 100644 --- a/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go +++ b/sei-db/state_db/ss/offload/consumer/cmd/historical-offload-consumer/main.go @@ -38,7 +38,8 @@ func main() { defer cancel() c := consumer.New(reader, sink, consumer.Options{ - Logf: func(format string, args ...interface{}) { log.Printf(format, args...) }, + Logf: func(format string, args ...interface{}) { log.Printf(format, args...) }, + Workers: cfg.Workers, }) if err := c.Run(ctx); err != nil { log.Fatalf("consumer: %v", err) diff --git a/sei-db/state_db/ss/offload/consumer/config.go b/sei-db/state_db/ss/offload/consumer/config.go index ac8c84bfe2..2f23b0f471 100644 --- a/sei-db/state_db/ss/offload/consumer/config.go +++ b/sei-db/state_db/ss/offload/consumer/config.go @@ -10,6 +10,8 @@ import ( type Config struct { Kafka KafkaReaderConfig Cockroach CockroachConfig + // Workers sets per-partition write parallelism. 0 or 1 means serial. + Workers int } func (c *Config) Validate() error { diff --git a/sei-db/state_db/ss/offload/consumer/consumer.go b/sei-db/state_db/ss/offload/consumer/consumer.go index cd3e60c88e..19b7ff9c70 100644 --- a/sei-db/state_db/ss/offload/consumer/consumer.go +++ b/sei-db/state_db/ss/offload/consumer/consumer.go @@ -7,45 +7,48 @@ import ( "time" "github.com/segmentio/kafka-go" + "golang.org/x/sync/errgroup" ) -// MessageSource is the subset of *kafka.Reader the consumer uses. Extracting -// it lets tests drive the loop with a fake without a running Kafka. +// MessageSource is the subset of *kafka.Reader the consumer uses; the +// indirection lets tests drive the loop with a fake. type MessageSource interface { FetchMessage(ctx context.Context) (kafka.Message, error) CommitMessages(ctx context.Context, msgs ...kafka.Message) error } -// Consumer pulls messages from a MessageSource, decodes them, writes to a Sink, -// and commits offsets. 
It is single-threaded by design: ordering per partition -// is required so the CockroachDB primary key (store_name, key, version DESC) -// reflects producer order. +// Consumer fans messages out to per-partition workers so cross-partition +// writes parallelize while ordering within a partition is preserved. type Consumer struct { reader MessageSource sink Sink logf func(format string, args ...interface{}) + workers int + shardBuf int maxAttempts int baseBackoff time.Duration maxBackoff time.Duration } -// Default retry knobs for sink writes. Total wait at defaults ≈ 1+2+4+8 = 15s -// before giving up and letting the process supervisor restart us. const ( defaultSinkMaxAttempts = 5 defaultSinkBaseBackoff = 1 * time.Second defaultSinkMaxBackoff = 30 * time.Second + defaultWorkers = 1 + defaultShardBuffer = 32 ) -// Options are optional hooks for the consumer loop. Zero values pick defaults. +// Options configures the consumer loop. Zero values pick defaults. type Options struct { - Logf func(format string, args ...interface{}) - // SinkMaxAttempts caps total sink.Write attempts per message (>=1). + Logf func(format string, args ...interface{}) SinkMaxAttempts int - // SinkBaseBackoff is the initial backoff between retries; doubles each retry. SinkBaseBackoff time.Duration - // SinkMaxBackoff caps the per-retry backoff. - SinkMaxBackoff time.Duration + SinkMaxBackoff time.Duration + // Workers sets per-partition write parallelism. Messages are sharded by + // partition so a partition's writes stay ordered. Default 1 (serial). + Workers int + // ShardBufferSize bounds in-flight messages per worker. Default 32. + ShardBufferSize int } func New(reader MessageSource, sink Sink, opts Options) *Consumer { @@ -65,59 +68,132 @@ func New(reader MessageSource, sink Sink, opts Options) *Consumer { if maxBackoff <= 0 { maxBackoff = defaultSinkMaxBackoff } + workers := opts.Workers + if workers <= 0 { + workers = defaultWorkers + } + shardBuf := opts.ShardBufferSize + if shardBuf <= 0 { + shardBuf = defaultShardBuffer + } return &Consumer{ reader: reader, sink: sink, logf: logf, + workers: workers, + shardBuf: shardBuf, maxAttempts: maxAttempts, baseBackoff: base, maxBackoff: maxBackoff, } } -// Run blocks until ctx is cancelled or an unrecoverable error occurs. -// It commits offsets only after the sink has persisted each message, so -// at-least-once delivery is preserved across restarts. +// Run blocks until ctx is cancelled or an unrecoverable error occurs. Offsets +// commit only after the sink persists each message (at-least-once delivery). 
func (c *Consumer) Run(ctx context.Context) error { + if c.workers == 1 { + return c.runSerial(ctx) + } + return c.runParallel(ctx) +} + +func (c *Consumer) runSerial(ctx context.Context) error { for { msg, err := c.reader.FetchMessage(ctx) if err != nil { - if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + if isCancellation(err) { return nil } return fmt.Errorf("fetch kafka message: %w", err) } - - entry, err := DecodeEntry(msg.Value) - if err != nil { - return fmt.Errorf("decode message at offset %d: %w", msg.Offset, err) + if err := c.processMessage(ctx, msg); err != nil { + if isCancellation(err) { + return nil + } + return err } + } +} - rec := Record{ - Topic: msg.Topic, - Partition: msg.Partition, - Offset: msg.Offset, - Entry: entry, - } - start := time.Now() - if err := c.writeWithRetry(ctx, rec); err != nil { - if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { +func (c *Consumer) runParallel(ctx context.Context) error { + g, gctx := errgroup.WithContext(ctx) + shards := make([]chan kafka.Message, c.workers) + for i := range shards { + shards[i] = make(chan kafka.Message, c.shardBuf) + ch := shards[i] + g.Go(func() error { return c.workerLoop(gctx, ch) }) + } + g.Go(func() error { + defer func() { + for _, ch := range shards { + close(ch) + } + }() + for { + msg, err := c.reader.FetchMessage(gctx) + if err != nil { + if isCancellation(err) { + return nil + } + return fmt.Errorf("fetch kafka message: %w", err) + } + shard := shardFor(msg.Partition, c.workers) + select { + case shards[shard] <- msg: + case <-gctx.Done(): return nil } - return fmt.Errorf("sink write version %d: %w", entry.Version, err) } - c.logf("wrote version=%d partition=%d offset=%d in %s", - entry.Version, msg.Partition, msg.Offset, time.Since(start)) + }) + if err := g.Wait(); err != nil && !isCancellation(err) { + return err + } + return nil +} - if err := c.reader.CommitMessages(ctx, msg); err != nil { - return fmt.Errorf("commit kafka offset %d: %w", msg.Offset, err) +func (c *Consumer) workerLoop(ctx context.Context, ch <-chan kafka.Message) error { + for { + select { + case <-ctx.Done(): + return nil + case msg, ok := <-ch: + if !ok { + return nil + } + if err := c.processMessage(ctx, msg); err != nil { + if isCancellation(err) { + return nil + } + return err + } } } } -// writeWithRetry calls sink.Write with bounded exponential backoff. It returns -// the underlying error after the final attempt, or ctx.Err() if cancelled -// while sleeping between retries. +func (c *Consumer) processMessage(ctx context.Context, msg kafka.Message) error { + entry, err := DecodeEntry(msg.Value) + if err != nil { + return fmt.Errorf("decode message at offset %d: %w", msg.Offset, err) + } + rec := Record{ + Topic: msg.Topic, + Partition: msg.Partition, + Offset: msg.Offset, + Entry: entry, + } + start := time.Now() + if err := c.writeWithRetry(ctx, rec); err != nil { + return fmt.Errorf("sink write version %d: %w", entry.Version, err) + } + c.logf("wrote version=%d partition=%d offset=%d in %s", + entry.Version, msg.Partition, msg.Offset, time.Since(start)) + if err := c.reader.CommitMessages(ctx, msg); err != nil { + return fmt.Errorf("commit kafka offset %d: %w", msg.Offset, err) + } + return nil +} + +// writeWithRetry calls sink.Write with bounded exponential backoff. 
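+// At the defaults (5 attempts, 1s base doubling to a 30s cap) the sleeps
+// are 1s+2s+4s+8s, roughly 15s, before giving up.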
func (c *Consumer) writeWithRetry(ctx context.Context, rec Record) error { backoff := c.baseBackoff var lastErr error @@ -127,7 +203,7 @@ func (c *Consumer) writeWithRetry(ctx context.Context, rec Record) error { return nil } lastErr = err - if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + if isCancellation(err) { return err } if attempt == c.maxAttempts { @@ -147,3 +223,14 @@ func (c *Consumer) writeWithRetry(ctx context.Context, rec Record) error { } return fmt.Errorf("sink write failed after %d attempts: %w", c.maxAttempts, lastErr) } + +func shardFor(partition, workers int) int { + if partition < 0 { + partition = -partition + } + return partition % workers +} + +func isCancellation(err error) bool { + return errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) +} diff --git a/sei-db/state_db/ss/offload/consumer/consumer_test.go b/sei-db/state_db/ss/offload/consumer/consumer_test.go index 349349e3e3..c37864df17 100644 --- a/sei-db/state_db/ss/offload/consumer/consumer_test.go +++ b/sei-db/state_db/ss/offload/consumer/consumer_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "sync" + "sync/atomic" "testing" "time" @@ -170,3 +171,63 @@ func TestConsumerRunCancelReturnsNil(t *testing.T) { err := New(src, &recordingSink{}, Options{}).Run(context.Background()) require.NoError(t, err) } + +// concurrentSink locks per call so the test can assert the consumer +// actually fans calls out across goroutines (>1 in flight at a time). +type concurrentSink struct { + mu sync.Mutex + records []Record + maxInFlight int32 + inFlight int32 + delay time.Duration +} + +func (s *concurrentSink) Write(_ context.Context, rec Record) error { + cur := atomic.AddInt32(&s.inFlight, 1) + defer atomic.AddInt32(&s.inFlight, -1) + for { + prev := atomic.LoadInt32(&s.maxInFlight) + if cur <= prev || atomic.CompareAndSwapInt32(&s.maxInFlight, prev, cur) { + break + } + } + if s.delay > 0 { + time.Sleep(s.delay) + } + s.mu.Lock() + s.records = append(s.records, rec) + s.mu.Unlock() + return nil +} +func (s *concurrentSink) LastVersion(context.Context) (int64, error) { return 0, nil } +func (s *concurrentSink) Close() error { return nil } + +func TestConsumerParallelFansOutAcrossPartitions(t *testing.T) { + const nPartitions = 4 + msgs := make([]kafka.Message, 0, nPartitions) + for p := 0; p < nPartitions; p++ { + msgs = append(msgs, kafka.Message{ + Topic: "t", Partition: p, Offset: int64(p), + Value: marshalEntry(t, int64(p+1)), + }) + } + src := &fakeSource{msgs: msgs} + sink := &concurrentSink{delay: 25 * time.Millisecond} + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + c := New(src, sink, Options{Workers: nPartitions}) + require.NoError(t, c.Run(ctx)) + + require.Len(t, sink.records, nPartitions) + require.Greater(t, atomic.LoadInt32(&sink.maxInFlight), int32(1), + "with Workers=%d the sink should see >1 concurrent writes", nPartitions) +} + +func TestShardForStablePerPartition(t *testing.T) { + // Same partition always lands on the same worker (preserves order); two + // different partitions don't necessarily collide. 
+ require.Equal(t, shardFor(7, 4), shardFor(7, 4)) + require.NotEqual(t, shardFor(0, 4), shardFor(1, 4)) + require.GreaterOrEqual(t, shardFor(-3, 4), 0, "negative partition shouldn't go negative") +} diff --git a/sei-db/state_db/ss/offload/consumer/deploy.sh b/sei-db/state_db/ss/offload/consumer/deploy.sh index 96a397ef33..1f86a0471b 100755 --- a/sei-db/state_db/ss/offload/consumer/deploy.sh +++ b/sei-db/state_db/ss/offload/consumer/deploy.sh @@ -15,6 +15,7 @@ # KAFKA_SASL_MECHANISM default aws-msk-iam ("" or "none" disables) # KAFKA_START_OFFSET default first (first|last) # COCKROACH_MAX_CONNS default 16 +# WORKERS default 1 (per-partition parallelism) # CONFIG_OUT default ./historical-offload-consumer.json # BIN_OUT default ./bin/historical-offload-consumer # SKIP_SCHEMA=1 skip applying schema.sql @@ -36,6 +37,7 @@ KAFKA_TLS_ENABLED="${KAFKA_TLS_ENABLED:-true}" KAFKA_SASL_MECHANISM="${KAFKA_SASL_MECHANISM:-aws-msk-iam}" KAFKA_START_OFFSET="${KAFKA_START_OFFSET:-first}" COCKROACH_MAX_CONNS="${COCKROACH_MAX_CONNS:-16}" +WORKERS="${WORKERS:-1}" CONFIG_OUT="${CONFIG_OUT:-./historical-offload-consumer.json}" BIN_OUT="${BIN_OUT:-./bin/historical-offload-consumer}" @@ -78,6 +80,7 @@ cfg = { "DSN": os.environ["COCKROACH_DSN"], "MaxOpenConns": int(os.environ["COCKROACH_MAX_CONNS"]), }, + "Workers": int(os.environ["WORKERS"]), } with open(sys.argv[1], "w") as f: json.dump(cfg, f, indent=2) From 9a128a68018ab4e8a8ec2f80aaab5131fcee6213 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 10:54:17 -0400 Subject: [PATCH 25/31] Index state_mutations by (store_name, version DESC) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PK is (store_name, key, version DESC) so point lookups at a target version are one seek, but "all mutations in store S between versions A and B" — used by block-snapshot and per-store iterator paths — has to scan every key in the store and filter by version. Adding (store_name, version DESC) makes that a tight range scan. Hash-sharded with the same bucket_count (16) as the by-version index because the leading edge of each store's monotonic version range would otherwise concentrate writes on one replica per store. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/consumer/schema/schema.sql | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sei-db/state_db/ss/offload/consumer/schema/schema.sql b/sei-db/state_db/ss/offload/consumer/schema/schema.sql index 8644c885c9..3d4ab3c260 100644 --- a/sei-db/state_db/ss/offload/consumer/schema/schema.sql +++ b/sei-db/state_db/ss/offload/consumer/schema/schema.sql @@ -26,6 +26,14 @@ CREATE TABLE IF NOT EXISTS state_mutations ( CREATE INDEX IF NOT EXISTS state_mutations_by_version_idx ON state_mutations (version) USING HASH WITH (bucket_count = 16); +-- Backs "what changed in store S between versions A and B" reads (block +-- snapshots, per-store iterators) which the PK can't serve efficiently +-- because it leads with key. Hash-sharded so the leading edge of each +-- store's monotonic version range doesn't hotspot a single replica. 
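+-- e.g.: SELECT key, value, deleted FROM state_mutations
+--       WHERE store_name = $1 AND version >= $2 AND version <= $3;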
+CREATE INDEX IF NOT EXISTS state_mutations_by_store_version_idx + ON state_mutations (store_name, version DESC) + USING HASH WITH (bucket_count = 16); + CREATE TABLE IF NOT EXISTS state_tree_upgrades ( version INT8 NOT NULL, name STRING NOT NULL, From 6697bba125afb29104114ca6513bfd3b4aa106cb Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 10:56:52 -0400 Subject: [PATCH 26/31] Trim comments to the WHY; drop narrating prose Sweep the offload/historical packages and the SS factory: keep comments that explain a non-obvious WHY (hash-shard rationale, what the LATERAL pattern buys, follower-read trade-off), drop everything else. Default to no comment when the code is self-explanatory. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/config/ss_config.go | 8 +--- sei-db/config/toml.go | 6 +-- .../ss/offload/consumer/schema/schema.sql | 13 ++---- .../ss/offload/historical/cockroach.go | 41 +++---------------- .../state_db/ss/offload/historical/reader.go | 36 ++++------------ .../state_db/ss/offload/historical/store.go | 20 ++------- sei-db/state_db/ss/store.go | 10 +---- 7 files changed, 27 insertions(+), 107 deletions(-) diff --git a/sei-db/config/ss_config.go b/sei-db/config/ss_config.go index 17268332df..6987777b84 100644 --- a/sei-db/config/ss_config.go +++ b/sei-db/config/ss_config.go @@ -77,12 +77,8 @@ type StateStoreConfig struct { // preserving the same logical store key and full key encoding inside each DB. SeparateEVMSubDBs bool `mapstructure:"evm-separate-dbs"` - // HistoricalOffloadDSN, when non-empty, opens a CockroachDB historical - // Reader at startup and wraps the state store so reads of versions older - // than the primary's earliest retained version fall back to Cockroach. - // The DSN follows the standard libpq/pgx format. See - // sei-db/state_db/ss/offload/historical for the read path and - // sei-db/state_db/ss/offload/consumer for the write side. + // HistoricalOffloadDSN, when set, wraps the SS so reads below the + // primary's earliest version fall back to the Cockroach offload store. HistoricalOffloadDSN string `mapstructure:"historical-offload-dsn"` } diff --git a/sei-db/config/toml.go b/sei-db/config/toml.go index aa30cb50a6..1057e22d06 100644 --- a/sei-db/config/toml.go +++ b/sei-db/config/toml.go @@ -140,10 +140,8 @@ evm-ss-split = {{ .StateStore.EVMSplit }} # When true, data is routed to separate DBs while preserving the same evm key prefix format. evm-ss-separate-dbs = {{ .StateStore.SeparateEVMSubDBs }} -# HistoricalOffloadDSN, when set, points the state store at a CockroachDB -# cluster populated by the historical-offload consumer. Reads of versions -# older than the local SS's earliest retained version are served from -# CockroachDB instead of returning empty. Leave blank to disable. +# When set, reads below the local SS's earliest version fall back to the +# CockroachDB offload store populated by the historical-offload consumer. historical-offload-dsn = "{{ .StateStore.HistoricalOffloadDSN }}" ` diff --git a/sei-db/state_db/ss/offload/consumer/schema/schema.sql b/sei-db/state_db/ss/offload/consumer/schema/schema.sql index 3d4ab3c260..c46335cf86 100644 --- a/sei-db/state_db/ss/offload/consumer/schema/schema.sql +++ b/sei-db/state_db/ss/offload/consumer/schema/schema.sql @@ -1,11 +1,8 @@ -- CockroachDB schema for the historical offload consumer. -- Applied once per cluster before starting the consumer. 
--- state_versions and the by-version index on state_mutations both have a --- strictly-increasing key (block height), which without sharding turns the --- head of the keyspace into a single-range write hotspot. Hash-sharding --- spreads writes across 16 ranges. Range scans on `version` still work, with --- a small per-range fan-out cost that is irrelevant for our query mix. +-- Hash-sharded PK: version is monotonic (block height) and would otherwise +-- pin every write to one range at the head of the keyspace. CREATE TABLE IF NOT EXISTS state_versions ( version INT8 NOT NULL, kafka_topic STRING NOT NULL, @@ -26,10 +23,8 @@ CREATE TABLE IF NOT EXISTS state_mutations ( CREATE INDEX IF NOT EXISTS state_mutations_by_version_idx ON state_mutations (version) USING HASH WITH (bucket_count = 16); --- Backs "what changed in store S between versions A and B" reads (block --- snapshots, per-store iterators) which the PK can't serve efficiently --- because it leads with key. Hash-sharded so the leading edge of each --- store's monotonic version range doesn't hotspot a single replica. +-- Backs per-store version-range scans the PK can't serve (it leads with key); +-- hash-shard avoids a per-store hotspot on the monotonic version edge. CREATE INDEX IF NOT EXISTS state_mutations_by_store_version_idx ON state_mutations (store_name, version DESC) USING HASH WITH (bucket_count = 16); diff --git a/sei-db/state_db/ss/offload/historical/cockroach.go b/sei-db/state_db/ss/offload/historical/cockroach.go index 4294514a4c..dd7df676dd 100644 --- a/sei-db/state_db/ss/offload/historical/cockroach.go +++ b/sei-db/state_db/ss/offload/historical/cockroach.go @@ -10,17 +10,8 @@ import ( "github.com/lib/pq" ) -// CockroachConfig configures the historical-state Reader. DSN follows the -// standard libpq/pgx format (e.g. postgresql://user@host:26257/db?sslmode=verify-full). -// -// FollowerReadStaleness, when non-zero, switches reads to -// -// AS OF SYSTEM TIME with_max_staleness('') -// -// so any replica can serve the request and the read avoids a leaseholder -// hop. This is the single biggest read-latency win for trace-style workloads, -// which only read committed historical state and tolerate a few seconds of -// replication lag. A value of 0 selects strongly-consistent reads. +// FollowerReadStaleness>0 switches reads to AS OF SYSTEM TIME so any replica +// can serve them; 0 means strongly-consistent reads. type CockroachConfig struct { DSN string MaxOpenConns int @@ -64,9 +55,7 @@ type cockroachReader struct { var _ Reader = (*cockroachReader)(nil) -// NewCockroachReader opens a pooled connection to CockroachDB for historical -// state reads. The caller is responsible for ensuring schema.sql has been -// applied to the cluster. +// NewCockroachReader assumes schema.sql has already been applied. func NewCockroachReader(cfg CockroachConfig) (Reader, error) { cfg.ApplyDefaults() if err := cfg.Validate(); err != nil { @@ -118,18 +107,8 @@ func (r *cockroachReader) Get(ctx context.Context, storeName string, key []byte, return v, nil } -// batchLookupSQL resolves many (store_name, key) pairs at one target version. -// -// Parameters: -// -// $1 STRING[] : store names, parallel to $2 -// $2 BYTES[] : keys, parallel to $1 -// $3 INT8 : target version (inclusive upper bound) -// -// The descending PK on state_mutations(store_name, key, version DESC) makes -// each LATERAL subquery a single index seek + LIMIT 1, so the planner runs -// one PK lookup per pair instead of scanning version history. 
-// Replaces what would otherwise be N round-trips with one.
+// LATERAL + LIMIT 1 against the descending PK turns each (store, key) pair
+// into a single index seek; $1=stores, $2=keys (parallel arrays), $3=version.
 const batchLookupSQL = `
 SELECT t.store_name, t.key, m.version, m.value, m.deleted
 FROM unnest($1::STRING[], $2::BYTES[]) AS t(store_name, key),
@@ -143,8 +122,6 @@ FROM unnest($1::STRING[], $2::BYTES[]) AS t(store_name, key),
 LIMIT 1
 ) m`
 
-// splitLookups peels parallel STRING[] / BYTES[] arrays out of the lookup
-// slice. Pulled out so it can be unit-tested without a live database.
 func splitLookups(lookups []Lookup) (stores []string, keys [][]byte) {
 	stores = make([]string, len(lookups))
 	keys = make([][]byte, len(lookups))
@@ -155,16 +132,10 @@ func splitLookups(lookups []Lookup) (stores []string, keys [][]byte) {
 	return stores, keys
 }
 
-// aostStmt builds the per-transaction AS OF SYSTEM TIME clause used to enable
-// follower reads. Cockroach's with_max_staleness accepts any interval string
-// Postgres would, including Go's time.Duration.String() output.
 func aostStmt(staleness time.Duration) string {
 	return fmt.Sprintf("SET TRANSACTION AS OF SYSTEM TIME with_max_staleness('%s')", staleness)
 }
 
-// withReadTx runs fn inside a read-only transaction. When staleness > 0 the
-// transaction is pinned to a bounded-stale timestamp so any replica can serve
-// it without a leaseholder hop.
 func (r *cockroachReader) withReadTx(ctx context.Context, fn func(*sql.Tx) error) error {
 	tx, err := r.db.BeginTx(ctx, &sql.TxOptions{ReadOnly: true})
 	if err != nil {
@@ -208,8 +179,6 @@ func (r *cockroachReader) BatchGet(ctx context.Context, targetVersion int64, loo
 			return fmt.Errorf("scan batch row: %w", err)
 		}
 		if deleted {
-			// Tombstone: collapse to absence at the API boundary so callers
-			// don't need to special-case deleted rows.
 			continue
 		}
 		out[Lookup{StoreName: storeName, Key: string(key)}] = Value{
diff --git a/sei-db/state_db/ss/offload/historical/reader.go b/sei-db/state_db/ss/offload/historical/reader.go
index 63b65ceede..b6fd300159 100644
--- a/sei-db/state_db/ss/offload/historical/reader.go
+++ b/sei-db/state_db/ss/offload/historical/reader.go
@@ -1,7 +1,5 @@
-// Package historical serves point-in-time historical state queries from the
-// CockroachDB-backed offload store written by the consumer package. It is
-// intended for read-heavy workloads such as debug_traceTransaction, which
-// resolves thousands of (store, key, version) tuples per request.
+// Package historical reads historical state from the CockroachDB store
+// written by the offload consumer; tuned for trace-style workloads.
 package historical
 
 import (
@@ -9,46 +7,30 @@ import (
 	"errors"
 )
 
-// ErrNotFound is returned by Get when no row exists for (storeName, key) at
-// or before the target version, or when the latest such row is a tombstone.
 var ErrNotFound = errors.New("historical state not found")
 
-// Lookup names a (store, key) pair the caller wants resolved at a target
-// version. Key uses string for byte content so Lookup can be a map key — the
-// usual []byte-as-string idiom is fine because the bytes are immutable.
+// Lookup uses string for Key so it can be a map key (the []byte-as-string idiom).
 type Lookup struct {
 	StoreName string
 	Key string
 }
 
-// Value is the resolved row at or before the requested target version.
-// Version reports the actual MVCC version that satisfied the lookup, which
-// may be older than the requested target.
-// Tombstones are not surfaced via Value: the reader collapses them to
-// absence at the API boundary.
+// Value.Version is the actual MVCC version that satisfied the lookup,
+// which may be older than the requested target.
 type Value struct {
 	Bytes []byte
 	Version int64
 }
 
-// Reader serves historical state queries from the offload store. Reads are
-// snapshot-consistent and may be served by follower replicas when configured
-// (see CockroachConfig.FollowerReadStaleness).
 type Reader interface {
-	// Get returns the row at the latest version <= targetVersion. Returns
-	// ErrNotFound if no such row exists or if the latest row is a tombstone.
+	// Get returns ErrNotFound if no row exists at or before targetVersion,
+	// or if the latest such row is a tombstone.
 	Get(ctx context.Context, storeName string, key []byte, targetVersion int64) (Value, error)
 
-	// BatchGet resolves many (store, key) pairs at the same target version in
-	// a single round-trip. Pairs that don't exist at or before the target
-	// version, or whose latest row is a tombstone, are absent from the
-	// returned map. This is the primary API for trace-style workloads where
-	// per-key round-trips would dominate latency.
+	// BatchGet resolves many (store, key) pairs in one round-trip. Missing
+	// or tombstoned pairs are absent from the returned map.
 	BatchGet(ctx context.Context, targetVersion int64, lookups []Lookup) (map[Lookup]Value, error)
 
-	// LastVersion returns the largest version successfully ingested by the
-	// offload consumer. Trace clients should clamp targetVersion to this so
-	// they don't query versions still in flight on the Kafka side.
 	LastVersion(ctx context.Context) (int64, error)
-
 	Close() error
 }
diff --git a/sei-db/state_db/ss/offload/historical/store.go b/sei-db/state_db/ss/offload/historical/store.go
index 5ef7b5009b..a50d0a2fa4 100644
--- a/sei-db/state_db/ss/offload/historical/store.go
+++ b/sei-db/state_db/ss/offload/historical/store.go
@@ -8,16 +8,8 @@ import (
 	"github.com/sei-protocol/sei-chain/sei-db/proto"
 )
 
-// FallbackStateStore wraps a primary types.StateStore with a historical
-// Reader. Get/Has reads at versions older than the primary's earliest
-// version are routed to the Reader; everything else passes through
-// unchanged. This is the integration point for using the offload-pipeline
-// CockroachDB store as a read-fallback for pruned historical state.
-//
-// Iterators are NOT routed to the Reader: SQL iteration over an MVCC table
-// is expressive but slow, and the trace profile is dominated by point Gets.
-// A request for an iterator at a pruned version still falls back to the
-// primary's behavior (typically empty results).
+// FallbackStateStore routes Get/Has below the primary's earliest version to
+// the Reader; iterators and writes always go to the primary.
 type FallbackStateStore struct {
 	primary types.StateStore
 	reader Reader
@@ -25,17 +17,11 @@ type FallbackStateStore struct {
 var _ types.StateStore = (*FallbackStateStore)(nil)
 
-// NewFallbackStateStore wraps primary so that Get/Has at versions <
-// primary.GetEarliestVersion() consult reader instead. The wrapper takes
-// ownership of both the primary and the reader for the purposes of Close.
+// NewFallbackStateStore takes ownership of primary and reader for Close.
func NewFallbackStateStore(primary types.StateStore, reader Reader) *FallbackStateStore { return &FallbackStateStore{primary: primary, reader: reader} } -// shouldFallback returns true when version is strictly older than the -// primary's earliest retained version. The primary returns (nil, nil) for -// such versions today, which is indistinguishable from "key never written"; -// using the version watermark gives us a deterministic split. func (s *FallbackStateStore) shouldFallback(version int64) bool { earliest := s.primary.GetEarliestVersion() return earliest > 0 && version < earliest diff --git a/sei-db/state_db/ss/store.go b/sei-db/state_db/ss/store.go index 1876af0a95..fe03c38a7a 100644 --- a/sei-db/state_db/ss/store.go +++ b/sei-db/state_db/ss/store.go @@ -9,14 +9,8 @@ import ( "github.com/sei-protocol/sei-chain/sei-db/state_db/ss/offload/historical" ) -// NewStateStore creates a CompositeStateStore which handles both Cosmos and EVM data. -// The backend (pebbledb or rocksdb) is resolved at compile time via build-tag-gated -// files in the backend package. When WriteMode/ReadMode are both cosmos_only (the default), -// the EVM stores are not opened and the composite store behaves identically to a plain cosmos state store. -// -// If ssConfig.HistoricalOffloadDSN is set, the composite store is wrapped with -// a historical.FallbackStateStore so reads of pruned versions are served from -// the offload-pipeline CockroachDB cluster. +// NewStateStore opens the composite SS and, if HistoricalOffloadDSN is set, +// wraps it with a Cockroach-backed fallback for reads of pruned versions. func NewStateStore(homeDir string, ssConfig config.StateStoreConfig) (types.StateStore, error) { cs, err := composite.NewCompositeStateStore(ssConfig, homeDir) if err != nil { From d848b2fc70ed8a8dd7a08842b5382b36ba7b3398 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 14:36:57 -0400 Subject: [PATCH 27/31] Add state_latest + state_at_block tables state_latest pins the most recent value per (store, key) so "current state" reads are a single PK lookup. The sentinel-pointer pattern was net-negative on pebble (commit 42b707788 in the upstream debug-trace profile run, ~+32% avgTotal because of compaction lag); on Cockroach the write tax goes to a system designed to absorb it. state_at_block holds a dense rolling-window snapshot for hot stores, hash-sharded on block_version (32 buckets) to spread the leading-edge hot range. With this in place, trace at any block in the window can fetch its entire (store, key) read set in one BatchGet against state_at_block, one PK lookup per pair. Both tables are written by the consumer (separate commits) and are opt-in via config; existing deployments without the migration applied keep behaving as before. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/consumer/schema/schema.sql | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sei-db/state_db/ss/offload/consumer/schema/schema.sql b/sei-db/state_db/ss/offload/consumer/schema/schema.sql index c46335cf86..23314f32da 100644 --- a/sei-db/state_db/ss/offload/consumer/schema/schema.sql +++ b/sei-db/state_db/ss/offload/consumer/schema/schema.sql @@ -36,3 +36,25 @@ CREATE TABLE IF NOT EXISTS state_tree_upgrades ( delete BOOL NOT NULL DEFAULT false, PRIMARY KEY (version, name) ); + +-- One row per (store, key) pinned to the most recent value. Reads at "latest" +-- become a single PK lookup instead of a descending scan on state_mutations. 
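+--
+-- Illustrative point read this table buys (a sketch; the placeholders stand
+-- in for a concrete store and key):
+--   SELECT value, version, deleted FROM state_latest
+--   WHERE store_name = $1 AND key = $2;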
+CREATE TABLE IF NOT EXISTS state_latest ( + store_name STRING NOT NULL, + key BYTES NOT NULL, + value BYTES, + version INT8 NOT NULL, + deleted BOOL NOT NULL DEFAULT false, + PRIMARY KEY (store_name, key) +); + +-- Dense rolling-window snapshot for hot stores: every selected (store, key) +-- written at every block. Hash-shard on block_version to spread the head. +CREATE TABLE IF NOT EXISTS state_at_block ( + block_version INT8 NOT NULL, + store_name STRING NOT NULL, + key BYTES NOT NULL, + value BYTES, + deleted BOOL NOT NULL DEFAULT false, + PRIMARY KEY (block_version, store_name, key) USING HASH WITH (bucket_count = 32) +); From 68b0d4d9f92d4fa02196f55cc1092be7c1cd2325 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 14:49:36 -0400 Subject: [PATCH 28/31] UPSERT state_latest on every consumer write Adds CockroachConfig.EnableLatest. When set, the Write tx batch-UPSERTs state_latest after state_mutations. The ON CONFLICT clause has a guard (WHERE state_latest.version <= excluded.version) so out-of-order writes from parallel partition workers don't roll the row backwards. Schema migration is required (state_latest table from previous commit). Existing deployments default to EnableLatest=false and are unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../state_db/ss/offload/consumer/cockroach.go | 65 ++++++++++++++++++- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/cockroach.go b/sei-db/state_db/ss/offload/consumer/cockroach.go index 06ece5f737..482c226322 100644 --- a/sei-db/state_db/ss/offload/consumer/cockroach.go +++ b/sei-db/state_db/ss/offload/consumer/cockroach.go @@ -12,12 +12,15 @@ import ( // CockroachConfig configures the CockroachDB sink. DSN follows the standard // libpq/pgx format (e.g. postgresql://user@host:26257/db?sslmode=verify-full). -// Use DSN params for knobs like statement_timeout rather than adding fields. type CockroachConfig struct { DSN string MaxOpenConns int MaxIdleConns int ConnMaxLifetime time.Duration + + // EnableLatest UPSERTs into state_latest on every block so "current + // state" reads are a single PK lookup. Cheap; ~2x the write rate. + EnableLatest bool } func (c *CockroachConfig) ApplyDefaults() { @@ -46,7 +49,8 @@ func (c *CockroachConfig) Validate() error { } type cockroachSink struct { - db *sql.DB + db *sql.DB + enableLatest bool } var _ Sink = (*cockroachSink)(nil) @@ -74,7 +78,7 @@ func NewCockroachSink(cfg CockroachConfig) (Sink, error) { return nil, fmt.Errorf("ping cockroach: %w", err) } - return &cockroachSink{db: db}, nil + return &cockroachSink{db: db, enableLatest: cfg.EnableLatest}, nil } func (s *cockroachSink) Close() error { @@ -110,6 +114,11 @@ func (s *cockroachSink) Write(ctx context.Context, rec Record) error { if err := insertMutations(ctx, tx, rec); err != nil { return err } + if s.enableLatest { + if err := upsertLatest(ctx, tx, rec); err != nil { + return err + } + } if err := insertUpgrades(ctx, tx, rec); err != nil { return err } @@ -190,6 +199,56 @@ func insertMutations(ctx context.Context, tx *sql.Tx, rec Record) error { return nil } +// buildLatestBatches builds UPSERT INTO state_latest batches. The WHERE +// clause guards against out-of-order writes from parallel partition workers +// — a row is only overwritten if the incoming version is at least as new. 
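+//
+// Shape of each generated statement, reconstructed from the builder below
+// (two-row sketch):
+//
+//	INSERT INTO state_latest (store_name, key, value, version, deleted)
+//	VALUES ($1,$2,$3,$4,$5),($6,$7,$8,$9,$10)
+//	ON CONFLICT (store_name, key) DO UPDATE
+//	SET value = excluded.value, version = excluded.version, deleted = excluded.deleted
+//	WHERE state_latest.version <= excluded.version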
+func buildLatestBatches(rec Record, maxRows int) []mutationBatch { + if maxRows <= 0 { + maxRows = mutationBatchRows + } + version := rec.Entry.Version + const colsPerRow = 5 + var ( + batches []mutationBatch + args = make([]interface{}, 0, maxRows*colsPerRow) + parts = make([]string, 0, maxRows) + ) + flush := func() { + if len(parts) == 0 { + return + } + stmt := `INSERT INTO state_latest (store_name, key, value, version, deleted) VALUES ` + + strings.Join(parts, ",") + + ` ON CONFLICT (store_name, key) DO UPDATE + SET value = excluded.value, version = excluded.version, deleted = excluded.deleted + WHERE state_latest.version <= excluded.version` + batches = append(batches, mutationBatch{Stmt: stmt, Args: args}) + args = make([]interface{}, 0, maxRows*colsPerRow) + parts = make([]string, 0, maxRows) + } + for _, ncs := range rec.Entry.Changesets { + for _, p := range ncs.Changeset.Pairs { + idx := len(args) + parts = append(parts, fmt.Sprintf("($%d,$%d,$%d,$%d,$%d)", idx+1, idx+2, idx+3, idx+4, idx+5)) + args = append(args, ncs.Name, p.Key, p.Value, version, p.Delete) + if len(parts) >= maxRows { + flush() + } + } + } + flush() + return batches +} + +func upsertLatest(ctx context.Context, tx *sql.Tx, rec Record) error { + for _, b := range buildLatestBatches(rec, mutationBatchRows) { + if _, err := tx.ExecContext(ctx, b.Stmt, b.Args...); err != nil { + return fmt.Errorf("upsert state_latest: %w", err) + } + } + return nil +} + func insertUpgrades(ctx context.Context, tx *sql.Tx, rec Record) error { if len(rec.Entry.Upgrades) == 0 { return nil From c2aeefa5882fd0bd0c349f40b27bbd37065ba0f8 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 14:50:46 -0400 Subject: [PATCH 29/31] Snapshot state_at_block per block with rolling GC Adds CockroachConfig.SnapshotStores and SnapshotWindowBlocks. When SnapshotStores is non-empty, the Write tx INSERT...SELECTs from state_latest into state_at_block at the current version, giving a dense end-of-block snapshot for the listed hot stores. Reads at any block in the window become a single PK lookup per (store, key) instead of a LATERAL probe through MVCC history. SnapshotWindowBlocks > 0 GCs older snapshots inline via DELETE WHERE block_version < cutoff. Per-block this is cheap after the initial run because only the just-aged-out block has rows below cutoff. Snapshots require EnableLatest because the source-of-truth is state_latest. Validate() enforces it. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../state_db/ss/offload/consumer/cockroach.go | 74 ++++++++++++++++++- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/sei-db/state_db/ss/offload/consumer/cockroach.go b/sei-db/state_db/ss/offload/consumer/cockroach.go index 482c226322..c6d574af7b 100644 --- a/sei-db/state_db/ss/offload/consumer/cockroach.go +++ b/sei-db/state_db/ss/offload/consumer/cockroach.go @@ -7,7 +7,7 @@ import ( "strings" "time" - _ "github.com/lib/pq" + "github.com/lib/pq" ) // CockroachConfig configures the CockroachDB sink. DSN follows the standard @@ -21,6 +21,15 @@ type CockroachConfig struct { // EnableLatest UPSERTs into state_latest on every block so "current // state" reads are a single PK lookup. Cheap; ~2x the write rate. EnableLatest bool + + // SnapshotStores enables dense block-level snapshots in state_at_block + // for these stores. Each block writes a full snapshot of state_latest + // for these stores at the current version. Requires EnableLatest. 
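+	// e.g. []string{"slashing", "distribution", "staking", "bank", "params"},
+	// the hot-store set the README suggests (illustrative, not enforced).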
+ SnapshotStores []string + + // SnapshotWindowBlocks bounds the rolling snapshot window: rows older + // than (current - SnapshotWindowBlocks) are GC'd inline. 0 disables GC. + SnapshotWindowBlocks int64 } func (c *CockroachConfig) ApplyDefaults() { @@ -45,12 +54,20 @@ func (c *CockroachConfig) Validate() error { if c.MaxIdleConns < 0 { return fmt.Errorf("cockroach max idle conns must be non-negative") } + if c.SnapshotWindowBlocks < 0 { + return fmt.Errorf("snapshot window blocks must be non-negative") + } + if len(c.SnapshotStores) > 0 && !c.EnableLatest { + return fmt.Errorf("snapshot stores require EnableLatest=true") + } return nil } type cockroachSink struct { - db *sql.DB - enableLatest bool + db *sql.DB + enableLatest bool + snapshotStores []string + snapshotWindow int64 } var _ Sink = (*cockroachSink)(nil) @@ -78,7 +95,12 @@ func NewCockroachSink(cfg CockroachConfig) (Sink, error) { return nil, fmt.Errorf("ping cockroach: %w", err) } - return &cockroachSink{db: db, enableLatest: cfg.EnableLatest}, nil + return &cockroachSink{ + db: db, + enableLatest: cfg.EnableLatest, + snapshotStores: append([]string(nil), cfg.SnapshotStores...), + snapshotWindow: cfg.SnapshotWindowBlocks, + }, nil } func (s *cockroachSink) Close() error { @@ -119,6 +141,16 @@ func (s *cockroachSink) Write(ctx context.Context, rec Record) error { return err } } + if len(s.snapshotStores) > 0 { + if err := snapshotAtBlock(ctx, tx, rec.Entry.Version, s.snapshotStores); err != nil { + return err + } + if s.snapshotWindow > 0 { + if err := gcSnapshots(ctx, tx, rec.Entry.Version, s.snapshotWindow); err != nil { + return err + } + } + } if err := insertUpgrades(ctx, tx, rec); err != nil { return err } @@ -249,6 +281,40 @@ func upsertLatest(ctx context.Context, tx *sql.Tx, rec Record) error { return nil } +// snapshotAtBlockSQL copies the current state_latest rows for the given +// stores into state_at_block at the supplied version. ON CONFLICT keeps the +// statement idempotent under retry. +const snapshotAtBlockSQL = ` +INSERT INTO state_at_block (block_version, store_name, key, value, deleted) +SELECT $1, store_name, key, value, deleted +FROM state_latest +WHERE store_name = ANY($2) +ON CONFLICT (block_version, store_name, key) DO UPDATE + SET value = excluded.value, deleted = excluded.deleted` + +func snapshotAtBlock(ctx context.Context, tx *sql.Tx, version int64, stores []string) error { + if _, err := tx.ExecContext(ctx, snapshotAtBlockSQL, version, pq.StringArray(stores)); err != nil { + return fmt.Errorf("snapshot state_at_block: %w", err) + } + return nil +} + +// gcSnapshotSQL deletes state_at_block rows older than the rolling window. +// Per-block invocation is fine because the bulk of work happens once: after +// the first run, only the just-aged-out block has rows below the cutoff. 
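+//
+// Worked example (illustrative numbers): with window=2000, the write at
+// version 10000 deletes block_version < 8000; the run at 9999 already
+// cleared everything below 7999, so only block 7999's rows remain.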
+const gcSnapshotSQL = `DELETE FROM state_at_block WHERE block_version < $1` + +func gcSnapshots(ctx context.Context, tx *sql.Tx, version, window int64) error { + cutoff := version - window + if cutoff <= 0 { + return nil + } + if _, err := tx.ExecContext(ctx, gcSnapshotSQL, cutoff); err != nil { + return fmt.Errorf("gc state_at_block: %w", err) + } + return nil +} + func insertUpgrades(ctx context.Context, tx *sql.Tx, rec Record) error { if len(rec.Entry.Upgrades) == 0 { return nil From e478a39fc35c0e12d5fb84789bfc7ddcc40f9e4a Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 14:51:40 -0400 Subject: [PATCH 30/31] Test state_latest builder and snapshot SQL shapes Pins the bits that matter for correctness and parallel-write safety: - buildLatestBatches: UPSERT shape, ON CONFLICT clause, the version-guard (WHERE state_latest.version <= excluded.version) that keeps out-of-order parallel partition writes from rolling rows back, row layout, batch splitting. - snapshotAtBlockSQL: SELECT-from-state_latest source, store_name = ANY($2) filter, ON CONFLICT idempotence under retry. - gcSnapshotSQL: shape pin so an accidental edit can't drop the block_version filter. - Validate(): SnapshotStores requires EnableLatest; SnapshotWindowBlocks must be non-negative. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ss/offload/consumer/cockroach_test.go | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/sei-db/state_db/ss/offload/consumer/cockroach_test.go b/sei-db/state_db/ss/offload/consumer/cockroach_test.go index a5159507cb..434522cba6 100644 --- a/sei-db/state_db/ss/offload/consumer/cockroach_test.go +++ b/sei-db/state_db/ss/offload/consumer/cockroach_test.go @@ -99,3 +99,88 @@ func TestBuildMutationBatchesDefaultCap(t *testing.T) { batches := buildMutationBatches(rec, 0) require.Len(t, batches, 2) } + +func TestBuildLatestBatchesShape(t *testing.T) { + rec := makeRecord(7, &dbproto.NamedChangeSet{ + Name: "evm", + Changeset: dbproto.ChangeSet{Pairs: []*dbproto.KVPair{ + {Key: []byte("k1"), Value: []byte("v1")}, + {Key: []byte("k2"), Delete: true}, + }}, + }) + batches := buildLatestBatches(rec, 500) + require.Len(t, batches, 1) + + b := batches[0] + require.Contains(t, b.Stmt, "INSERT INTO state_latest") + require.Contains(t, b.Stmt, "ON CONFLICT (store_name, key) DO UPDATE") + // The version-guard is what makes parallel partition writes safe. + require.Contains(t, b.Stmt, "WHERE state_latest.version <= excluded.version") + require.Equal(t, 2, strings.Count(b.Stmt, "($")) + require.Len(t, b.Args, 10) + + // Row layout: name, key, value, version, deleted. 
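+	// Args[0..4] is the k1 row; Args[5..9] is the k2 tombstone row, so the
+	// index-9 assertion below checks its deleted flag.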
+ require.Equal(t, "evm", b.Args[0]) + require.Equal(t, []byte("k1"), b.Args[1]) + require.Equal(t, []byte("v1"), b.Args[2]) + require.Equal(t, int64(7), b.Args[3]) + require.Equal(t, false, b.Args[4]) + require.Equal(t, true, b.Args[9]) +} + +func TestBuildLatestBatchesSplits(t *testing.T) { + pairs := make([]*dbproto.KVPair, 250) + for i := range pairs { + pairs[i] = &dbproto.KVPair{Key: []byte{byte(i)}, Value: []byte{0x1}} + } + rec := makeRecord(9, &dbproto.NamedChangeSet{ + Name: "bank", + Changeset: dbproto.ChangeSet{Pairs: pairs}, + }) + batches := buildLatestBatches(rec, 100) + require.Len(t, batches, 3) + require.Len(t, batches[0].Args, 500) + require.Len(t, batches[1].Args, 500) + require.Len(t, batches[2].Args, 250) +} + +func TestBuildLatestBatchesEmpty(t *testing.T) { + require.Empty(t, buildLatestBatches(makeRecord(1), 500)) +} + +func TestSnapshotAtBlockSQLShape(t *testing.T) { + for _, frag := range []string{ + "INSERT INTO state_at_block", + "FROM state_latest", + "store_name = ANY($2)", + "ON CONFLICT (block_version, store_name, key)", + } { + require.Containsf(t, snapshotAtBlockSQL, frag, + "snapshotAtBlockSQL missing required fragment %q", frag) + } +} + +func TestGCSnapshotSQLShape(t *testing.T) { + require.Contains(t, gcSnapshotSQL, "DELETE FROM state_at_block") + require.Contains(t, gcSnapshotSQL, "block_version < $1") +} + +func TestCockroachConfigValidateSnapshotRequiresLatest(t *testing.T) { + cfg := CockroachConfig{ + DSN: "postgres://x", + SnapshotStores: []string{"slashing"}, + } + err := cfg.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "EnableLatest") + + cfg.EnableLatest = true + require.NoError(t, cfg.Validate()) +} + +func TestCockroachConfigValidateNegativeWindow(t *testing.T) { + cfg := CockroachConfig{DSN: "postgres://x", SnapshotWindowBlocks: -1} + err := cfg.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "window") +} From 5fdba288fa51773b1d06765d0d7145905afdef22 Mon Sep 17 00:00:00 2001 From: kbhat1 Date: Tue, 28 Apr 2026 14:52:39 -0400 Subject: [PATCH 31/31] Wire ENABLE_LATEST / SNAPSHOT_STORES / SNAPSHOT_WINDOW_BLOCKS Surfaces the new sink knobs through deploy.sh so an operator can flip them without editing config files, and documents the two new tables in README. Default values keep behavior unchanged for existing deployments. Co-Authored-By: Claude Opus 4.7 (1M context) --- sei-db/state_db/ss/offload/consumer/README.md | 20 +++++++++++++++++++ sei-db/state_db/ss/offload/consumer/deploy.sh | 9 +++++++++ 2 files changed, 29 insertions(+) diff --git a/sei-db/state_db/ss/offload/consumer/README.md b/sei-db/state_db/ss/offload/consumer/README.md index e5c39955d1..0ac50ebecd 100644 --- a/sei-db/state_db/ss/offload/consumer/README.md +++ b/sei-db/state_db/ss/offload/consumer/README.md @@ -39,3 +39,23 @@ RUN=1 ./deploy.sh - Sink writes use bounded exponential backoff (5 attempts, 1s→30s) before giving up. On give-up the process exits non-zero so the supervisor restarts; Kafka offsets stay uncommitted, so the next run replays from the last commit. + +## Read-side optimization tables + +Two optional tables make trace-style reads dramatically faster. Both are off +by default; flip on by setting the matching env var before `deploy.sh`: + +- `state_latest` — one row per `(store, key)` with the most recent value. + Reads at "current state" become a single PK lookup instead of a descending + scan on `state_mutations`. Enable with `ENABLE_LATEST=true`. ~2× the write + rate; cheap on Cockroach. 
+- `state_at_block` — dense end-of-block snapshot for hot stores. Each block
+  copies the selected stores' rows from `state_latest` into `state_at_block`,
+  so reads at any block in the rolling window are a single PK lookup per
+  `(store, key)`. Set
+  `SNAPSHOT_STORES="slashing,distribution,staking,bank,params"` (requires
+  `ENABLE_LATEST=true`); bound storage with `SNAPSHOT_WINDOW_BLOCKS=2000` so
+  the consumer GCs older blocks inline.
+
+The sentinel-pointer pattern was net-negative on pebble (compaction lag);
+moved here it's net-positive because the write tax goes to a system designed
+to absorb it.
diff --git a/sei-db/state_db/ss/offload/consumer/deploy.sh b/sei-db/state_db/ss/offload/consumer/deploy.sh
index 1f86a0471b..5489893d9e 100755
--- a/sei-db/state_db/ss/offload/consumer/deploy.sh
+++ b/sei-db/state_db/ss/offload/consumer/deploy.sh
@@ -16,6 +16,9 @@
 # KAFKA_START_OFFSET default first (first|last)
 # COCKROACH_MAX_CONNS default 16
 # WORKERS default 1 (per-partition parallelism)
+# ENABLE_LATEST default false (UPSERT state_latest per block)
+# SNAPSHOT_STORES default "" (comma-separated; e.g. "slashing,distribution,staking,bank,params")
+# SNAPSHOT_WINDOW_BLOCKS default 0 (rolling window; 0 disables GC)
 # CONFIG_OUT default ./historical-offload-consumer.json
 # BIN_OUT default ./bin/historical-offload-consumer
 # SKIP_SCHEMA=1 skip applying schema.sql
@@ -38,6 +41,9 @@ KAFKA_SASL_MECHANISM="${KAFKA_SASL_MECHANISM:-aws-msk-iam}"
 KAFKA_START_OFFSET="${KAFKA_START_OFFSET:-first}"
 COCKROACH_MAX_CONNS="${COCKROACH_MAX_CONNS:-16}"
 WORKERS="${WORKERS:-1}"
+ENABLE_LATEST="${ENABLE_LATEST:-false}"
+SNAPSHOT_STORES="${SNAPSHOT_STORES:-}"
+SNAPSHOT_WINDOW_BLOCKS="${SNAPSHOT_WINDOW_BLOCKS:-0}"
 CONFIG_OUT="${CONFIG_OUT:-./historical-offload-consumer.json}"
 BIN_OUT="${BIN_OUT:-./bin/historical-offload-consumer}"
@@ -79,6 +85,9 @@ cfg = {
 "Cockroach": {
 "DSN": os.environ["COCKROACH_DSN"],
 "MaxOpenConns": int(os.environ["COCKROACH_MAX_CONNS"]),
+ "EnableLatest": os.environ["ENABLE_LATEST"].lower() == "true",
+ "SnapshotStores": [s.strip() for s in os.environ["SNAPSHOT_STORES"].split(",") if s.strip()],
+ "SnapshotWindowBlocks": int(os.environ["SNAPSHOT_WINDOW_BLOCKS"]),
 },
 "Workers": int(os.environ["WORKERS"]),
 }
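
For reference, a minimal invocation that turns on both read-side tables could
look like this (values illustrative; the env names and the RUN=1 convention
come from deploy.sh and the README above):

    ENABLE_LATEST=true \
    SNAPSHOT_STORES="slashing,distribution,staking,bank,params" \
    SNAPSHOT_WINDOW_BLOCKS=2000 \
    RUN=1 ./deploy.sh

deploy.sh folds these into the generated JSON config as "EnableLatest": true,
"SnapshotStores": ["slashing", ...], and "SnapshotWindowBlocks": 2000.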