From a55291b8e57972411ca43ea2847240d0c93646f7 Mon Sep 17 00:00:00 2001
From: Raymond Jacobson
Date: Fri, 29 May 2026 16:56:31 -0700
Subject: [PATCH 1/2] fix(indexer): errgroup.WithContext so ETL halt actually exits the process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CoreIndexer.Start used a bare errgroup.Group{}, which doesn't share or
cancel any ctx. When etlIndexer.Run() returned the halt-on-block-error
introduced by go-openaudio#323 / api#883:

- the errgroup captured the error
- but aggregatesCalculator.Start kept spinning on an uncancelled ctx
- eg.Wait() blocked forever waiting for it
- main.go's panic(err) never received anything
- the pod stayed 1/1 Running with the ETL goroutine dead

Observed in prod tonight on cd94ede: a 25P02 cascade in the plays-hook
savepoint poisoned the pgx pool, indexBlocks() correctly returned per
#323, but the pod kept running with the parity jobs (which start outside
the errgroup) ticking happily and MAX(blocks.height) advancing via the
still-running Python indexer — making the wedge invisible to health
checks. The whole point of #323's halt-on-error was defeated by this
wrapper.

The fix is one-liner-ish: errgroup.WithContext(ctx) so the first error
cancels gCtx, which aggregatesCalculator and the parity jobs already
honor. eg.Wait then returns, main panics, the supervisor restarts the
pod with a fresh pgx pool, and the ETL self-heals — same retry semantics
#323 promised.

Test plan

- go build ./..., go vet ./..., gofmt clean.
- go test ./indexer/... still passes (10 pubkey + user_events hook tests).
- After deploy, can synthesize the ETL-halt path and confirm the pod
  actually crash-restarts.
Co-Authored-By: Claude Opus 4.7 (1M context) --- indexer/indexer.go | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/indexer/indexer.go b/indexer/indexer.go index 94721b5f..9056ac02 100644 --- a/indexer/indexer.go +++ b/indexer/indexer.go @@ -108,25 +108,34 @@ func NewIndexer(cfg config.Config) *CoreIndexer { } } -// Start runs the ETL indexer alongside the aggregates calculator. Both are -// long-lived; errgroup propagates the first error (and the ctx cancellation -// it triggers) to all members. +// Start runs the ETL indexer alongside the aggregates calculator. The +// errgroup uses WithContext so that if either long-lived goroutine returns +// an error — most importantly an ETL halt from go-openaudio#323's +// halt-on-block-error path — the shared ctx is cancelled and the aggregates +// loop + parity jobs exit too, allowing eg.Wait() to return and main.go's +// `panic(err)` to crash-restart the pod. // -// Caveat: etl.Indexer.Run() uses its own internal context.Background() rather -// than honoring `ctx` — graceful shutdown via ctx cancellation isn't supported -// by the upstream API today. Process termination (SIGTERM) still works the -// way Go programs always do, and DB connections drain via pool finalizers on -// process exit. Acceptable tradeoff to avoid forking ETL. +// A previous version used a bare `errgroup.Group{}` (no shared ctx), +// which silently broke the halt path: ETL Run() returned its error, but +// aggregatesCalculator.Start kept spinning on an uncancelled ctx, eg.Wait +// blocked forever, and the process never exited. Observed in prod on +// cd94ede when an ETL 25P02 cascade halted the indexer but the pod stayed +// 1/1 Running — defeating the whole point of #323. +// +// Caveat: etl.Indexer.Run() still uses its own internal context.Background() +// rather than honoring the shared ctx — upstream doesn't expose ctx-driven +// shutdown. 
External SIGTERM still terminates via Go signal handling + +// k8s grace period, and DB pools drain via finalizers on exit. func (ci *CoreIndexer) Start(ctx context.Context) error { - eg := errgroup.Group{} + eg, gCtx := errgroup.WithContext(ctx) eg.Go(func() error { - return ci.aggregatesCalculator.Start(ctx) + return ci.aggregatesCalculator.Start(gCtx) }) eg.Go(func() error { ci.logger.Info("Starting ETL indexer") return ci.etlIndexer.Run() }) - ci.startParityJobs(ctx) + ci.startParityJobs(gCtx) return eg.Wait() } From dc1bde67969362647a907c6b55410921dfb0c0c8 Mon Sep 17 00:00:00 2001 From: Ray Jacobson Date: Fri, 29 May 2026 16:57:59 -0700 Subject: [PATCH 2/2] Remove comment --- indexer/indexer.go | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/indexer/indexer.go b/indexer/indexer.go index 9056ac02..2c1114a7 100644 --- a/indexer/indexer.go +++ b/indexer/indexer.go @@ -108,24 +108,6 @@ func NewIndexer(cfg config.Config) *CoreIndexer { } } -// Start runs the ETL indexer alongside the aggregates calculator. The -// errgroup uses WithContext so that if either long-lived goroutine returns -// an error — most importantly an ETL halt from go-openaudio#323's -// halt-on-block-error path — the shared ctx is cancelled and the aggregates -// loop + parity jobs exit too, allowing eg.Wait() to return and main.go's -// `panic(err)` to crash-restart the pod. -// -// A previous version used a bare `errgroup.Group{}` (no shared ctx), -// which silently broke the halt path: ETL Run() returned its error, but -// aggregatesCalculator.Start kept spinning on an uncancelled ctx, eg.Wait -// blocked forever, and the process never exited. Observed in prod on -// cd94ede when an ETL 25P02 cascade halted the indexer but the pod stayed -// 1/1 Running — defeating the whole point of #323. -// -// Caveat: etl.Indexer.Run() still uses its own internal context.Background() -// rather than honoring the shared ctx — upstream doesn't expose ctx-driven -// shutdown. 
External SIGTERM still terminates via Go signal handling + -// k8s grace period, and DB pools drain via finalizers on exit. func (ci *CoreIndexer) Start(ctx context.Context) error { eg, gCtx := errgroup.WithContext(ctx) eg.Go(func() error {