From a55291b8e57972411ca43ea2847240d0c93646f7 Mon Sep 17 00:00:00 2001
From: Raymond Jacobson <ray@audius.co>
Date: Fri, 29 May 2026 16:56:31 -0700
Subject: [PATCH 1/2] fix(indexer): errgroup.WithContext so ETL halt actually
 exits the process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CoreIndexer.Start used a bare errgroup.Group{}, which doesn't share or
cancel any ctx. When etlIndexer.Run() returned the halt-on-block-error
introduced by go-openaudio#323 / api#883:

  - the errgroup captured the error
  - but aggregatesCalculator.Start kept spinning on an uncancelled ctx
  - eg.Wait() blocked forever waiting for it
  - main.go's panic(err) never received anything
  - the pod stayed 1/1 Running with the ETL goroutine dead

Observed in prod tonight on cd94ede: a 25P02 cascade in the plays-hook
savepoint poisoned the pgx pool, indexBlocks() correctly returned per
#323, but the pod kept running with the parity jobs (which start outside
the errgroup) ticking happily and MAX(blocks.height) advancing via the
still-running Python indexer — making the wedge invisible to health
checks. The whole point of #323's halt-on-error was defeated by this
wrapper.

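The fix is one-liner-ish: use errgroup.WithContext(ctx) so that the first
error cancels gCtx, which aggregatesCalculator and the parity jobs already
honor. eg.Wait then returns, main panics, the supervisor restarts the
pod with a fresh pgx pool, and the ETL self-heals — same retry semantics
#323 promised.

Note that this only unblocks the ETL-halt direction: etlIndexer.Run()
takes no ctx, so if aggregatesCalculator were the one to fail first,
eg.Wait would presumably still block until Run() returns on its own.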

Test plan
- go build ./..., go vet ./..., gofmt clean.
- go test ./indexer/... still passes (10 pubkey + user_events hook tests).
- After deploy, synthesize the ETL-halt path and confirm that the pod
  actually crash-restarts.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 indexer/indexer.go | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/indexer/indexer.go b/indexer/indexer.go
index 94721b5f..9056ac02 100644
--- a/indexer/indexer.go
+++ b/indexer/indexer.go
@@ -108,25 +108,34 @@ func NewIndexer(cfg config.Config) *CoreIndexer {
 	}
 }
 
-// Start runs the ETL indexer alongside the aggregates calculator. Both are
-// long-lived; errgroup propagates the first error (and the ctx cancellation
-// it triggers) to all members.
+// Start runs the ETL indexer alongside the aggregates calculator. The
+// errgroup uses WithContext so that if either long-lived goroutine returns
+// an error — most importantly an ETL halt from go-openaudio#323's
+// halt-on-block-error path — the shared ctx is cancelled and the aggregates
+// loop + parity jobs exit too, allowing eg.Wait() to return and main.go's
+// `panic(err)` to crash-restart the pod.
 //
-// Caveat: etl.Indexer.Run() uses its own internal context.Background() rather
-// than honoring `ctx` — graceful shutdown via ctx cancellation isn't supported
-// by the upstream API today. Process termination (SIGTERM) still works the
-// way Go programs always do, and DB connections drain via pool finalizers on
-// process exit. Acceptable tradeoff to avoid forking ETL.
+// A previous version used a bare `errgroup.Group{}` (no shared ctx),
+// which silently broke the halt path: ETL Run() returned its error, but
+// aggregatesCalculator.Start kept spinning on an uncancelled ctx, eg.Wait
+// blocked forever, and the process never exited. Observed in prod on
+// cd94ede when an ETL 25P02 cascade halted the indexer but the pod stayed
+// 1/1 Running — defeating the whole point of #323.
+//
+// Caveat: etl.Indexer.Run() still uses its own internal context.Background()
+// rather than honoring the shared ctx — upstream doesn't expose ctx-driven
+// shutdown. External SIGTERM still terminates via Go signal handling +
+// k8s grace period, and DB pools drain via finalizers on exit.
 func (ci *CoreIndexer) Start(ctx context.Context) error {
-	eg := errgroup.Group{}
+	eg, gCtx := errgroup.WithContext(ctx)
 	eg.Go(func() error {
-		return ci.aggregatesCalculator.Start(ctx)
+		return ci.aggregatesCalculator.Start(gCtx)
 	})
 	eg.Go(func() error {
 		ci.logger.Info("Starting ETL indexer")
 		return ci.etlIndexer.Run()
 	})
-	ci.startParityJobs(ctx)
+	ci.startParityJobs(gCtx)
 	return eg.Wait()
 }
 

From dc1bde67969362647a907c6b55410921dfb0c0c8 Mon Sep 17 00:00:00 2001
From: Ray Jacobson <raymondshujacobson8@gmail.com>
Date: Fri, 29 May 2026 16:57:59 -0700
Subject: [PATCH 2/2] Remove doc comment from CoreIndexer.Start

---
 indexer/indexer.go | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/indexer/indexer.go b/indexer/indexer.go
index 9056ac02..2c1114a7 100644
--- a/indexer/indexer.go
+++ b/indexer/indexer.go
@@ -108,24 +108,6 @@ func NewIndexer(cfg config.Config) *CoreIndexer {
 	}
 }
 
-// Start runs the ETL indexer alongside the aggregates calculator. The
-// errgroup uses WithContext so that if either long-lived goroutine returns
-// an error — most importantly an ETL halt from go-openaudio#323's
-// halt-on-block-error path — the shared ctx is cancelled and the aggregates
-// loop + parity jobs exit too, allowing eg.Wait() to return and main.go's
-// `panic(err)` to crash-restart the pod.
-//
-// A previous version used a bare `errgroup.Group{}` (no shared ctx),
-// which silently broke the halt path: ETL Run() returned its error, but
-// aggregatesCalculator.Start kept spinning on an uncancelled ctx, eg.Wait
-// blocked forever, and the process never exited. Observed in prod on
-// cd94ede when an ETL 25P02 cascade halted the indexer but the pod stayed
-// 1/1 Running — defeating the whole point of #323.
-//
-// Caveat: etl.Indexer.Run() still uses its own internal context.Background()
-// rather than honoring the shared ctx — upstream doesn't expose ctx-driven
-// shutdown. External SIGTERM still terminates via Go signal handling +
-// k8s grace period, and DB pools drain via finalizers on exit.
 func (ci *CoreIndexer) Start(ctx context.Context) error {
 	eg, gCtx := errgroup.WithContext(ctx)
 	eg.Go(func() error {