     nodeStreams        sync.Map // streams for worker acks indexed by ID
     pendingJobChannels sync.Map // channels used to send DispatchJob results, nil if event is requeued
     pendingEvents      sync.Map // pending events indexed by sender and event IDs
+    orphanedPayloads   sync.Map // job key -> first time observed orphaned payload (unix nanos)
 
     lock    sync.RWMutex
     closing bool
@@ -355,7 +356,10 @@ func (node *Node) DispatchJob(ctx context.Context, key string, payload []byte) e
 }
 
 func (node *Node) dispatchJob(ctx context.Context, key string, job []byte, requeue bool) error {
-    if node.IsClosed() {
+    // Allow internal requeue operations to proceed while the node is closing.
+    // External callers use DispatchJob which passes requeue=false and should be
+    // rejected once Close begins.
+    if node.IsClosed() && !requeue {
         return fmt.Errorf("DispatchJob: pool %q is closed", node.PoolName)
     }
 
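The new requeue flag lets internal requeue operations bypass the closed check that rejects external DispatchJob calls. As a rough illustration only (rejectIfClosed is a hypothetical helper; the diff keeps the check inline in dispatchJob):

```go
// rejectIfClosed mirrors the guard added above. Illustrative helper only; the
// real check stays inline in dispatchJob.
func (node *Node) rejectIfClosed(requeue bool) error {
    // DispatchJob always passes requeue=false, so external callers are rejected
    // once Close has started; internal requeues issued during close still proceed.
    if node.IsClosed() && !requeue {
        return fmt.Errorf("DispatchJob: pool %q is closed", node.PoolName)
    }
    return nil
}
```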
@@ -553,16 +557,19 @@ func (node *Node) close(ctx context.Context, shutdown bool) error {
         node.stopAllJobs(ctx)
     }
 
-    // Stop all workers before waiting for goroutines
+    // Stop all workers before waiting for goroutines.
+    //
+    // IMPORTANT: do NOT remove workers from the replicated maps here.
+    // Removing a worker deletes the worker->jobs mapping, which is what other
+    // nodes use to recover/requeue jobs if this node dies mid-close. We only
+    // remove workers from the maps after we've attempted to requeue.
     var wg sync.WaitGroup
     node.localWorkers.Range(func(key, value any) bool {
         worker := value.(*Worker)
         wg.Add(1)
         pulse.Go(node.logger, func() {
             defer wg.Done()
             worker.stop(ctx)
-            // Remove worker immediately to avoid job requeuing by other nodes
-            node.removeWorker(ctx, worker.ID)
         })
         return true
     })
@@ -572,13 +579,24 @@ func (node *Node) close(ctx context.Context, shutdown bool) error {
     close(node.stop)
     node.wg.Wait()
 
-    // Requeue jobs if not shutting down, after stopping goroutines to avoid receiving new jobs
+    // Requeue jobs if not shutting down.
+    //
+    // This is done after stopping node goroutines so we don't route any new pool
+    // events to workers that have already been stopped.
     if !shutdown {
         if err := node.requeueAllJobs(ctx); err != nil {
             node.logger.Error(fmt.Errorf("close: failed to requeue jobs: %w", err))
         }
     }
 
+    // Now that we have attempted to requeue, remove all local workers from the pool maps.
+    node.localWorkers.Range(func(key, value any) bool {
+        worker := value.(*Worker)
+        node.removeWorker(ctx, worker.ID)
+        node.localWorkers.Delete(key)
+        return true
+    })
+
     // Cleanup resources
     node.cleanupNode(ctx)
 
@@ -959,6 +977,65 @@ func (node *Node) cleanupInactiveWorkers(ctx context.Context) {
         node.logger.Info("cleanupInactiveWorkers: found inactive worker", "worker", workerID)
         node.cleanupWorker(ctx, workerID)
     }
+
+    // Also recover any jobs that still have payloads but are missing from the job map.
+    // This can happen transiently during cascading failures; requeuing such jobs is
+    // preferable to leaving them "stuck" (payload exists, but no worker owns the job).
+    node.requeueOrphanedPayloads(ctx)
+}
+
+// requeueOrphanedPayloads detects payloads for job keys that are not present in
+// the job map and requeues them after a grace period.
+func (node *Node) requeueOrphanedPayloads(ctx context.Context) {
+    // Build a set of all job keys referenced by the job map.
+    existingJobs := make(map[string]struct{})
+    for _, jobs := range node.jobMap.Map() {
+        for _, key := range strings.Split(jobs, ",") {
+            if key == "" {
+                continue
+            }
+            existingJobs[key] = struct{}{}
+        }
+    }
+
+    // Use a grace period: recovery should be fast under churn, but we still want
+    // to avoid requeuing during brief map inconsistencies.
+    grace := 2 * node.workerTTL
+    if grace < node.ackGracePeriod {
+        grace = node.ackGracePeriod
+    }
+
+    now := time.Now()
+    for key := range node.jobPayloadMap.Map() {
+        if _, ok := existingJobs[key]; ok {
+            node.orphanedPayloads.Delete(key)
+            continue
+        }
+
+        firstAny, ok := node.orphanedPayloads.Load(key)
+        if !ok {
+            node.orphanedPayloads.Store(key, now.UnixNano())
+            continue
+        }
+        firstNS, _ := firstAny.(int64)
+        if firstNS == 0 || now.Sub(time.Unix(0, firstNS)) < grace {
+            continue
+        }
+
+        payload, ok := node.JobPayload(key)
+        if !ok {
+            node.orphanedPayloads.Delete(key)
+            continue
+        }
+        job := &Job{Key: key, Payload: payload, CreatedAt: now, NodeID: node.ID}
+        if _, err := node.poolStream.Add(ctx, evStartJob, marshalJob(job)); err != nil {
+            node.logger.Error(fmt.Errorf("requeueOrphanedPayloads: failed to requeue orphaned job: %w", err), "key", key)
+            continue
+        }
+
+        node.orphanedPayloads.Delete(key)
+        node.logger.Info("requeueOrphanedPayloads: requeued orphaned job", "key", key, "grace", grace)
+    }
 }
 
 // cleanupWorker requeues the jobs assigned to the worker and deletes it from
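requeueOrphanedPayloads relies on a first-observation debounce: a payload is only requeued after it has stayed orphaned for a full grace period, and its tracking entry is dropped as soon as the key shows up in the job map again. The same pattern in isolation, as a self-contained sketch (the package, type, and method names here are illustrative and not part of the pool package):

```go
package orphan

import (
    "sync"
    "time"
)

// tracker debounces orphan observations: a key qualifies for requeue only
// after it has been continuously orphaned for at least the grace period.
type tracker struct {
    firstSeen sync.Map // key -> first observation time (unix nanos)
}

// shouldRequeue records the first time key is seen orphaned and reports
// whether it has stayed orphaned for at least grace.
func (t *tracker) shouldRequeue(key string, now time.Time, grace time.Duration) bool {
    firstAny, loaded := t.firstSeen.LoadOrStore(key, now.UnixNano())
    if !loaded {
        return false // first sighting, start the clock
    }
    first, _ := firstAny.(int64)
    return first != 0 && now.Sub(time.Unix(0, first)) >= grace
}

// forget clears the tracking entry, e.g. once the key has been requeued or is
// no longer orphaned.
func (t *tracker) forget(key string) { t.firstSeen.Delete(key) }
```

Deleting the tracking entry on every pass where the key is present again (as the diff does via node.orphanedPayloads.Delete) keeps the map from growing without bound.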
@@ -981,23 +1058,38 @@ func (node *Node) cleanupWorker(ctx context.Context, workerID string) {
     }
 
     // Requeue jobs and process them
-    var requeued int
+    var (
+        requeued  int // jobs successfully requeued
+        processed int // jobs that were either requeued or cleaned up as stale
+    )
     for _, key := range keys {
         payload, ok := node.JobPayload(key)
         if !ok {
-            node.logger.Error(fmt.Errorf("requeueWorkerJobs: failed to get job payload"), "job", key, "worker", workerID)
-            requeued++ // We will never be able to requeue this job
+            // The job key can remain in the jobs map even if the payload has already
+            // been removed (e.g. the job was stopped, or another node already handled
+            // the requeue). Treat it as a stale entry and remove it so future cleanup
+            // attempts don't keep looping on it.
+            if _, _, err := node.jobMap.RemoveValues(ctx, workerID, key); err != nil {
+                node.logger.Error(fmt.Errorf("cleanupWorker: failed to remove stale job from jobs map: %w", err), "job", key, "worker", workerID)
+                continue
+            }
+            node.logger.Info("cleanupWorker: removed stale job key with missing payload", "job", key, "worker", workerID)
+            processed++
             continue
         }
-        job := &Job{Key: key, Payload: []byte(payload), CreatedAt: time.Now(), NodeID: node.ID}
-        if err := node.dispatchJob(ctx, job.Key, marshalJob(job), true); err != nil {
+        job := &Job{Key: key, Payload: payload, CreatedAt: time.Now(), NodeID: node.ID}
+        // Requeue by adding an event back to the pool stream.
+        // We intentionally do not wait for the job to start (which can time out
+        // under heavy churn) - the pool sink will retry routing until it is acked.
+        if _, err := node.poolStream.Add(ctx, evStartJob, marshalJob(job)); err != nil {
             node.logger.Error(fmt.Errorf("requeueWorkerJobs: failed to requeue job: %w", err), "job", job.Key, "worker", workerID)
             continue
         }
         requeued++
+        processed++
     }
-    if len(keys) != requeued {
-        node.logger.Info("partially requeued stale worker jobs", "requeued", requeued, "jobs", len(keys), "worker", workerID)
+    if len(keys) != processed {
+        node.logger.Info("partially processed stale worker jobs", "requeued", requeued, "processed", processed, "jobs", len(keys), "worker", workerID)
         return
     }
 
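The other notable change in this hunk is how jobs get back onto the pool: instead of dispatchJob, which waits for a worker to ack the start event and can time out when many workers are cleaned up at once, cleanupWorker now appends the start event directly to the pool stream and lets the pool sink retry routing. Roughly, as a sketch built only from identifiers that appear in the diff (requeueViaStream is a hypothetical helper; the diff inlines this logic):

```go
// requeueViaStream re-emits a start event for the job and returns immediately;
// the pool sink keeps routing the event until some worker acks it.
func (node *Node) requeueViaStream(ctx context.Context, key string, payload []byte) error {
    job := &Job{Key: key, Payload: payload, CreatedAt: time.Now(), NodeID: node.ID}
    if _, err := node.poolStream.Add(ctx, evStartJob, marshalJob(job)); err != nil {
        return fmt.Errorf("requeue %q: %w", key, err)
    }
    return nil
}
```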
@@ -1014,21 +1106,12 @@ func (node *Node) processInactiveJobs(ctx context.Context) {
     ticker := time.NewTicker(node.ackGracePeriod) // Run at ackGracePeriod frequency since pending jobs expire after 2*ackGracePeriod
     defer ticker.Stop()
 
-    payloadCleanupTicker, err := node.NewTicker(ctx, "jobPayloadCleanup", node.workerTTL)
-    if err != nil {
-        node.logger.Error(fmt.Errorf("processInactiveJobs: failed to create payload cleanup ticker: %w", err))
-        return
-    }
-    defer payloadCleanupTicker.Stop()
-
     for {
         select {
         case <-node.stop:
             return
         case <-ticker.C:
             node.cleanupStalePendingJobs(ctx)
-        case <-payloadCleanupTicker.C:
-            node.cleanupOrphanedJobPayloads(ctx)
         }
     }
 }
@@ -1050,29 +1133,6 @@ func (node *Node) cleanupStalePendingJobs(ctx context.Context) {
     }
 }
 
-// cleanupOrphanedJobPayloads checks for and removes entries in the job payload map
-// that don't have a corresponding entry in the job map.
-func (node *Node) cleanupOrphanedJobPayloads(ctx context.Context) {
-    // Get all existing job keys from the job map
-    existingJobs := make(map[string]struct{})
-    for _, jobs := range node.jobMap.Map() {
-        for _, key := range strings.Split(jobs, ",") {
-            existingJobs[key] = struct{}{}
-        }
-    }
-
-    // Check each payload entry
-    for key := range node.jobPayloadMap.Map() {
-        if _, exists := existingJobs[key]; !exists {
-            if _, err := node.jobPayloadMap.Delete(ctx, key); err != nil {
-                node.logger.Error(fmt.Errorf("cleanupOrphanedJobPayloads: failed to delete orphaned payload for job %q: %w", key, err))
-                continue
-            }
-            node.logger.Info("cleanupOrphanedJobPayloads: removed orphaned payload", "key", key)
-        }
-    }
-}
-
 // acquireCleanupLock tries to acquire the cleanup lock for a worker.
 // It returns true if the lock was acquired, false if another node holds the lock.
 // It will clear any stale or invalid locks it finds.
@@ -1226,12 +1286,15 @@ func (node *Node) removeWorkerFromMaps(ctx context.Context, id string) {
     if _, err := node.workerCleanupMap.Delete(ctx, id); err != nil {
         node.logger.Error(fmt.Errorf("removeWorkerFromMaps: failed to remove cleanup timestamp: %w", err), "worker", id)
     }
-    jobKeys, _ := node.jobMap.GetValues(id)
-    for _, key := range jobKeys {
-        if _, err := node.jobPayloadMap.Delete(ctx, key); err != nil {
-            node.logger.Error(fmt.Errorf("removeWorkerFromMaps: failed to remove job %s from payload map: %w", key, err))
-        }
-    }
+    // NOTE: Do not delete job payloads here.
+    //
+    // Payload entries are job-scoped (not worker-scoped) and are required to
+    // safely requeue jobs from a stale worker during distributed cleanup. Deleting
+    // payloads during worker removal can race with another node performing
+    // cleanup/requeue and lead to permanent job loss.
+    //
+    // Payloads are deleted when jobs stop (see Worker.stopJob) and any remaining
+    // orphaned payloads are eventually requeued by requeueOrphanedPayloads.
     if _, err := node.jobMap.Delete(ctx, id); err != nil {
         node.logger.Error(fmt.Errorf("removeWorkerFromMaps: failed to remove worker %s from jobs map: %w", id, err))
     }
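To see why payloads must outlive worker removal, consider a surviving peer recovering a dead worker's jobs: it needs both the worker-to-jobs mapping and the payloads. A hedged sketch of that recovery path, built from identifiers that appear in the diff (recoverDeadWorker is a hypothetical name; error handling trimmed):

```go
func (peer *Node) recoverDeadWorker(ctx context.Context, workerID string) {
    keys, _ := peer.jobMap.GetValues(workerID) // worker -> job keys
    for _, key := range keys {
        payload, ok := peer.JobPayload(key) // lost for good if deleted on worker removal
        if !ok {
            continue
        }
        job := &Job{Key: key, Payload: payload, CreatedAt: time.Now(), NodeID: peer.ID}
        _, _ = peer.poolStream.Add(ctx, evStartJob, marshalJob(job))
    }
}
```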