Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions internal/connections/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,21 +192,39 @@ func (s *Client) List(ctx context.Context, sort SortMode) (ConnectionList, error
}

deadline := time.Now().Add(s.retryBudget)
var partial ConnectionList
havePartial := false
for {
list, retryAfter, err := s.tryList(ctx, sort)
if err == nil {
switch {
case err == nil && !list.HasUnreachableInstance():
return list, nil
}
if isTimeoutError(err) && ctx.Err() != nil {
case err == nil:
// 200 OK, but the server flagged one or more instances unreachable
// (a partial fan-out failure). These are usually momentary under
// load and clear within a poll or two, so retry within the budget.
// Retry locally; the response carries no Retry-After hint.
partial, havePartial = list, true
retryAfter = 0
case isTimeoutError(err) && ctx.Err() != nil:
Comment thread
cursor[bot] marked this conversation as resolved.
if callerCtx.Err() != nil {
return ConnectionList{}, fmt.Errorf("list connections: %w", callerCtx.Err())
}
// A later attempt timed out, but an earlier one already returned a
// usable partial. Surface that (matching the wait path) rather than a
// failed-refresh error.
if havePartial {
return partial, nil
}
return ConnectionList{}, fmt.Errorf("list connections: request timed out after %s, please retry", s.cfg.RequestTimeout)
}
if !errors.Is(err, errListWarming) {
case !errors.Is(err, errListWarming):
return ConnectionList{}, err
}

if s.retryBudget <= 0 {
if havePartial {
return partial, nil
}
return ConnectionList{}, errListWarmingExhausted
}

Expand All @@ -215,12 +233,21 @@ func (s *Client) List(ctx context.Context, sort SortMode) (ConnectionList, error
delay = s.retryDelay()
}
if time.Now().Add(delay).After(deadline) {
if havePartial {
return partial, nil
}
return ConnectionList{}, errListWarmingExhausted
}

select {
case <-time.After(delay):
case <-ctx.Done():
if callerCtx.Err() != nil {
return ConnectionList{}, fmt.Errorf("list connections: %w", callerCtx.Err())
}
if havePartial {
return partial, nil
}
return ConnectionList{}, fmt.Errorf("list connections: %w", ctx.Err())
}
}
Expand Down
174 changes: 171 additions & 3 deletions internal/connections/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const sampleListResponse = `{
"captured_at": "2026-04-29T12:34:56.789Z",
"instances": [
{"id": "primary", "role": "primary", "error": null},
{"id": "replica-1", "role": "replica", "error": "timeout after 2s"}
{"id": "replica-1", "role": "replica", "error": null}
],
"data": [
{
Expand Down Expand Up @@ -180,8 +180,8 @@ func TestClient_ListDecodesConnectionList(t *testing.T) {
if list.Instances[0] != (InstanceMeta{ID: "primary", Role: "primary"}) {
t.Errorf("Instances[0] = %+v, want primary/primary/(no error)", list.Instances[0])
}
if list.Instances[1] != (InstanceMeta{ID: "replica-1", Role: "replica", Error: "timeout after 2s"}) {
t.Errorf("Instances[1] = %+v, want replica-1/replica/timeout", list.Instances[1])
if list.Instances[1] != (InstanceMeta{ID: "replica-1", Role: "replica"}) {
t.Errorf("Instances[1] = %+v, want replica-1/replica/(no error)", list.Instances[1])
}

if first.InstanceRole != "primary" {
Expand Down Expand Up @@ -526,6 +526,144 @@ func TestClientListPartialResponseFriendlyMessage(t *testing.T) {
}
}

// partialInstanceListResponse models a 200 OK that the server returns while a
// fan-out to one instance failed: the primary is flagged unreachable while the
// replica reports cleanly. The error string mirrors a real captured trace.
const partialInstanceListResponse = `{
"type": "list",
"captured_at": "2026-04-29T12:34:56.789Z",
"instances": [
{"id": "primary", "role": "primary", "error": "remote service unavailable"},
{"id": "replica-1", "role": "replica", "error": null}
],
"data": []
}`

func TestClient_ListRetriesPartialInstanceErrorThenReturnsClean(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.Header().Set("Content-Type", "application/json")
if calls.Add(1) == 1 {
_, _ = io.WriteString(w, partialInstanceListResponse)
return
}
_, _ = io.WriteString(w, sampleListResponse)
}))
defer srv.Close()

c := newClientWithTimings(t, srv.URL, 500*time.Millisecond, 10*time.Millisecond, 0)
list, err := c.List(context.Background(), SortByTransactionStart)
if err != nil {
t.Fatalf("List: %v", err)
}
if got := calls.Load(); got != 2 {
t.Fatalf("calls = %d, want 2 (retried past the partial-instance failure)", got)
}
for _, inst := range list.Instances {
if inst.Error != "" {
t.Fatalf("returned list still reports unreachable instance %q (%q), want a clean retry result", inst.ID, inst.Error)
}
}
}

func TestClient_ListReturnsPartialAfterRetryBudgetExhausted(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
calls.Add(1)
w.Header().Set("Content-Type", "application/json")
_, _ = io.WriteString(w, partialInstanceListResponse)
}))
defer srv.Close()

c := newClientWithTimings(t, srv.URL, 40*time.Millisecond, 10*time.Millisecond, 0)
start := time.Now()
list, err := c.List(context.Background(), SortByTransactionStart)
elapsed := time.Since(start)
if err != nil {
t.Fatalf("List: want partial list, got error %v", err)
}
if calls.Load() < 2 {
t.Fatalf("calls = %d, want at least 2 (retried before giving up)", calls.Load())
}
if elapsed > time.Second {
t.Fatalf("List took %v, want retry budget to expire promptly", elapsed)
}
// A persistent partial failure still returns useful data: the reachable
// instances plus the per-instance error that drives the unreachable banner.
var found bool
for _, inst := range list.Instances {
if inst.ID == "primary" && inst.Error == "remote service unavailable" {
found = true
}
}
if !found {
t.Fatalf("partial list lost the instance error; instances = %+v", list.Instances)
}
}

func TestClient_ListReturnsSavedPartialWhenLaterAttemptTimesOut(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if calls.Add(1) == 1 {
w.Header().Set("Content-Type", "application/json")
_, _ = io.WriteString(w, partialInstanceListResponse)
return
}
// Hang a later attempt until the request timeout fires; respect the
// request context so server shutdown doesn't block on this handler.
select {
case <-r.Context().Done():
case <-time.After(5 * time.Second):
}
}))
defer srv.Close()

// Budget allows a retry; the per-request timeout fires inside the hung second
// attempt — the timeout-while-holding-a-saved-partial path. It must return
// the partial (matching the wait path), not a failed-refresh error.
c := newClientWithTimings(t, srv.URL, 2*time.Second, 10*time.Millisecond, 0)
c.cfg.RequestTimeout = 300 * time.Millisecond

list, err := c.List(context.Background(), SortByTransactionStart)
if err != nil {
t.Fatalf("List: want the saved partial, got error %v", err)
}
if calls.Load() < 2 {
t.Fatalf("calls = %d, want at least 2 (timed out on a later attempt)", calls.Load())
}
var found bool
for _, inst := range list.Instances {
if inst.ID == "primary" && inst.Error == "remote service unavailable" {
found = true
}
}
if !found {
t.Fatalf("expected the saved partial after the timeout; instances = %+v", list.Instances)
}
}

func TestClient_ListDoesNotRetryPartialWhenBudgetDisabled(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
calls.Add(1)
w.Header().Set("Content-Type", "application/json")
_, _ = io.WriteString(w, partialInstanceListResponse)
}))
defer srv.Close()

c := newClientWithTimings(t, srv.URL, 0, 10*time.Millisecond, 0)
list, err := c.List(context.Background(), SortByTransactionStart)
if err != nil {
t.Fatalf("List: %v", err)
}
if got := calls.Load(); got != 1 {
t.Fatalf("calls = %d, want 1 (no retry when budget disabled)", got)
}
if got := list.Instances[0].Error; got != "remote service unavailable" {
t.Fatalf("Instances[0].Error = %q, want decoded instance error", got)
}
}

func TestClient_ListRetryWaitHonorsContextCancellation(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
Expand Down Expand Up @@ -555,6 +693,36 @@ func TestClient_ListRetryWaitHonorsContextCancellation(t *testing.T) {
}
}

func TestClient_ListPartialRetryWaitHonorsContextCancellation(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
calls.Add(1)
w.Header().Set("Content-Type", "application/json")
_, _ = io.WriteString(w, partialInstanceListResponse)
}))
defer srv.Close()

ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
defer cancel()

c := newClientWithTimings(t, srv.URL, 500*time.Millisecond, 250*time.Millisecond, 0)
start := time.Now()
_, err := c.List(ctx, SortByTransactionStart)
elapsed := time.Since(start)
if err == nil {
t.Fatal("List: want context error, got nil")
}
if !errors.Is(err, context.DeadlineExceeded) {
t.Fatalf("err = %v, want errors.Is(err, context.DeadlineExceeded)", err)
}
if got := calls.Load(); got != 1 {
t.Fatalf("calls = %d, want 1 because context expired during partial retry wait", got)
}
if elapsed > time.Second {
t.Fatalf("List took %v, want context cancellation to interrupt partial retry wait", elapsed)
}
}

func TestClient_ListHonorsRetryAfterHeader(t *testing.T) {
var calls atomic.Int32
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
Expand Down
12 changes: 12 additions & 0 deletions internal/connections/connection_list.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,18 @@ type ConnectionList struct {
Topology *Topology `json:"topology,omitempty"`
}

// HasUnreachableInstance reports whether the server flagged any instance as
// unreachable in the list response. These partial fan-out failures drive the
// "N of M instances unreachable" banner.
func (l ConnectionList) HasUnreachableInstance() bool {
for _, inst := range l.Instances {
if inst.Error != "" {
return true
}
}
return false
}

type Connection struct {
PID int `json:"pid"`
Instance string `json:"instance"`
Expand Down
4 changes: 3 additions & 1 deletion internal/connections/tui/detail.go
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,9 @@ func blockerTreeLabel(row blockerRow, selected bool) string {
// consistent across views.
return selectedRowStyle.Render("▶ " + label)
}
return label
// Reserve the caret's width on unselected rows so moving the selection
// doesn't shift the row text, matching the table view's fixed-width marker.
return " " + label
}

func wrapLines(text string, width int) []string {
Expand Down
17 changes: 17 additions & 0 deletions internal/connections/tui/detail_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -688,3 +688,20 @@ func TestDetailHelpHidesRefreshWhilePaused(t *testing.T) {
c.Assert(footer, qt.Not(qt.Contains), "r refresh")
c.Assert(footer, qt.Contains, "space resume")
}

func TestBlockerTreeLabelReservesCaretWidth(t *testing.T) {
c := qt.New(t)
row := blockerRow{PID: 123, Present: false}

// The selection caret ("▶ ") must not shift the row text: an unselected
// row reserves the caret's width so the label sits at the same column
// whether or not it is selected (matching the table view's fixed marker).
unselected := blockerTreeLabel(row, false)
c.Assert(unselected, qt.Equals, " "+blockerLabel(row))

// Selected rows use the caret + a space — the same two display columns as
// the unselected padding, so the label text never shifts between states.
selectedInner := ansi.Strip(blockerTreeLabel(row, true))
c.Assert(strings.HasPrefix(selectedInner, "▶ "), qt.IsTrue)
c.Assert(strings.TrimPrefix(selectedInner, "▶ "), qt.Equals, strings.TrimPrefix(unselected, " "))
}
9 changes: 9 additions & 0 deletions internal/connections/tui/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,15 @@ func (m Model) Update(msg tea.Msg) (tea.Model, tea.Cmd) {
m.consecutiveErrors++
return m, nil
}
// A degraded result reached us after the client exhausted its retries on
// a partial-instance failure. Don't swap it into the live view: hold the
// last good frame and let the staleness dot and captured-age surface it,
// rather than flashing the "N unreachable" banner over good data. First
// load has no good frame to hold, so fall through and show what we have.
if m.hasList && msg.list.HasUnreachableInstance() {
m.consecutiveErrors++
return m, nil
}
m.consecutiveErrors = 0
m.initialAccessDenied = false
cursor := m.samples.Push(msg.list)
Expand Down
43 changes: 43 additions & 0 deletions internal/connections/tui/model_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,49 @@ func TestModelKeepsLastListWhenRefreshFails(t *testing.T) {
c.Assert(view, qt.Contains, "error: connection refused")
}

func degradedList(captured time.Time, pid int) live.ConnectionList {
list := live.NewConnectionList(captured, []live.Connection{{PID: pid, Instance: "primary"}}, live.SortByTransactionStart)
list.Instances = []live.InstanceMeta{
{ID: "primary", Role: "primary", Error: "remote service unavailable"},
{ID: "replica-1", Role: "replica"},
}
return list
}

func TestModelHoldsLastGoodListOnPersistentPartial(t *testing.T) {
c := qt.New(t)
model := NewModel(context.Background(), &clientStub{}, time.Second, 0)
good := live.NewConnectionList(time.Now(), []live.Connection{{PID: 10, Instance: "primary"}}, live.SortByTransactionStart)

updated, _ := model.Update(listMsg{list: good})
// A degraded refresh (client retries already exhausted) must not replace the
// good frame or show the unreachable banner — it holds and goes stale.
updated, _ = updated.(Model).Update(listMsg{list: degradedList(time.Now(), 20)})
got := updated.(Model)

// Held the last good frame (PID 10); the degraded frame (PID 20) was not
// swapped in. Assert on the held data, not the rendered string — the header
// renders a wall-clock timestamp that can incidentally contain a PID.
held := got.currentList().Connections
c.Assert(len(held), qt.Equals, 1)
c.Assert(held[0].PID, qt.Equals, 10)
c.Assert(got.View(), qt.Not(qt.Contains), "unreachable")
c.Assert(got.consecutiveErrors, qt.Equals, 1)
}

func TestModelShowsPartialOnFirstLoadWithNoPriorFrame(t *testing.T) {
c := qt.New(t)
model := NewModel(context.Background(), &clientStub{}, time.Second, 0)

// No good frame to hold yet, so the first result is shown even if degraded.
updated, _ := model.Update(listMsg{list: degradedList(time.Now(), 30)})
got := updated.(Model)

// No prior frame to hold, so the degraded result is shown (banner and all).
c.Assert(got.currentList().Connections[0].PID, qt.Equals, 30)
c.Assert(got.View(), qt.Contains, "unreachable")
}

func TestModelShowsInitialListErrorBeforeAnySuccessfulList(t *testing.T) {
c := qt.New(t)
model := NewModel(context.Background(), &clientStub{}, time.Second, 0)
Expand Down
Loading