diff --git a/pkg/monitortests/node/watchnodes/monitortest.go b/pkg/monitortests/node/watchnodes/monitortest.go index 0bf8d861221b..a09350241851 100644 --- a/pkg/monitortests/node/watchnodes/monitortest.go +++ b/pkg/monitortests/node/watchnodes/monitortest.go @@ -109,7 +109,7 @@ func unreachableNodeTaint(finalIntervals monitorapi.Intervals) []*junitapi.JUnit return tests } -func intervalStartDuring(needle monitorapi.Interval, haystack monitorapi.Intervals) bool { +func intervalStartDuring(needle monitorapi.Interval, haystack monitorapi.Intervals, grace time.Duration) bool { if len(haystack) == 0 { // If there are no deleted intervals // we can assume that the unexpected event is significant. @@ -117,8 +117,9 @@ func intervalStartDuring(needle monitorapi.Interval, haystack monitorapi.Interva } for _, curr := range haystack { needleStartEqualOrAfterFrom := needle.From.Equal(curr.From) || needle.From.After(curr.From) - needleStartEqualOrBeforeTo := needle.From.Equal(curr.To) || needle.From.Before(curr.To) - if needleStartEqualOrAfterFrom || needleStartEqualOrBeforeTo { + effectiveTo := curr.To.Add(grace) + needleStartEqualOrBeforeTo := needle.From.Equal(effectiveTo) || needle.From.Before(effectiveTo) + if needleStartEqualOrAfterFrom && needleStartEqualOrBeforeTo { return true } } diff --git a/pkg/monitortests/node/watchnodes/node.go b/pkg/monitortests/node/watchnodes/node.go index 1513163dc2dd..41a6e3027945 100644 --- a/pkg/monitortests/node/watchnodes/node.go +++ b/pkg/monitortests/node/watchnodes/node.go @@ -405,12 +405,15 @@ func reportUnexpectedNodeDownFailures(intervals monitorapi.Intervals, targetedRe machineDeletingIntervals := machineNameToDeletePhases[machineNameForNode] - if !intervalStartDuring(unexpectedNodeUnready, machineDeletingIntervals) { - // Skip NotReady events caused by NetworkPluginNotReady during network operator rollout - // NetworkPluginNotReady is a RuntimeStatus reported by cri-o and exposed by kubelet in the condition's message. + if !intervalStartDuring(unexpectedNodeUnready, machineDeletingIntervals, 0) { + // Skip NotReady events caused by NetworkPluginNotReady during network operator rollout. + // NetworkPluginNotReady is a RuntimeStatus reported by cri-o and exposed by kubelet in + // the condition's message. A 30s grace period is applied beyond the end of the + // networkProgressingInterval because kubelet may take a few seconds to observe that CNI + // is ready again after the network operator finishes progressing. conditionMsg := unexpectedNodeUnready.Message.Annotations[monitorapi.AnnotationCause] if strings.Contains(conditionMsg, "NetworkPluginNotReady") { - if intervalStartDuring(unexpectedNodeUnready, networkProgressingIntervals) { + if intervalStartDuring(unexpectedNodeUnready, networkProgressingIntervals, 30*time.Second) { continue } } diff --git a/pkg/monitortests/node/watchnodes/node_test.go b/pkg/monitortests/node/watchnodes/node_test.go index f0bfa876b9d2..8068932d71e0 100644 --- a/pkg/monitortests/node/watchnodes/node_test.go +++ b/pkg/monitortests/node/watchnodes/node_test.go @@ -371,6 +371,106 @@ func TestReportUnexpectedNodeDownFailures(t *testing.T) { expected: []string{}, unexpectedReason: monitorapi.NodeUnexpectedReadyReason, }, + { + name: "node unexpected ready caused by NetworkPluginNotReady within 30s grace after network operator rollout ends", + rawIntervals: monitorapi.Intervals{ + // NotReady event starts 15s after the networkProgressingInterval ends — within the 30s grace window. + { + Condition: monitorapi.Condition{ + Level: monitorapi.Error, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeNode, + Keys: map[monitorapi.LocatorKey]string{ + "node": "node1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.NodeUnexpectedReadyReason, + HumanMessage: "unexpected node not ready", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationReason: "UnexpectedNotReady", + monitorapi.AnnotationCause: "container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:47:15", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:15", 2024), + }, + // networkProgressingInterval ended at 19:47:00 — 15s before the NotReady event. + { + Condition: monitorapi.Condition{ + Level: monitorapi.Warning, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeClusterOperator, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorClusterOperatorKey: "network", + }, + }, + Message: monitorapi.Message{ + HumanMessage: "Progressing because OVN is updating", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationCondition: "Progressing", + monitorapi.AnnotationStatus: "True", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024), + }, + }, + expected: []string{}, + unexpectedReason: monitorapi.NodeUnexpectedReadyReason, + }, + { + name: "node unexpected ready caused by NetworkPluginNotReady beyond 30s grace after network operator rollout ends", + rawIntervals: monitorapi.Intervals{ + // NotReady event starts 31s after the networkProgressingInterval ends — beyond the 30s grace window. + { + Condition: monitorapi.Condition{ + Level: monitorapi.Error, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeNode, + Keys: map[monitorapi.LocatorKey]string{ + "node": "node1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.NodeUnexpectedReadyReason, + HumanMessage: "unexpected node not ready", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationReason: "UnexpectedNotReady", + monitorapi.AnnotationCause: "container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:47:31", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:31", 2024), + }, + // networkProgressingInterval ended at 19:47:00 — 31s before the NotReady event. + { + Condition: monitorapi.Condition{ + Level: monitorapi.Warning, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeClusterOperator, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorClusterOperatorKey: "network", + }, + }, + Message: monitorapi.Message{ + HumanMessage: "Progressing because OVN is updating", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationCondition: "Progressing", + monitorapi.AnnotationStatus: "True", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024), + }, + }, + expected: []string{"node/node1 - cause/container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started? reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:47:31 +0000 UTC - to: 2024-11-11 19:47:31 +0000 UTC"}, + unexpectedReason: monitorapi.NodeUnexpectedReadyReason, + }, { name: "node unexpected ready caused by NetworkPluginNotReady WITHOUT network operator rollout", rawIntervals: monitorapi.Intervals{ @@ -447,6 +547,159 @@ func TestReportUnexpectedNodeDownFailures(t *testing.T) { expected: []string{"node/node1 - cause/kubelet stopped posting node status reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:46:00 +0000 UTC - to: 2024-11-11 19:46:00 +0000 UTC"}, unexpectedReason: monitorapi.NodeUnexpectedReadyReason, }, + { + // Test case for bug fix: node failure AFTER machine deletion should NOT be suppressed + name: "node unexpected ready reason after machine deletion completes", + rawIntervals: monitorapi.Intervals{ + { + Condition: monitorapi.Condition{ + Level: monitorapi.Error, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeNode, + Keys: map[monitorapi.LocatorKey]string{ + "node": "node1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.NodeUnexpectedReadyReason, + HumanMessage: "unexpected node not ready", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationReason: "UnexpectedNotReady", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:50:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:50:00", 2024), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeMachine, + Keys: map[monitorapi.LocatorKey]string{ + "machine": "machine1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.MachinePhase, + HumanMessage: "Machine is in deleted", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationConstructed: "machine-lifecycle-constructor", + monitorapi.AnnotationNode: "node1", + monitorapi.AnnotationPhase: "Deleting", + monitorapi.AnnotationReason: "MachinePhase", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024), + }, + }, + expected: []string{"node/node1 - reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:50:00 +0000 UTC - to: 2024-11-11 19:50:00 +0000 UTC"}, + unexpectedReason: monitorapi.NodeUnexpectedReadyReason, + }, + { + // Test case for bug fix: node failure BEFORE machine deletion should NOT be suppressed + name: "node unexpected ready reason before machine deletion starts", + rawIntervals: monitorapi.Intervals{ + { + Condition: monitorapi.Condition{ + Level: monitorapi.Error, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeNode, + Keys: map[monitorapi.LocatorKey]string{ + "node": "node1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.NodeUnexpectedReadyReason, + HumanMessage: "unexpected node not ready", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationReason: "UnexpectedNotReady", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:40:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:40:00", 2024), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeMachine, + Keys: map[monitorapi.LocatorKey]string{ + "machine": "machine1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.MachinePhase, + HumanMessage: "Machine is in deleted", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationConstructed: "machine-lifecycle-constructor", + monitorapi.AnnotationNode: "node1", + monitorapi.AnnotationPhase: "Deleting", + monitorapi.AnnotationReason: "MachinePhase", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024), + }, + }, + expected: []string{"node/node1 - reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:40:00 +0000 UTC - to: 2024-11-11 19:40:00 +0000 UTC"}, + unexpectedReason: monitorapi.NodeUnexpectedReadyReason, + }, + { + // Test case for bug fix: node failure AFTER machine deletion should NOT be suppressed (unreachable variant) + name: "node unexpected unreachable reason after machine deletion completes", + rawIntervals: monitorapi.Intervals{ + { + Condition: monitorapi.Condition{ + Level: monitorapi.Error, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeNode, + Keys: map[monitorapi.LocatorKey]string{ + "node": "node1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.NodeUnexpectedUnreachableReason, + HumanMessage: "unexpected node unreachable", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationReason: "UnexpectedUnreachable", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:50:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:50:00", 2024), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Info, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeMachine, + Keys: map[monitorapi.LocatorKey]string{ + "machine": "machine1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.MachinePhase, + HumanMessage: "Machine is in deleted", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationConstructed: "machine-lifecycle-constructor", + monitorapi.AnnotationNode: "node1", + monitorapi.AnnotationPhase: "Deleting", + monitorapi.AnnotationReason: "MachinePhase", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024), + }, + }, + expected: []string{"node/node1 - reason/UnexpectedUnreachable unexpected node unreachable at from: 2024-11-11 19:50:00 +0000 UTC - to: 2024-11-11 19:50:00 +0000 UTC"}, + unexpectedReason: monitorapi.NodeUnexpectedUnreachableReason, + }, } for _, tc := range testCases {