diff --git a/pkg/monitortests/node/watchnodes/node.go b/pkg/monitortests/node/watchnodes/node.go index 60c1f79e197b..1513163dc2dd 100644 --- a/pkg/monitortests/node/watchnodes/node.go +++ b/pkg/monitortests/node/watchnodes/node.go @@ -154,7 +154,7 @@ func startNodeMonitoring(ctx context.Context, m monitorapi.RecorderWriter, clien // We want to fail the monitor test if a node goes not ready // if it is unexpected. // Unexpected in this case means that it went not ready outside - // of a MCO config update. + // of a MCO config update or CNI rollout. func(node, oldNode *corev1.Node) []monitorapi.Interval { var intervals []monitorapi.Interval @@ -167,11 +167,16 @@ func startNodeMonitoring(ctx context.Context, m monitorapi.RecorderWriter, clien now := time.Now() if isOldNodeReady && !isNewNodeReady && isConfigTheSame && !isNodeUnscheduable { + msg := monitorapi.NewMessage().Reason(monitorapi.NodeUnexpectedReadyReason). + HumanMessage("unexpected node not ready") + // Extract the NodeReady condition message for downstream filtering + if c := findNodeCondition(node.Status.Conditions, corev1.NodeReady, 0); c != nil && c.Message != "" { + msg = msg.WithAnnotation(monitorapi.AnnotationCause, c.Message) + } intervals = append(intervals, monitorapi.NewInterval(monitorapi.SourceUnexpectedReady, monitorapi.Error). Locator(monitorapi.NewLocator().NodeFromName(node.Name)). - Message(monitorapi.NewMessage().Reason(monitorapi.NodeUnexpectedReadyReason). - HumanMessage("unexpected node not ready")). + Message(msg). Display(). Build(now, now)) } @@ -370,6 +375,13 @@ func reportUnexpectedNodeDownFailures(intervals monitorapi.Intervals, targetedRe return false }) + // Get all network ClusterOperator Progressing=True intervals + networkProgressingIntervals := intervals.Filter(func(eventInterval monitorapi.Interval) bool { + return eventInterval.Locator.Keys[monitorapi.LocatorClusterOperatorKey] == "network" && + eventInterval.Message.Annotations[monitorapi.AnnotationCondition] == "Progressing" && + eventInterval.Message.Annotations[monitorapi.AnnotationStatus] == "True" + }) + // We need to build a map of node to machine name nodeNameToMachineName := map[string]string{} // Given the deleted machine, store the deleted intervals. @@ -394,6 +406,15 @@ func reportUnexpectedNodeDownFailures(intervals monitorapi.Intervals, targetedRe machineDeletingIntervals := machineNameToDeletePhases[machineNameForNode] if !intervalStartDuring(unexpectedNodeUnready, machineDeletingIntervals) { + // Skip NotReady events caused by NetworkPluginNotReady during network operator rollout + // NetworkPluginNotReady is a RuntimeStatus reported by cri-o and exposed by kubelet in the condition's message. + conditionMsg := unexpectedNodeUnready.Message.Annotations[monitorapi.AnnotationCause] + if strings.Contains(conditionMsg, "NetworkPluginNotReady") { + if intervalStartDuring(unexpectedNodeUnready, networkProgressingIntervals) { + continue + } + } + failures = append(failures, fmt.Sprintf("%v - %v at from: %v - to: %v", unexpectedNodeUnready.Locator.OldLocator(), unexpectedNodeUnready.Message.OldMessage(), unexpectedNodeUnready.From, unexpectedNodeUnready.To)) } } diff --git a/pkg/monitortests/node/watchnodes/node_test.go b/pkg/monitortests/node/watchnodes/node_test.go index 87f294204075..f0bfa876b9d2 100644 --- a/pkg/monitortests/node/watchnodes/node_test.go +++ b/pkg/monitortests/node/watchnodes/node_test.go @@ -321,6 +321,132 @@ func TestReportUnexpectedNodeDownFailures(t *testing.T) { expected: []string{}, unexpectedReason: monitorapi.NodeUnexpectedUnreachableReason, }, + { + name: "node unexpected ready caused by NetworkPluginNotReady during network operator rollout", + rawIntervals: monitorapi.Intervals{ + // The UnexpectedNotReady interval with NetworkPluginNotReady in AnnotationCause + { + Condition: monitorapi.Condition{ + Level: monitorapi.Error, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeNode, + Keys: map[monitorapi.LocatorKey]string{ + "node": "node1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.NodeUnexpectedReadyReason, + HumanMessage: "unexpected node not ready", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationReason: "UnexpectedNotReady", + monitorapi.AnnotationCause: "container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:46:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:46:00", 2024), + }, + // The network CO Progressing=True interval that brackets it + { + Condition: monitorapi.Condition{ + Level: monitorapi.Warning, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeClusterOperator, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorClusterOperatorKey: "network", + }, + }, + Message: monitorapi.Message{ + HumanMessage: "Progressing because OVN is updating", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationCondition: "Progressing", + monitorapi.AnnotationStatus: "True", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024), + }, + }, + expected: []string{}, + unexpectedReason: monitorapi.NodeUnexpectedReadyReason, + }, + { + name: "node unexpected ready caused by NetworkPluginNotReady WITHOUT network operator rollout", + rawIntervals: monitorapi.Intervals{ + { + Condition: monitorapi.Condition{ + Level: monitorapi.Error, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeNode, + Keys: map[monitorapi.LocatorKey]string{ + "node": "node1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.NodeUnexpectedReadyReason, + HumanMessage: "unexpected node not ready", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationReason: "UnexpectedNotReady", + monitorapi.AnnotationCause: "container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:46:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:46:00", 2024), + }, + }, + expected: []string{"node/node1 - cause/container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started? reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:46:00 +0000 UTC - to: 2024-11-11 19:46:00 +0000 UTC"}, + unexpectedReason: monitorapi.NodeUnexpectedReadyReason, + }, + { + name: "node unexpected ready NOT caused by NetworkPluginNotReady during network operator rollout", + rawIntervals: monitorapi.Intervals{ + { + Condition: monitorapi.Condition{ + Level: monitorapi.Error, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeNode, + Keys: map[monitorapi.LocatorKey]string{ + "node": "node1", + }, + }, + Message: monitorapi.Message{ + Reason: monitorapi.NodeUnexpectedReadyReason, + HumanMessage: "unexpected node not ready", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationReason: "UnexpectedNotReady", + monitorapi.AnnotationCause: "kubelet stopped posting node status", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:46:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:46:00", 2024), + }, + { + Condition: monitorapi.Condition{ + Level: monitorapi.Warning, + Locator: monitorapi.Locator{ + Type: monitorapi.LocatorTypeClusterOperator, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorClusterOperatorKey: "network", + }, + }, + Message: monitorapi.Message{ + HumanMessage: "Progressing because OVN is updating", + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationCondition: "Progressing", + monitorapi.AnnotationStatus: "True", + }, + }, + }, + From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024), + To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024), + }, + }, + expected: []string{"node/node1 - cause/kubelet stopped posting node status reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:46:00 +0000 UTC - to: 2024-11-11 19:46:00 +0000 UTC"}, + unexpectedReason: monitorapi.NodeUnexpectedReadyReason, + }, } for _, tc := range testCases {