Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pkg/monitortests/node/watchnodes/monitortest.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,16 +109,17 @@ func unreachableNodeTaint(finalIntervals monitorapi.Intervals) []*junitapi.JUnit
return tests
}

func intervalStartDuring(needle monitorapi.Interval, haystack monitorapi.Intervals) bool {
func intervalStartDuring(needle monitorapi.Interval, haystack monitorapi.Intervals, grace time.Duration) bool {
if len(haystack) == 0 {
// If there are no intervals to compare against, the needle cannot have
// started during any of them, so callers treat the unexpected event as significant.
return false
}
for _, curr := range haystack {
needleStartEqualOrAfterFrom := needle.From.Equal(curr.From) || needle.From.After(curr.From)
needleStartEqualOrBeforeTo := needle.From.Equal(curr.To) || needle.From.Before(curr.To)
if needleStartEqualOrAfterFrom || needleStartEqualOrBeforeTo {
effectiveTo := curr.To.Add(grace)
needleStartEqualOrBeforeTo := needle.From.Equal(effectiveTo) || needle.From.Before(effectiveTo)
if needleStartEqualOrAfterFrom && needleStartEqualOrBeforeTo {
return true
}
}
Expand Down
11 changes: 7 additions & 4 deletions pkg/monitortests/node/watchnodes/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,15 @@ func reportUnexpectedNodeDownFailures(intervals monitorapi.Intervals, targetedRe

machineDeletingIntervals := machineNameToDeletePhases[machineNameForNode]

if !intervalStartDuring(unexpectedNodeUnready, machineDeletingIntervals) {
// Skip NotReady events caused by NetworkPluginNotReady during network operator rollout
// NetworkPluginNotReady is a RuntimeStatus reported by cri-o and exposed by kubelet in the condition's message.
if !intervalStartDuring(unexpectedNodeUnready, machineDeletingIntervals, 0) {
// Skip NotReady events caused by NetworkPluginNotReady during network operator rollout.
// NetworkPluginNotReady is a RuntimeStatus reported by cri-o and exposed by kubelet in
// the condition's message. A 30s grace period is applied beyond the end of the
// networkProgressingInterval because kubelet may take a few seconds to observe that CNI
// is ready again after the network operator finishes progressing.
conditionMsg := unexpectedNodeUnready.Message.Annotations[monitorapi.AnnotationCause]
if strings.Contains(conditionMsg, "NetworkPluginNotReady") {
if intervalStartDuring(unexpectedNodeUnready, networkProgressingIntervals) {
if intervalStartDuring(unexpectedNodeUnready, networkProgressingIntervals, 30*time.Second) {
continue
}
}
Expand Down
253 changes: 253 additions & 0 deletions pkg/monitortests/node/watchnodes/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,106 @@ func TestReportUnexpectedNodeDownFailures(t *testing.T) {
expected: []string{},
unexpectedReason: monitorapi.NodeUnexpectedReadyReason,
},
{
name: "node unexpected ready caused by NetworkPluginNotReady within 30s grace after network operator rollout ends",
rawIntervals: monitorapi.Intervals{
// NotReady event starts 15s after the networkProgressingInterval ends — within the 30s grace window.
{
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeNode,
Keys: map[monitorapi.LocatorKey]string{
"node": "node1",
},
},
Message: monitorapi.Message{
Reason: monitorapi.NodeUnexpectedReadyReason,
HumanMessage: "unexpected node not ready",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationReason: "UnexpectedNotReady",
monitorapi.AnnotationCause: "container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:47:15", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:47:15", 2024),
},
// networkProgressingInterval ended at 19:47:00 — 15s before the NotReady event.
{
Condition: monitorapi.Condition{
Level: monitorapi.Warning,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeClusterOperator,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorClusterOperatorKey: "network",
},
},
Message: monitorapi.Message{
HumanMessage: "Progressing because OVN is updating",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationCondition: "Progressing",
monitorapi.AnnotationStatus: "True",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024),
},
},
expected: []string{},
unexpectedReason: monitorapi.NodeUnexpectedReadyReason,
},
{
name: "node unexpected ready caused by NetworkPluginNotReady beyond 30s grace after network operator rollout ends",
rawIntervals: monitorapi.Intervals{
// NotReady event starts 31s after the networkProgressingInterval ends — beyond the 30s grace window.
{
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeNode,
Keys: map[monitorapi.LocatorKey]string{
"node": "node1",
},
},
Message: monitorapi.Message{
Reason: monitorapi.NodeUnexpectedReadyReason,
HumanMessage: "unexpected node not ready",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationReason: "UnexpectedNotReady",
monitorapi.AnnotationCause: "container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:47:31", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:47:31", 2024),
},
// networkProgressingInterval ended at 19:47:00 — 31s before the NotReady event.
{
Condition: monitorapi.Condition{
Level: monitorapi.Warning,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeClusterOperator,
Keys: map[monitorapi.LocatorKey]string{
monitorapi.LocatorClusterOperatorKey: "network",
},
},
Message: monitorapi.Message{
HumanMessage: "Progressing because OVN is updating",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationCondition: "Progressing",
monitorapi.AnnotationStatus: "True",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024),
},
},
expected: []string{"node/node1 - cause/container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: no CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started? reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:47:31 +0000 UTC - to: 2024-11-11 19:47:31 +0000 UTC"},
unexpectedReason: monitorapi.NodeUnexpectedReadyReason,
},
{
name: "node unexpected ready caused by NetworkPluginNotReady WITHOUT network operator rollout",
rawIntervals: monitorapi.Intervals{
Expand Down Expand Up @@ -447,6 +547,159 @@ func TestReportUnexpectedNodeDownFailures(t *testing.T) {
expected: []string{"node/node1 - cause/kubelet stopped posting node status reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:46:00 +0000 UTC - to: 2024-11-11 19:46:00 +0000 UTC"},
unexpectedReason: monitorapi.NodeUnexpectedReadyReason,
},
{
// Test case for bug fix: node failure AFTER machine deletion should NOT be suppressed
name: "node unexpected ready reason after machine deletion completes",
rawIntervals: monitorapi.Intervals{
{
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeNode,
Keys: map[monitorapi.LocatorKey]string{
"node": "node1",
},
},
Message: monitorapi.Message{
Reason: monitorapi.NodeUnexpectedReadyReason,
HumanMessage: "unexpected node not ready",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationReason: "UnexpectedNotReady",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:50:00", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:50:00", 2024),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeMachine,
Keys: map[monitorapi.LocatorKey]string{
"machine": "machine1",
},
},
Message: monitorapi.Message{
Reason: monitorapi.MachinePhase,
HumanMessage: "Machine is in deleted",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "machine-lifecycle-constructor",
monitorapi.AnnotationNode: "node1",
monitorapi.AnnotationPhase: "Deleting",
monitorapi.AnnotationReason: "MachinePhase",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024),
},
},
expected: []string{"node/node1 - reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:50:00 +0000 UTC - to: 2024-11-11 19:50:00 +0000 UTC"},
unexpectedReason: monitorapi.NodeUnexpectedReadyReason,
},
{
// Test case for bug fix: node failure BEFORE machine deletion should NOT be suppressed
name: "node unexpected ready reason before machine deletion starts",
rawIntervals: monitorapi.Intervals{
{
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeNode,
Keys: map[monitorapi.LocatorKey]string{
"node": "node1",
},
},
Message: monitorapi.Message{
Reason: monitorapi.NodeUnexpectedReadyReason,
HumanMessage: "unexpected node not ready",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationReason: "UnexpectedNotReady",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:40:00", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:40:00", 2024),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeMachine,
Keys: map[monitorapi.LocatorKey]string{
"machine": "machine1",
},
},
Message: monitorapi.Message{
Reason: monitorapi.MachinePhase,
HumanMessage: "Machine is in deleted",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "machine-lifecycle-constructor",
monitorapi.AnnotationNode: "node1",
monitorapi.AnnotationPhase: "Deleting",
monitorapi.AnnotationReason: "MachinePhase",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024),
},
},
expected: []string{"node/node1 - reason/UnexpectedNotReady unexpected node not ready at from: 2024-11-11 19:40:00 +0000 UTC - to: 2024-11-11 19:40:00 +0000 UTC"},
unexpectedReason: monitorapi.NodeUnexpectedReadyReason,
},
{
// Test case for bug fix: node failure AFTER machine deletion should NOT be suppressed (unreachable variant)
name: "node unexpected unreachable reason after machine deletion completes",
rawIntervals: monitorapi.Intervals{
{
Condition: monitorapi.Condition{
Level: monitorapi.Error,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeNode,
Keys: map[monitorapi.LocatorKey]string{
"node": "node1",
},
},
Message: monitorapi.Message{
Reason: monitorapi.NodeUnexpectedUnreachableReason,
HumanMessage: "unexpected node unreachable",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationReason: "UnexpectedUnreachable",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:50:00", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:50:00", 2024),
},
{
Condition: monitorapi.Condition{
Level: monitorapi.Info,
Locator: monitorapi.Locator{
Type: monitorapi.LocatorTypeMachine,
Keys: map[monitorapi.LocatorKey]string{
"machine": "machine1",
},
},
Message: monitorapi.Message{
Reason: monitorapi.MachinePhase,
HumanMessage: "Machine is in deleted",
Annotations: map[monitorapi.AnnotationKey]string{
monitorapi.AnnotationConstructed: "machine-lifecycle-constructor",
monitorapi.AnnotationNode: "node1",
monitorapi.AnnotationPhase: "Deleting",
monitorapi.AnnotationReason: "MachinePhase",
},
},
},
From: utility.SystemdJournalLogTime("Nov 11 19:45:00", 2024),
To: utility.SystemdJournalLogTime("Nov 11 19:47:00", 2024),
},
},
expected: []string{"node/node1 - reason/UnexpectedUnreachable unexpected node unreachable at from: 2024-11-11 19:50:00 +0000 UTC - to: 2024-11-11 19:50:00 +0000 UTC"},
unexpectedReason: monitorapi.NodeUnexpectedUnreachableReason,
},
}

for _, tc := range testCases {
Expand Down