Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion aci-preupgrade-validation-script.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from textwrap import TextWrapper
from getpass import getpass
from collections import defaultdict, OrderedDict
from datetime import datetime
from datetime import datetime, timedelta
from argparse import ArgumentParser
from itertools import chain
import threading
Expand Down Expand Up @@ -6410,6 +6410,40 @@ def svccore_excessive_data_check(**kwargs):
return Result(result=ERROR, msg="Error occurred while fetching svccore object counts: {}".format(str(e)), doc_url=doc_url)


def _epg_summary_task_start_utc(start_ts):
    """Convert a dbgacEpgSummaryTask ``startTs`` string to a naive UTC datetime.

    The timestamp is expected to look like ``2024-01-01T00:00:00.000+00:00``.
    The first 19 characters (date and time to the second) are parsed, and a
    trailing ``+HH:MM`` / ``-HH:MM`` offset, when present, is removed so that
    the result is comparable with ``datetime.utcnow()``. Fractional seconds
    are ignored. A timestamp without a recognisable offset is returned as
    parsed, i.e. it is treated as already being UTC.

    Args:
        start_ts: the raw ``startTs`` attribute value.

    Returns:
        A naive ``datetime`` expressed in UTC.

    Raises:
        ValueError: if the leading date/time portion cannot be parsed.
    """
    task_dt = datetime.strptime(start_ts[:19], "%Y-%m-%dT%H:%M:%S")
    offset = start_ts[-6:]
    # 19 chars of date/time + 6 chars of offset: anything shorter cannot carry
    # an offset that is separate from the date/time portion.
    has_offset = (
        len(start_ts) >= 25
        and offset[0] in "+-"
        and offset[3] == ":"
        and offset[1:3].isdigit()
        and offset[4:6].isdigit()
    )
    if has_offset:
        delta = timedelta(hours=int(offset[1:3]), minutes=int(offset[4:6]))
        # Local time = UTC + offset, therefore UTC = local time - offset.
        if offset[0] == "+":
            task_dt = task_dt - delta
        else:
            task_dt = task_dt + delta
    return task_dt


@check_wrapper(check_title="Stale dbgacEpgSummaryTask Objects")
def stale_epg_summary_task_check(tversion, **kwargs):
    """Flag dbgacEpgSummaryTask objects stuck in ``processing`` for over 24 hours.

    Args:
        tversion: target version object, or a falsy value when not provided.
        **kwargs: unused; accepted so the check can be called like its siblings.

    Returns:
        Result with:
          * MANUAL when the target version is missing,
          * NA when the target version is outside the affected ranges
            (6.1 train up to and including 6.1(5e), 6.2 train up to and
            including 6.2(1g)),
          * FAIL_O with one ``[DN, Start Time]`` row per stale task,
          * PASS otherwise.
    """
    result = PASS
    headers = ["DN", "Start Time"]
    data = []
    recommended_action = "Delete the listed stale dbgacEpgSummaryTask objects to prevent policymgr crash."
    doc_url = "https://datacenter.github.io/ACI-Pre-Upgrade-Validation-Script/validations/#stale-dbgacepgsummarytask-objects"

    if not tversion:
        return Result(result=MANUAL, msg=TVER_MISSING)

    version_affected = (
        (tversion.major1 == "6" and tversion.major2 == "1" and (tversion.older_than("6.1(5e)") or tversion.same_as("6.1(5e)")))
        or (tversion.major1 == "6" and tversion.major2 == "2" and (tversion.older_than("6.2(1g)") or tversion.same_as("6.2(1g)")))
    )
    if not version_affected:
        return Result(result=NA, msg=VER_NOT_AFFECTED)

    # A task is stale only when it started strictly more than 24 hours ago.
    threshold = datetime.utcnow() - timedelta(hours=24)
    for obj in icurl("class", 'dbgacEpgSummaryTask.json?query-target-filter=eq(dbgacEpgSummaryTask.operSt,"processing")'):
        attr = obj["dbgacEpgSummaryTask"]["attributes"]
        dn = attr.get("dn", "")
        start_ts = attr.get("startTs", "")
        try:
            # Normalise to UTC first: comparing the local wall-clock time to
            # utcnow() would be off by the node's UTC offset.
            task_dt = _epg_summary_task_start_utc(start_ts)
        except ValueError:
            # Best effort: a task without a parseable start time cannot be aged.
            continue
        if task_dt < threshold:
            # NOTE(review): a reviewer asked for node_id in the output. Per the
            # author's reply it is not available in the object's attributes or
            # DN; the DN is unique enough to identify and delete the object.
            data.append([dn, start_ts])

    if data:
        result = FAIL_O
    return Result(result=result, headers=headers, data=data, recommended_action=recommended_action, doc_url=doc_url)

# ---- Script Execution ----


Expand Down Expand Up @@ -6581,6 +6615,7 @@ class CheckManager:
rogue_ep_coop_exception_mac_check,
n9k_c9408_model_lem_count_check,
inband_management_policy_misconfig_check,
stale_epg_summary_task_check,
]
ssh_checks = [
# General
Expand Down
16 changes: 14 additions & 2 deletions docs/docs/validations.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ Items | Defect | This Script
[N9K-C9408 with more than 5 N9K-X9400-16W LEMs][d31] | CSCws82819 | :white_check_mark: | :no_entry_sign:
[Multi-Pod Modular Spine Bootscript File][d32] | CSCwr66848 | :white_check_mark: | :no_entry_sign:
[Inband Management Policy Misconfiguration][d33]| CSCwd40071 | :white_check_mark: | :no_entry_sign:
[Stale dbgacEpgSummaryTask Objects][d34] | CSCwt69100 | :white_check_mark: | :no_entry_sign:

[d1]: #ep-announce-compatibility
[d2]: #eventmgr-db-size-defect-susceptibility
Expand Down Expand Up @@ -237,6 +238,7 @@ Items | Defect | This Script
[d31]: #n9k-c9408-with-more-than-5-n9k-x9400-16w-lems
[d32]: #multi-pod-modular-spine-bootscript-file
[d33]: #inband-management-policy-misconfiguration
[d34]: #stale-dbgacepgsummarytask-objects

## General Check Details

Expand Down Expand Up @@ -2792,13 +2794,22 @@ Due to excessive `svccoreCtrlr` or `svccoreNode` managed objects, Apic gui stuck

The svccoreCtrlr and svccoreNode objects represent core files related to Apic and Leaf/Spines process respectively.

Due to [CSCws84232][67], the APIC GUI may become unresponsive after login, with dashboards stuck in a continuous Loading…state.
Due to [CSCws84232][69], the APIC GUI may become unresponsive after login, with dashboards stuck in a continuous "Loading…" state.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pls revert this if it's done by mistake.

Administrators may be unable to access or operate the APIC GUI, potentially impacting day-to-day management or upgrade.

This check verifies the count of `svccoreCtrlr` managed objects and raises an alarm referencing the bug if the object count is more than 240. Remove the content or objects of `svccoreCtrlr` or `svccoreNode`. Contact Cisco TAC or upgrade to a release containing the fix for CSCws84232 before proceeding with an upgrade.

### Stale dbgacEpgSummaryTask Objects

Due to [CSCwt69100][70], a stale `dbgacEpgSummaryTask` object stuck in `processing` state with empty content can cause the policymgr process to crash on all APICs during an upgrade or process restart.

Affected versions: 6.1 releases up to and including 6.1(5e), and 6.2 releases up to and including 6.2(1g).
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pls remove the affected versions, no need to mention here


The check queries for `dbgacEpgSummaryTask` objects with `operSt="processing"` and `startTs` older than 24 hours. Such objects are considered stale and unexpected. If found, delete them before proceeding with the upgrade to prevent policymgr from crashing on restart.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to mention about the check and what it does. Just provide appropriate recommended action.



[0]: https://github.com/datacenter/ACI-Pre-Upgrade-Validation-Script
[70]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwt69100
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pls remove it from here as it needs to be added at the end after [69]

[1]: https://www.cisco.com/c/dam/en/us/td/docs/Website/datacenter/apicmatrix/index.html
[2]: https://www.cisco.com/c/en/us/support/switches/nexus-9000-series-switches/products-release-notes-list.html
[3]: https://www.cisco.com/c/en/us/td/docs/dcn/aci/apic/5x/release-notes/cisco-aci-nx-os-release-notes-1501.html#_Toc140580685
Expand Down Expand Up @@ -2867,4 +2878,5 @@ This check will verify the count of the `svccoreCtrlr` Managed Object and raise
[66]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwr66848
[67]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwh80837
[68]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwd40071
[69]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCws84232
[69]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCws84232
[70]: https://bst.cloudapps.cisco.com/bugsearch/bug/CSCwt69100
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[
{
"dbgacEpgSummaryTask": {
"attributes": {
"dn": "action/policymgrsubj-[uni/tn-TN_PROD/epgToEpg-EPG_PROD_FE_TO_EPG_PROD_BE/dstepg-[uni/tn-TN_PROD/ap-AP_PROD/epg-EPG_PROD_BE]]/dbgacEpgSummaryTask-ReportODACDef",
"operSt": "processing",
"startTs": "2024-01-01T00:00:00.000+00:00"
}
}
},
{
"dbgacEpgSummaryTask": {
"attributes": {
"dn": "action/policymgrsubj-[uni/tn-TN_TEST/epgToEpg-EPG_TEST_A_TO_EPG_TEST_B/dstepg-[uni/tn-TN_TEST/ap-AP_TEST/epg-EPG_TEST_B]]/dbgacEpgSummaryTask-ReportODACDef",
"operSt": "processing",
"startTs": "2026-01-15T11:30:00.000+00:00"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"dbgacEpgSummaryTask": {
"attributes": {
"dn": "action/policymgrsubj-[uni/tn-TN_PROD/epgToEpg-EPG_PROD_FE_TO_EPG_PROD_BE/dstepg-[uni/tn-TN_PROD/ap-AP_PROD/epg-EPG_PROD_BE]]/dbgacEpgSummaryTask-ReportODACDef",
"operSt": "processing",
"startTs": "2026-01-15T11:30:00.000+00:00"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[
{
"dbgacEpgSummaryTask": {
"attributes": {
"dn": "action/policymgrsubj-[uni/tn-TN_PROD/epgToEpg-EPG_PROD_FE_TO_EPG_PROD_BE/dstepg-[uni/tn-TN_PROD/ap-AP_PROD/epg-EPG_PROD_BE]]/dbgacEpgSummaryTask-ReportODACDef",
"operSt": "processing",
"startTs": "2024-01-01T00:00:00.000+00:00"
}
}
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import os
import pytest
import importlib
from datetime import datetime
from helpers.utils import read_data

# The script's file name contains hyphens, so it cannot be loaded with a plain
# `import` statement; import_module() accepts the name as a string.
script = importlib.import_module("aci-preupgrade-validation-script")

# Directory of this test module; passed to read_data() to locate the JSON fixtures.
dir = os.path.dirname(os.path.abspath(__file__))

# Name of the check under test.
# NOTE(review): presumably read by the shared run_check fixture - confirm in conftest.
test_function = "stale_epg_summary_task_check"

# icurl query key: must match the query string the check passes to icurl().
task_api = 'dbgacEpgSummaryTask.json?query-target-filter=eq(dbgacEpgSummaryTask.operSt,"processing")'

# Fixed "now" used by mock_datetime fixture: 2026-01-15 12:00:00 UTC
# Stale threshold = 2026-01-14 12:00:00 UTC (24h before fixed now)
# dbgacEpgSummaryTask_stale.json -> startTs 2024-01-01 (way before threshold) -> FAIL_O
# dbgacEpgSummaryTask_recent.json -> startTs 2026-01-15 11:30 UTC (30 min before fixed now) -> PASS
FIXED_NOW = datetime(2026, 1, 15, 12, 0, 0)


class MockDatetime:
    """Drop-in for the script's ``datetime`` name with a frozen ``utcnow()``.

    Only the entry points used by the check are provided: construction,
    ``strptime`` and ``utcnow``. Everything except ``utcnow`` is forwarded to
    the real ``datetime`` class.
    """

    def __new__(cls, *args, **kwargs):
        # Calling the mock like a class yields a genuine datetime instance,
        # never a MockDatetime object.
        real_instance = datetime(*args, **kwargs)
        return real_instance

    @staticmethod
    def strptime(date_string, format):
        """Parse ``date_string`` with the real ``datetime.strptime``."""
        parsed = datetime.strptime(date_string, format)
        return parsed

    @staticmethod
    def utcnow():
        """Return the module-level ``FIXED_NOW`` instead of the wall clock."""
        return FIXED_NOW


@pytest.fixture
def mock_datetime(monkeypatch):
    """Swap ``script.datetime`` for ``MockDatetime`` for the duration of a test.

    With the patch in place, ``datetime.utcnow()`` inside the script returns
    the fixed timestamp, which makes the 24-hour staleness cut-off
    deterministic. monkeypatch restores the original attribute afterwards.
    """
    monkeypatch.setattr(script, "datetime", MockDatetime, raising=True)


# DNs shared by the JSON fixtures and the inline boundary cases below.
_PROD_TASK_DN = "action/policymgrsubj-[uni/tn-TN_PROD/epgToEpg-EPG_PROD_FE_TO_EPG_PROD_BE/dstepg-[uni/tn-TN_PROD/ap-AP_PROD/epg-EPG_PROD_BE]]/dbgacEpgSummaryTask-ReportODACDef"
_TEST_TASK_DN = "action/policymgrsubj-[uni/tn-TN_TEST/epgToEpg-EPG_TEST_A_TO_EPG_TEST_B/dstepg-[uni/tn-TN_TEST/ap-AP_TEST/epg-EPG_TEST_B]]/dbgacEpgSummaryTask-ReportODACDef"


def _processing_task(dn, start_ts):
    """Build one dbgacEpgSummaryTask object shaped like the JSON fixtures."""
    return {
        "dbgacEpgSummaryTask": {
            "attributes": {
                "dn": dn,
                "operSt": "processing",
                "startTs": start_ts,
            }
        }
    }


@pytest.mark.parametrize(
    "tversion, icurl_outputs, expected_result, expected_data",
    [
        # Case 0: Target version not provided. The check cannot decide whether the
        # target is affected. Expected: MANUAL without any API calls.
        (
            None,
            {},
            script.MANUAL,
            [],
        ),
        # Case 1: Target version 6.2(2a) is beyond both affected ranges (6.1(5e) and 6.2(1g)).
        # The target binary has the fix so version gate fails. Expected: NA without any API calls.
        # NOTE(review): a reviewer pointed out that 6.2(2a) does not exist on CCO;
        # replace it with a released, unaffected version once one is confirmed.
        (
            "6.2(2a)",
            {},
            script.NA,
            [],
        ),
        # Case 2: Target version 6.1(5e) is affected, no dbgacEpgSummaryTask objects found.
        # No stale tasks present - system is safe. Expected: PASS.
        (
            "6.1(5e)",
            {
                task_api: read_data(dir, "dbgacEpgSummaryTask_empty.json"),
            },
            script.PASS,
            [],
        ),
        # Case 3: Target version 6.1(5e) is affected, one task in processing state but startTs is
        # only 30 minutes old (within 24-hour threshold). Not considered stale.
        # Expected: PASS.
        (
            "6.1(5e)",
            {
                task_api: read_data(dir, "dbgacEpgSummaryTask_recent.json"),
            },
            script.PASS,
            [],
        ),
        # Case 4: Target version 6.1(5e) is affected, one task stuck in processing with startTs
        # from 2024 (way older than 24 hours). Stale task detected.
        # Expected: FAIL_O with the offending DN and startTs reported.
        (
            "6.1(5e)",
            {
                task_api: read_data(dir, "dbgacEpgSummaryTask_stale.json"),
            },
            script.FAIL_O,
            [
                [
                    _PROD_TASK_DN,
                    "2024-01-01T00:00:00.000+00:00",
                ]
            ],
        ),
        # Case 5: Target version 6.2(1g) is affected, two tasks - one stale (2024), one recent.
        # Only the stale task should be reported. Expected: FAIL_O with one row.
        (
            "6.2(1g)",
            {
                task_api: read_data(dir, "dbgacEpgSummaryTask_mixed.json"),
            },
            script.FAIL_O,
            [
                [
                    _PROD_TASK_DN,
                    "2024-01-01T00:00:00.000+00:00",
                ]
            ],
        ),
        # Case 6: Boundary - startTs is exactly 24 hours before the fixed "now"
        # (2026-01-15 12:00:00 UTC). The check uses a strict "older than" comparison,
        # so a task sitting exactly on the threshold is not stale. Expected: PASS.
        (
            "6.1(5e)",
            {
                task_api: [_processing_task(_PROD_TASK_DN, "2026-01-14T12:00:00.000+00:00")],
            },
            script.PASS,
            [],
        ),
        # Case 7: Boundary combo - one task 25 hours old (stale) and one task
        # 23 hours 59 minutes old (not stale). Only the 25-hour task is reported.
        # Expected: FAIL_O with one row.
        (
            "6.2(1g)",
            {
                task_api: [
                    _processing_task(_PROD_TASK_DN, "2026-01-14T11:00:00.000+00:00"),
                    _processing_task(_TEST_TASK_DN, "2026-01-14T12:01:00.000+00:00"),
                ],
            },
            script.FAIL_O,
            [
                [
                    _PROD_TASK_DN,
                    "2026-01-14T11:00:00.000+00:00",
                ]
            ],
        ),
    ],
)
def test_logic(run_check, mock_icurl, mock_datetime, tversion, icurl_outputs, expected_result, expected_data):
    """Run the check with mocked icurl/datetime and compare result and data rows."""
    result = run_check(
        # AciVersion cannot be built from None; pass None through so the
        # "target version missing" branch of the check is exercised.
        tversion=script.AciVersion(tversion) if tversion else None,
    )
    assert result.result == expected_result
    assert result.data == expected_data