Skip to content

Commit cc9ee9a

Browse files
authored
Add restarts metric to windows services (#21840)
* Add state metric to windows services * Changelog * Validate metadata * Add restarts metric to windows services * Add unit tests * Requested changes * Update readme * Fix changelog * Fix tests
1 parent 8ce3250 commit cc9ee9a

File tree

5 files changed

+87
-0
lines changed

5 files changed

+87
-0
lines changed

windows_service/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ services:
7575
trigger_start: true
7676
```
7777

78+
Beginning with Agent version 7.74, the check automatically collects metrics for Windows services.
79+
7880
#### Tags
7981

8082
The check automatically tags each service check with the Windows service name, using the `windows_service:<SERVICE>` tag. In the tag, the `<SERVICE>` name is lowercased and special characters are replaced with underscores. See [Getting Started with Tags][12] for more information.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add restarts metric to Windows services

windows_service/datadog_checks/windows_service/windows_service.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,21 @@ class WindowsService(AgentCheck):
246246
}
247247
UNKNOWN_LITERAL = "unknown"
248248

249+
def __init__(self, name, init_config, instances):
    """Initialize the check and create an empty per-service PID cache."""
    super().__init__(name, init_config, instances)

    # Service name -> last non-zero PID handed to _get_service_restarts().
    # Starts empty; it is populated as services are observed.
    self._service_pid_cache: dict[str, int] = {}
252+
253+
def _get_service_restarts(self, service_name: str, current_pid: int) -> int:
254+
if current_pid == 0:
255+
return 0
256+
prev_pid = self._service_pid_cache.get(service_name, None)
257+
restarts = 0
258+
if prev_pid is not None and prev_pid != current_pid:
259+
restarts = 1
260+
# only store the last running pid for the service
261+
self._service_pid_cache[service_name] = current_pid
262+
return restarts
263+
249264
def check(self, instance):
250265
services = instance.get('services', [])
251266
custom_tags = instance.get('tags', [])
@@ -318,6 +333,8 @@ def check(self, instance):
318333
if service_pid != 0:
319334
service_uptime = _get_process_uptime_from_cache(service_pid, process_cache)
320335

336+
service_restarts = self._get_service_restarts(service_name, service_pid)
337+
321338
status = self.STATE_TO_STATUS.get(state, self.UNKNOWN)
322339
state_string = self.STATE_TO_STRING.get(state, self.UNKNOWN_LITERAL)
323340

@@ -346,6 +363,7 @@ def check(self, instance):
346363
# Send 1 for windows_service.state so the user can sum by the windows_service_state tag
347364
# to filter services by state. e.g. sum:windows_service.state{*} by windows_service_state
348365
self.gauge('windows_service.state', 1, tags=tags)
366+
self.count('windows_service.restarts', service_restarts, tags=tags)
349367

350368
if 'ALL' not in services:
351369
for service in services_unseen:
@@ -370,3 +388,4 @@ def check(self, instance):
370388
self.log.debug('service state for %s %s', service, status)
371389
self.gauge('windows_service.uptime', 0, tags=tags)
372390
self.gauge('windows_service.state', 1, tags=tags)
391+
self.count('windows_service.restarts', 0, tags=tags)

windows_service/metadata.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags
2+
windows_service.restarts,count,,event,,The number of restarts of the host process of the Windows service,0,windows_service,restarts,,
23
windows_service.state,gauge,,service,,Sum by state to count the number of services in each state,0,windows_service,state,,
34
windows_service.uptime,gauge,,second,,The uptime (in seconds) of the host process of the Windows service,0,windows_service,uptime,,

windows_service/tests/test_windows_service.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ def assert_service_check_and_metrics(aggregator, services):
4242
value=1,
4343
count=service.count,
4444
)
45+
aggregator.assert_metric(
46+
'windows_service.restarts',
47+
tags=service.tags,
48+
value=0,
49+
count=service.count,
50+
)
4551

4652

4753
def test_bad_config(check, instance_bad_config):
@@ -325,6 +331,64 @@ def test_name_regex_order(aggregator, check, instance_name_regex_prefix):
325331
assert_service_check_and_metrics(aggregator, services)
326332

327333

334+
def test_service_restart_detection(aggregator, check, instance_basic):
    """
    A PID change between two check runs is reported as one restart, and only
    for the service whose PID changed.
    """
    c = check(instance_basic)

    event_log = {
        'ServiceName': 'EventLog',
        'DisplayName': 'Windows Event Log',
        'CurrentState': win32service.SERVICE_RUNNING,
        'ProcessId': 1234,
    }
    dns_cache = {
        'ServiceName': 'Dnscache',
        'DisplayName': 'DNS Client',
        'CurrentState': win32service.SERVICE_RUNNING,
        'ProcessId': 5678,
    }
    mock_services = [event_log, dns_cache]

    eventlog_tags = ['windows_service:EventLog', 'windows_service_state:running', 'service:EventLog', 'optional:tag1']
    dnscache_tags = ['windows_service:Dnscache', 'windows_service_state:running', 'service:Dnscache', 'optional:tag1']

    def run_check():
        # Each run enumerates the same (mutable) mocked service list.
        with patch('win32service.EnumServicesStatusEx', return_value=mock_services):
            c.check(instance_basic)

    run_check()

    # First run: nothing to compare against yet, so both services report 0.
    for expected_tags, expected_restarts in ((eventlog_tags, 0), (dnscache_tags, 0)):
        aggregator.assert_metric(
            'windows_service.restarts',
            value=expected_restarts,
            tags=list(expected_tags),
        )

    aggregator.reset()

    # Simulate a restart of EventLog only by giving it a new PID.
    event_log['ProcessId'] = 9999

    run_check()

    # Second run: EventLog reports 1 restart, Dnscache still reports 0.
    for expected_tags, expected_restarts in ((eventlog_tags, 1), (dnscache_tags, 0)):
        aggregator.assert_metric(
            'windows_service.restarts',
            value=expected_restarts,
            tags=list(expected_tags),
        )
390+
391+
328392
@pytest.mark.e2e
329393
def test_basic_e2e(dd_agent_check, check, instance_basic):
330394
aggregator = dd_agent_check(instance_basic)

0 commit comments

Comments
 (0)