Skip to content
4 changes: 2 additions & 2 deletions dirac-common/src/DIRACCommon/Core/Utilities/StateMachine.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,11 +129,11 @@ def setState(self, candidateState, noWarn=False, *, logger_warn=None):
if not candidateState:
self.state = candidateState
elif candidateState in self.states:
if not self.states[self.state].stateMap:
if self.state is not None and not self.states[self.state].stateMap:
if not noWarn and logger_warn:
logger_warn("Final state, won't move", f"({self.state}, asked to move to {candidateState})")
return S_OK(self.state)
if candidateState not in self.states[self.state].stateMap and logger_warn:
if self.state is not None and candidateState not in self.states[self.state].stateMap and logger_warn:
logger_warn(f"Can't move from {self.state} to {candidateState}, choosing a good one")
result = self.getNextState(candidateState)
if not result["OK"]:
Expand Down
91 changes: 91 additions & 0 deletions dirac-common/tests/Core/Utilities/test_StateMachine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import pytest

from DIRACCommon.Core.Utilities.StateMachine import State, StateMachine


class TestState:
    """Unit tests covering construction and transition rules of ``State``."""

    def test_state_basic(self):
        """A state built from a level only has no transitions and no default."""
        bare = State(100)
        assert bare.level == 100
        assert bare.stateMap == []
        assert bare.default is None

    def test_state_with_transitions(self):
        """Level, transition list and default state are stored as given."""
        configured = State(0, ["State1", "State2"], defState="State1")
        assert configured.level == 0
        assert configured.stateMap == ["State1", "State2"]
        assert configured.default == "State1"

    def test_transition_rule_in_map(self):
        """A candidate listed in the state map is returned unchanged."""
        configured = State(0, ["State1", "State2"], defState="State1")
        chosen = configured.transitionRule("State2")
        assert chosen == "State2"

    def test_transition_rule_not_in_map_with_default(self):
        """A candidate outside the state map resolves to the default state."""
        configured = State(0, ["State1", "State2"], defState="State1")
        chosen = configured.transitionRule("UnknownState")
        assert chosen == "State1"

    def test_transition_rule_not_in_map_without_default(self):
        """Without a default, a candidate outside the state map is returned as is."""
        no_default = State(0, ["State1", "State2"])
        chosen = no_default.transitionRule("UnknownState")
        assert chosen == "UnknownState"


class TestStateMachine:
    """Unit tests covering state lookup and transitions of ``StateMachine``."""

    def test_get_level_of_state(self):
        """The known state reports its level; an unknown state reports -1."""
        machine = StateMachine()
        known_level = machine.getLevelOfState("Nirvana")
        assert known_level == 100
        unknown_level = machine.getLevelOfState("UnknownState")
        assert unknown_level == -1

    def test_get_states(self):
        """A default machine exposes exactly one state name."""
        machine = StateMachine()
        names = machine.getStates()
        assert names == ["Nirvana"]

    def test_set_state_none_candidate(self):
        """Asking for the ``None`` state succeeds and yields ``None``."""
        machine = StateMachine(state="Nirvana")
        outcome = machine.setState(None)
        assert outcome["OK"] is True
        assert outcome["Value"] is None

    def test_set_state_same_state(self):
        """Asking for the state the machine is already in succeeds."""
        machine = StateMachine(state="Nirvana")
        outcome = machine.setState("Nirvana")
        assert outcome["OK"] is True
        assert outcome["Value"] == "Nirvana"

    def test_set_state_invalid_candidate(self):
        """Asking for a state the machine does not know is reported as an error."""
        machine = StateMachine(state="Nirvana")
        outcome = machine.setState("InvalidState")
        assert outcome["OK"] is False

    def test_set_state_from_none_to_valid(self):
        """A machine created with no current state can move to a valid state."""
        machine = StateMachine(state=None)
        outcome = machine.setState("Nirvana")
        assert outcome["OK"] is True
        assert outcome["Value"] == "Nirvana"

    def test_set_state_to_none_then_to_valid(self):
        """Regression check for the KeyError bug: go to None, then to a valid state."""
        machine = StateMachine(state="Nirvana")

        # Step 1: move the machine to the None state.
        first = machine.setState(None)
        assert first["OK"] is True
        assert first["Value"] is None

        # Step 2: moving on to a valid state must not raise KeyError.
        second = machine.setState("Nirvana")
        assert second["OK"] is True
        assert second["Value"] == "Nirvana"

    def test_get_next_state_with_none_current(self):
        """The next state can be computed while the current state is None."""
        machine = StateMachine(state=None)
        outcome = machine.getNextState("Nirvana")
        assert outcome["OK"] is True
        assert outcome["Value"] == "Nirvana"

    def test_get_next_state_invalid(self):
        """Computing the next state for an unknown candidate is reported as an error."""
        machine = StateMachine(state="Nirvana")
        outcome = machine.getNextState("InvalidState")
        assert outcome["OK"] is False
96 changes: 96 additions & 0 deletions dirac.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -1164,6 +1164,102 @@ Operations
pilotVORepoBranch = master # Branch to use
workDir = /tmp/pilot3Files # Local work directory on the masterCS for synchronisation
}

# RSS section
# See https://dirac.readthedocs.io/en/latest/AdministratorGuide/Systems/ResourceStatus/configuration.html
ResourceStatus
{
Config
{
Cache = 720 # Lifetime (seconds) of the RSSCache (default: 300)
FromAddress = rss@dirac # Email address used as sender for RSS notifications
}
Policies
{
# Command arguments for the built-in Downtime policy type.
# hours = 0 means only ongoing downtimes are considered (default).
# Set hours > 0 to also catch downtimes starting within that window.
# Note: this section has no policyType key and is therefore NOT treated
# as a policy definition — it only sets command argument defaults.
Downtime
{
hours = 0 # look-ahead window in hours (0 = ongoing only, default)
}
# Command arguments for the built-in FreeDiskSpace policy type.
# Unit and thresholds apply to all SEs monitored by this policy.
# Note: same as above — no policyType key, so not a policy definition.
FreeDiskSpace
{
Unit = TB # Space unit: TB (default), GB or MB
Banned_threshold = 0.1 # Free space below which the SE is Banned (in the chosen unit)
Degraded_threshold = 5 # Free space below which the SE is Degraded (in the chosen unit)
}
# Example: apply Downtime policy to all Sites
SiteDowntime
{
policyType = Downtime
matchParams
{
element = Site
}
}
# Example: apply Downtime policy to all Resources
ResourceDowntime
{
policyType = Downtime
matchParams
{
element = Resource
}
}
# Example: apply FreeDiskSpace policy to all SE WriteAccess status types
SEWriteAccessFreeDiskSpace
{
policyType = FreeDiskSpace
matchParams
{
element = Resource
elementType = StorageElement
statusType = WriteAccess
}
}
# Example: apply FreeDiskSpace to SE1 with specific args (Unit and Banned_threshold);
# Degraded_threshold falls back to the default defined in the FreeDiskSpace section above.
SpecificFreeDiskSpace
{
policyType = FreeDiskSpace
Unit = GB
Banned_threshold = 15
matchParams
{
name = SE1
}
}
}
PolicyActions
{
# Example: send an email when any Resource reaches Banned status
BannedResourceEmail
{
actionType = EmailAction
notificationGroups = RSSAdmins
matchParams
{
element = Resource
status = Banned
}
}
}
Notification
{
RSSAdmins
{
users = admin@dirac # email addresses used for the notifications
}
}
}

# Services section
Services
{
# See http://dirac.readthedocs.io/en/latest/AdministratorGuide/Resources/Catalog/index.html
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ Already described in :ref:`config section <config section>`.
Policies
--------

This section describes the policies and the conditions to match elements.
This section describes the policies, and to which elements such policies are applied.

::

/Operations/Defaults/ResourceStatus
/Policies
/PolicyName
policyType = policyType
policyType = nameOfPolicyType
doNotCombineResult = something
/matchParams
element = element
Expand All @@ -42,10 +42,10 @@ This section describes the policies and the conditions to match elements.

This is the complete definition of a policy. Let's go one by one.

* PolicyName : this must be a human readable name explaining what the policy is doing ( mandatory ).
* policyType : is the name of the policy we want to run as defined in DIRAC.ResourceStatusSystem.Policy.Configurations ( mandatory ).
* doNotCombineResult : if this option is present, the status will not be merged with the rest of statuses ( but actions on this policy will apply ).
* matchParams : is the dictionary containing the policy metadata used by :ref:`Info Getter <info getter>` to match policies. Any of them can be a CSV.
* PolicyName : this must be a human readable name explaining what the policy is doing (mandatory).
* policyType : is the name of the policy we want to run as defined in DIRAC.ResourceStatusSystem.Policy.Configurations (mandatory). Available policy types: "Downtime", "FreeDiskSpace", "JobEfficiency", "PilotEfficiency", "AlwaysBanned", "AlwaysActive", "AlwaysProbing", "AlwaysDegraded", "Propagation", "JobDoneRatio", "JobRunningWaitingRatio", "JobRunningMatchedRatio".
* doNotCombineResult : if this option is present, the status will not be merged with the rest of statuses (but actions on this policy will apply).
* matchParams : This section defines to which elements the policy is applied. It is used by :ref:`Info Getter <info getter>` to match policies.

.. note :: Remember to declare in matchParams ONLY the parameters that you want to be taken into account.

Expand All @@ -66,6 +66,105 @@ we cannot define the following matchParams:

Code templates and examples for creating custom policies: :doc:`../../../DeveloperGuide/Systems/ResourceStatus/index`


The Downtime and FreeDiskSpace policies have some configurable parameters.

Built-in Downtime Policy
~~~~~~~~~~~~~~~~~~~~~~~~

The ``Downtime`` policy type evaluates `GOC DB <https://goc.egi.eu/>`__ downtime data for a Site or Resource.
Severity is mapped to RSS status as follows:

* **OUTAGE** → **Banned**
* **WARNING** → **Degraded**
* No downtime → **Active**

The look-ahead window is configurable from the Operations CS:

::

/Operations/Defaults/ResourceStatus
/Policies
/Downtime
hours = 0 # hours to look ahead (0 = ongoing only, default)

.. note::

Setting ``hours = 0`` (the default) means only downtimes that are currently ongoing
are considered. Setting a positive value (e.g. ``12``) also catches downtimes scheduled
to start within that window, which is useful for proactive status changes.

This section has no ``policyType`` key and is therefore treated purely as
command-argument defaults, not as a policy definition.

Example: flag elements with downtimes starting within the next 24 hours::

/Operations/Defaults/ResourceStatus/Policies/Downtime
{
hours = 24
}


Example: setting 2 downtime policies:

::

/Operations/Defaults/ResourceStatus
/Policies
/OngoingDowntime
policyType = Downtime
hours = 0 # hours to look ahead (0 = ongoing only, default)
/matchParams
element = Site
/Downtime12
policyType = Downtime
hours = 12
/matchParams
element = Resource



Built-in FreeDiskSpace Policy
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The ``FreeDiskSpace`` policy type monitors Storage Element occupancy.
It compares the free space reported by the SE against two configurable thresholds:

* If free space is below ``Banned_threshold``, the SE is set to **Banned**.
* If free space is below ``Degraded_threshold`` (but above ``Banned_threshold``), the SE is set to **Degraded**.
* Otherwise the SE is set to **Active**.

All three parameters — unit, banned threshold, and degraded threshold — are fully configurable
from the Operations CS and fall back to safe defaults:

::

/Operations/Defaults/ResourceStatus
/Policies
/FreeDiskSpace
Unit = TB # unit for the SE occupancy query (TB, GB or MB)
Banned_threshold = 0.1 # in the chosen unit (default)
Degraded_threshold = 5 # in the chosen unit (default)

.. note::

These keys live under ``/Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace``,
not under the ``/matchParams`` sub-section. They tune the **command arguments**, not the
element-matching logic. This section has no ``policyType`` key and is therefore not treated
as a policy definition by the policy engine.

The default values of ``0.1`` and ``5`` are always used as fallback regardless of unit.
Make sure to set meaningful threshold values explicitly in the CS when changing the unit.

Example: use GB, with thresholds roughly equivalent to the TB defaults (0.1 TB and 5 TB)::

/Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace
{
Unit = GB
Banned_threshold = 100
Degraded_threshold = 5000
}

-------------
PolicyActions
-------------
Expand Down
20 changes: 19 additions & 1 deletion docs/source/DeveloperGuide/Systems/ResourceStatus/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ Cache tables for metrics used by policies.
* - PolicyResult
- Policy evaluation results (Element, Name, PolicyName, Status, Reason)
* - SpaceTokenOccupancyCache
- Storage space usage (Endpoint, Token, Free, Guaranteed)
- Storage space usage (Endpoint, Token, Free, Total) — values stored in MB
* - TransferCache
- Transfer quality metrics (SourceName, DestinationName, Metric, Value)

Expand Down Expand Up @@ -215,6 +215,24 @@ Policies inherit from ``PolicyBase`` and implement ``evaluate()``.
return {'Status': 'Degraded', 'Reason': f'Low efficiency: {efficiency:.2%}'}
return {'Status': 'Banned', 'Reason': f'Very low efficiency: {efficiency:.2%}'}

FreeDiskSpace Policy
--------------------

The ``FreeDiskSpacePolicy`` (``Policy/FreeDiskSpacePolicy.py``) evaluates SE occupancy using
configurable thresholds. Thresholds are passed through as command arguments so they propagate
from the CS configuration all the way to the policy evaluation:

1. ``Configurations.py`` reads ``Unit``, ``Banned_threshold`` and ``Degraded_threshold`` from the
Operations CS via ``Operations().getValue("ResourceStatus/Policies/FreeDiskSpace/Banned_threshold", 0.1)``
and stores them in the policy ``args`` dict.
2. ``FreeDiskSpaceCommand`` reads these values from ``self.args`` in ``_prepareCommand()`` and
returns them alongside ``Free`` and ``Total`` in both ``doNew()`` and ``doCache()``.
3. ``FreeDiskSpacePolicy._evaluate()`` reads ``Banned_threshold`` and ``Degraded_threshold``
from the command result dict (with safe defaults) and applies the comparison.

This design keeps thresholds fully configurable per deployment without code changes.
See :ref:`rss_advanced_configuration` for the available CS keys.

Command Implementation
----------------------

Expand Down
Loading
Loading