From 6f4efdd8c1e4b2442d77b4af0a3d4a892abe385e Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 9 Mar 2026 23:52:19 +0100 Subject: [PATCH 1/2] Support multi-cluster operation in Slurm backends --- docs/config_reference.rst | 18 +++++++++++++++++- reframe/core/schedulers/slurm.py | 18 +++++++++++++++--- reframe/schemas/config.json | 5 +++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index ec7210390..3ee73f54e 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -431,6 +431,22 @@ System Partition Configuration List of hosts in a partition that uses the ``ssh`` scheduler. +.. py:attribute:: systems.partitions.sched_options.slurm_multi_cluster_mode + + :required: No + :default: ``[]`` + + List of Slurm clusters to poll for submitted jobs. + + If empty, only the local cluster is considered. + If the single value ``all`` is passed, then all clusters will be considered. + This is translated directly to Slurm's ``-M`` option passed to the ``sacct`` or ``squeue`` commands. + + This option is relevant only for the Slurm backends. + + .. versionadded:: 4.10 + + .. py:attribute:: systems.partitions.sched_options.ignore_reqnodenotavail :required: No @@ -1647,7 +1663,7 @@ The additional properties for the ``httpjson`` handler are the following: These may depend on the server configuration. .. note:: - If you specify an authorization header here, it will be evaluated at the start of the test session and potentially expire. + If you specify an authorization header here, it will be evaluated at the start of the test session and potentially expire. Consider using the :attr:`~config.logging.handlers_perflog..httpjson..authorization_header` parameter instead for dynamic authorization headers. .. 
versionadded:: 4.2 diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index ac18c2eda..9fc834693 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -147,6 +147,7 @@ def __init__(self): self._sched_access_in_submit = self.get_option( 'sched_access_in_submit' ) + self._multi_clusters = self.get_option('slurm_multi_cluster_mode') self._available_states = { 'ALLOCATED', 'COMPLETING', @@ -155,6 +156,15 @@ def __init__(self): 'RESERVED' } + # Define the base sacct and squeue commands to account for Slurm's + # multiple cluster mode if enabled + self._sacct = 'sacct' + self._squeue = 'squeue' + if self._multi_clusters: + clusters = ",".join(self._multi_clusters) + self._sacct += f' -M {clusters}' + self._squeue += f' -M {clusters}' + def make_job(self, *args, **kwargs): return _SlurmJob(*args, **kwargs) @@ -491,7 +501,7 @@ def poll(self, *jobs): ) try: completed = _run_strict( - f'sacct -S {t_start} -P ' + f'{self._sacct} -S {t_start} -P ' f'-j {",".join(job.jobid for job in jobs)} ' f'-o jobid,state,exitcode,end,nodelist' ) @@ -570,7 +580,9 @@ def _cancel_if_blocked(self, job, reasons=None): return if not reasons: - completed = osext.run_command('squeue -h -j %s -o %%r' % job.jobid) + completed = osext.run_command( + f'{self._squeue} -h -j {job.jobid} -o %r' + ) reasons = completed.stdout.splitlines() if not reasons: # Can't retrieve job's state. Perhaps it has finished already @@ -677,7 +689,7 @@ def poll(self, *jobs): # finished already, squeue might return an error about an invalid # job id. 
completed = osext.run_command( - f'squeue -h -j {",".join(job.jobid for job in jobs)} ' + f'{self._squeue} -h -j {",".join(job.jobid for job in jobs)} ' f'-o "%%i|%%T|%%N|%%r"' ) diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index fc774f8b6..51a8907d8 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -119,6 +119,10 @@ "type": "array", "items": {"type": "string"} }, + "slurm_multi_cluster_mode": { + "type": "array", + "items": {"type": "string"} + }, "sched_access_in_submit": {"type": "boolean"}, "unqualified_hostnames": {"type": "boolean"}, "use_nodes_option": {"type": "boolean"} @@ -708,6 +712,7 @@ "systems*/sched_options/job_submit_timeout": 60, "systems*/sched_options/max_sacct_failures": 3, "systems*/sched_options/sched_access_in_submit": false, + "systems*/sched_options/slurm_multi_cluster_mode": [], "systems*/sched_options/ssh_hosts": [], "systems*/sched_options/resubmit_on_errors": [], "systems*/sched_options/unqualified_hostnames": false, From dbfc43fc04b79ccc122b1aaee2e3503288f5f59f Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 25 Mar 2026 02:03:32 +0100 Subject: [PATCH 2/2] Treat multi-cluster mode in partition's access options too --- docs/config_reference.rst | 1 + reframe/core/schedulers/__init__.py | 6 +++--- reframe/core/schedulers/slurm.py | 8 +++++++- unittests/test_schedulers.py | 31 ++++++++++++++++++++++++----- 4 files changed, 37 insertions(+), 9 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 3ee73f54e..e8ae6060b 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -441,6 +441,7 @@ System Partition Configuration If empty, only the local cluster is considered. If the single value ``all`` is passed, then all clusters will be considered. This is translated directly to Slurm's ``-M`` option passed to the ``sacct`` or ``squeue`` commands. 
+ If set, the ``-M`` option will also be passed in the partition's :attr:`~config.systems.partitions.access` options. This option is relevant only for the Slurm backends. diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index e66224cfe..d9c42fb43 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -370,13 +370,13 @@ class Job(jsonext.JSONSerializable, metaclass=JobMeta): # The sched_* arguments are exposed also to the frontend def __init__(self, - name, + name, *, workdir='.', script_filename=None, stdout=None, stderr=None, sched_flex_alloc_nodes=None, - sched_access=[], + sched_access=None, sched_options=None): self._cli_options = list(sched_options) if sched_options else [] @@ -390,7 +390,7 @@ def __init__(self, # Backend scheduler related information self._sched_flex_alloc_nodes = sched_flex_alloc_nodes - self._sched_access = sched_access + self._sched_access = list(sched_access) if sched_access else [] # Live job information; to be filled during job's lifetime by the # scheduler diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 9fc834693..54c3e96b5 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -161,11 +161,17 @@ def __init__(self): self._sacct = 'sacct' self._squeue = 'squeue' if self._multi_clusters: - clusters = ",".join(self._multi_clusters) + clusters = ','.join(self._multi_clusters) self._sacct += f' -M {clusters}' self._squeue += f' -M {clusters}' def make_job(self, *args, **kwargs): + if self._multi_clusters: + # Inject `-M` for multiple clusters into a copy of the caller's list + sched_access = list(kwargs.get('sched_access') or []) + sched_access.append(f'-M {",".join(self._multi_clusters)}') + kwargs['sched_access'] = sched_access + return _SlurmJob(*args, **kwargs) def _format_option(self, var, option): diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index c62e121bd..ac1c28e7d 100644 --- 
a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -76,13 +76,25 @@ def exec_ctx(make_exec_ctx, scheduler): @pytest.fixture def make_job(scheduler, launcher, tmp_path): - def _make_job(sched_opts=None, **jobargs): + def _make_job(sched_opts=None, config_opts=None, **jobargs): + sched_type = scheduler + + class PatchedScheduler(sched_type): + def get_option(self, name): + try: + return config_opts[name] + except KeyError: + return super().get_option(name) + + if config_opts: + sched_type = PatchedScheduler + if sched_opts: - sched = scheduler(**sched_opts) - elif scheduler.registered_name == 'ssh': - sched = scheduler(hosts=['localhost']) + sched = sched_type(**sched_opts) + elif sched_type.registered_name == 'ssh': + sched = sched_type(hosts=['localhost']) else: - sched = scheduler() + sched = sched_type() return Job.create( sched, launcher(), @@ -487,6 +499,15 @@ def test_prepare_nodes_option_minimal(make_exec_ctx, make_job, slurm_only): assert re.search(r'--nodes=16', fp.read()) is not None +def test_prepare_multi_cluster(make_job, slurm_only): + job = make_job(config_opts={ + 'slurm_multi_cluster_mode': ['cluster1', 'cluster2'] + }) + prepare_job(job) + with open(job.script_filename) as fp: + assert re.search(r'-M cluster1,cluster2', fp.read()) is not None + + def test_submit(make_job, exec_ctx): minimal_job = make_job(sched_access=exec_ctx.access) prepare_job(minimal_job)