From 75b5096042d3e35c79c0081ded6b1ed9a839de8e Mon Sep 17 00:00:00 2001 From: "Mauricio J. Serrano" Date: Fri, 24 Oct 2025 16:55:23 -0400 Subject: [PATCH 1/2] numa support Signed-off-by: Mauricio J. Serrano --- aiu_fms_testing_utils/scripts/inference.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/aiu_fms_testing_utils/scripts/inference.py b/aiu_fms_testing_utils/scripts/inference.py index 3ec33f0e..cc593810 100644 --- a/aiu_fms_testing_utils/scripts/inference.py +++ b/aiu_fms_testing_utils/scripts/inference.py @@ -257,6 +257,12 @@ default=0, help="Timeout to use for messaging in minutes. Default set by PyTorch dist.init_process_group", ) +parser.add_argument( + "--numa", + action="store_true", + help="NUMA aware task distribution (requires distributed option)", +) + args = parser.parse_args() attention_map = { @@ -327,6 +333,19 @@ dist.init_process_group() # Fix until PT 2.3 torch._C._distributed_c10d._register_process_group("default", dist.group.WORLD) + if args.numa: + try: + from numa import info + numa_num_nodes = info.get_num_configured_nodes() + numa_world_size = dist.get_world_size() + numa_size_per_node = numa_world_size // numa_num_nodes + from numa import schedule + numa_rank = dist.get_rank() + numa_node = dist.get_rank() // numa_size_per_node + schedule.run_on_nodes(numa_node) + dprint(f"NUMA: process {numa_rank} set to node {numa_node}") + except: + dprint(f"NUMA not available in this machine, please install libnuma libraries") aiu_setup.aiu_dist_setup(dist.get_rank(), dist.get_world_size()) if args.device_type == "cuda": From 25cb3d3efb81c72a1dede81aa03e25a965bb4500 Mon Sep 17 00:00:00 2001 From: "Mauricio J. Serrano" Date: Sat, 25 Oct 2025 10:29:35 -0400 Subject: [PATCH 2/2] numa set local alloc Signed-off-by: Mauricio J. Serrano --- aiu_fms_testing_utils/scripts/inference.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aiu_fms_testing_utils/scripts/inference.py b/aiu_fms_testing_utils/scripts/inference.py index cc593810..b66b3b4c 100644 --- a/aiu_fms_testing_utils/scripts/inference.py +++ b/aiu_fms_testing_utils/scripts/inference.py @@ -343,6 +343,8 @@ numa_rank = dist.get_rank() numa_node = dist.get_rank() // numa_size_per_node schedule.run_on_nodes(numa_node) + from numa import memory + memory.set_local_alloc() dprint(f"NUMA: process {numa_rank} set to node {numa_node}") except: dprint(f"NUMA not available in this machine, please install libnuma libraries")