Skip to content

Commit 0ec3198

Browse files
authored
add GPU accounting for SMHP (#462)
* add GPU accounting * setup user associations
1 parent 6d15c26 commit 0ec3198

File tree

3 files changed

+76
-0
lines changed

3 files changed

+76
-0
lines changed

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,11 @@ def main(args):
228228
ExecuteBashScript("./utils/fsx_ubuntu.sh").run("0")
229229

230230
ExecuteBashScript("./start_slurm.sh").run(node_type, ",".join(controllers))
231+
232+
# Setup user associations for Slurm accounting (only on controller nodes)
233+
if node_type == SlurmNodeType.HEAD_NODE:
234+
ExecuteBashScript("./setup_user_associations.sh").run()
235+
231236
ExecuteBashScript("./utils/gen-keypair-ubuntu.sh").run()
232237
ExecuteBashScript("./utils/ssh-to-compute.sh").run()
233238

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/setup_mariadb_accounting.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ JobAcctGatherFrequency=30
103103
AccountingStorageType=accounting_storage/slurmdbd
104104
AccountingStorageHost=$DBD_HOST
105105
AccountingStoragePort=6819
106+
AccountingStorageTRES=gres/gpu
107+
GresTypes=gpu
106108
EOL
107109
}
108110

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash
#
# Sets up Slurm accounting associations (cluster, root account, users) once
# slurmdbd is running.

# Stop at the first failing command and trace each command as it executes.
set -o errexit -o xtrace

# Log file that provisioning messages are appended to.
LOG_FILE='/var/log/provision/provisioning.log'
6+
7+
# Function to log messages
#
# Prints its arguments as a single line on stdout and appends the same line
# to the file named by $LOG_FILE.
# Globals:   LOG_FILE (read) - path of the log file to append to
# Arguments: the message words; they are joined with single spaces
# Outputs:   the message, on stdout
# Returns:   the exit status of tee
logger() {
    # Pin the join character so "$*" always mirrors echo's space-separated
    # output, whatever IFS the caller has set.
    local IFS=' '
    # printf (not echo) so a message such as "-n" or "-e" is logged verbatim
    # instead of being swallowed as an option; LOG_FILE is quoted so a path
    # containing whitespace stays a single tee argument.
    printf '%s\n' "$*" | tee -a "$LOG_FILE"
}
11+
12+
# Function to add user associations to Slurm accounting
#
# Waits for the slurmdbd systemd unit to become active, then registers the
# cluster, the root account, the ubuntu user and every existing OS user
# listed in shared_users.txt (comma-separated, user name in the first field)
# with sacctmgr.
# Globals:   SHARED_USER_FILE (written) - relative path of the user list,
#            resolved against the current working directory
# Returns:   0 on success, 1 when slurmdbd is not active after 30 polls
setup_user_associations() {
    logger "[INFO] Setting up user associations for Slurm accounting"

    # Wait for slurmdbd to be ready
    local max_attempts=30
    local attempt=0

    while (( attempt < max_attempts )); do
        if systemctl is-active --quiet slurmdbd; then
            logger "[INFO] slurmdbd is active"
            sleep 5 # Give it a few more seconds to fully initialize
            break
        fi
        logger "[INFO] Waiting for slurmdbd to start... (attempt $((attempt + 1))/$max_attempts)"
        sleep 2
        attempt=$((attempt + 1))
    done

    # The counter only reaches max_attempts when no poll ever succeeded
    # (a successful poll breaks out before the increment).
    if (( attempt == max_attempts )); then
        logger "[ERROR] slurmdbd failed to start within timeout"
        return 1
    fi

    # Add associations for existing users
    logger "[INFO] Adding user associations to Slurm accounting"

    # First, ensure the cluster is added to the accounting database.
    # The name is taken from the running Slurm configuration: asking the
    # accounting database for it (the previous approach) yields nothing
    # precisely when the cluster still has to be registered.
    local cluster_name=""
    cluster_name=$(scontrol show config 2>/dev/null \
        | awk '$1 == "ClusterName" { print $3; exit }') || true
    if [[ -z "$cluster_name" ]]; then
        # Fall back to whatever cluster the database already knows about.
        cluster_name=$(sacctmgr show cluster format=cluster --noheader | head -1) || true
        # Strip the column padding that the unquoted expansion used to drop.
        cluster_name=${cluster_name//[[:space:]]/}
    fi

    # The sacctmgr calls below are deliberately best-effort: a non-zero exit
    # (e.g. because the entity is already present) must not abort the script
    # when it runs under `set -e`.
    if [[ -n "$cluster_name" ]]; then
        sacctmgr -i add cluster "$cluster_name" || true
    else
        logger "[WARN] Could not determine the cluster name; skipping cluster registration"
    fi

    # Add root account if it doesn't exist
    sacctmgr -i add account root Description="Root Account" || true

    # Add ubuntu user to root account
    if id -u ubuntu >/dev/null 2>&1; then
        logger "[INFO] Adding ubuntu user to root account"
        sacctmgr -i add user ubuntu account=root || true
    fi

    # Add associations for users from shared_users.txt if it exists
    SHARED_USER_FILE="shared_users.txt"
    if [[ -f "$SHARED_USER_FILE" && -s "$SHARED_USER_FILE" ]]; then
        local username _unused
        # `|| [[ -n ... ]]` keeps the final record when the file does not end
        # with a newline (read returns non-zero but still fills the variable).
        while IFS="," read -r username _unused || [[ -n "$username" ]]; do
            [[ -n "$username" ]] || continue # blank line
            # Only users that exist on this node get an association.
            if id -u "$username" >/dev/null 2>&1; then
                logger "[INFO] Adding $username to root account"
                # stdin is detached so the command can never consume the
                # remaining lines of the user list.
                sacctmgr -i add user "$username" account=root </dev/null || true
            fi
        done < "$SHARED_USER_FILE"
    fi

    logger "[INFO] User associations setup completed"
}
64+
65+
# Script entry point: performs the Slurm accounting association setup.
# Command-line arguments are accepted but not forwarded.
main() { setup_user_associations; }

main "$@"

0 commit comments

Comments
 (0)