Skip to content

Commit 7186278

Browse files
authored
Nghtm patch 2 (#683)
* Update install_docker.sh: force overwrite of `/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg`. This suppresses the interactive overwrite prompt that appears when the file already exists: ``` File '/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg' exists. Overwrite? (y/N) ``` * Updates to Slurm LCS following Ubuntu 22.04 debugging. Add backoff to LCS scripts for more graceful recovery in case of intermittent failure; update the DCGM exporter install to add custom metrics; update the DCGM exporter version.
1 parent 5c4f1ea commit 7186278

File tree

5 files changed

+385
-121
lines changed

5 files changed

+385
-121
lines changed

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx.sh

Lines changed: 112 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,108 +1,144 @@
11
#!/bin/bash
22

3-
# must be run a sudo
4-
5-
set -x
6-
set -e
3+
#set -x # Debug output
74

85
# FSx Lustre Endpoints
96
FSX_DNS_NAME="$1"
107
FSX_MOUNTNAME="$2"
118
MOUNT_POINT="$3"
129

10+
# Cleanup function
11+
cleanup() {
    # Exit-time diagnostics hook: reads the status of the command that ran
    # immediately before this function was entered, so the capture below must
    # stay the first statement.
    local status=$?

    # Nothing to report when the preceding status was success.
    [ "$status" -eq 0 ] && return 0

    echo "Script failed, checking logs..."
    sudo dmesg | tail -n 20

    echo "Mount status:"
    # grep exits non-zero when no lustre mount is listed; that is not an error here.
    mount | grep lustre || true

    echo "LNet status:"
    # Best effort: lctl may be unavailable or LNet may not be up.
    sudo lctl list_nids || true
}
21+
22+
trap cleanup EXIT
23+
1324
is_mounted() {
14-
mountpoint -q "$1"
15-
return $?
25+
sudo mountpoint -q "$1"
26+
return $?
1627
}
1728

1829
check_already_mounted() {
19-
# Check if FSx is already mounted to $MOUNT_POINT
20-
if is_mounted $MOUNT_POINT; then
21-
if grep -qs "$FSX_MOUNTNAME $MOUNT_POINT lustre" /proc/mounts; then
22-
echo "FSx Lustre already mounted to $MOUNT_POINT. Exiting."
23-
exit 0
24-
else
25-
echo "$MOUNT_POINT is mounted, but not to mountname: $FSX_MOUNTNAME from provisioning_parameters.json. Exiting."
26-
exit 1
30+
if is_mounted "$MOUNT_POINT"; then
31+
if sudo grep -qs "${FSX_DNS_NAME}@tcp:/${FSX_MOUNTNAME}" /proc/mounts; then
32+
echo "FSx Lustre already mounted to $MOUNT_POINT. Exiting."
33+
exit 0
34+
else
35+
echo "Warning: $MOUNT_POINT is mounted with different filesystem:"
36+
sudo grep "$MOUNT_POINT" /proc/mounts
37+
exit 1
38+
fi
2739
fi
28-
fi
2940
}
3041

3142
is_fsx_reachable() {
32-
if lctl ping "$FSX_DNS_NAME"; then
33-
echo "FSx is reachable"
34-
else
35-
echo "FSx is not reachable, Trying to mount system anyway"
36-
fi
43+
echo "Checking FSx reachability..."
44+
if sudo lctl ping "$FSX_DNS_NAME"; then
45+
echo "FSx is reachable"
46+
return 0
47+
else
48+
echo "FSx is not reachable. Will try mounting anyway."
49+
return 1
50+
fi
3751
}
3852

3953
add_to_fstab() {
40-
# Add FSx to /etc/fstab
41-
echo "$FSX_DNS_NAME@tcp:/$FSX_MOUNTNAME $MOUNT_POINT lustre defaults,noatime,flock,_netdev 0 0" | tee -a /etc/fstab
54+
echo "Adding mount entry to /etc/fstab..."
55+
# Backup existing fstab
56+
sudo cp /etc/fstab /etc/fstab.backup.$(date +%Y%m%d_%H%M%S)
57+
58+
# Remove any existing entries for this mount point
59+
sudo sed -i "\|${MOUNT_POINT}|d" /etc/fstab
60+
61+
# Add new entry
62+
echo "$FSX_DNS_NAME@tcp:/$FSX_MOUNTNAME $MOUNT_POINT lustre defaults,noatime,flock,_netdev 0 0" | sudo tee -a /etc/fstab
4263
}
4364

4465
mount_fs() {
45-
if [[ ! -d $MOUNT_POINT ]]; then
46-
mkdir -p $MOUNT_POINT
47-
chmod 644 $MOUNT_POINT
48-
fi
49-
50-
if mount -t lustre -o noatime,flock "$FSX_DNS_NAME"@tcp:/"$FSX_MOUNTNAME" "$MOUNT_POINT"; then
51-
if ! is_mounted $MOUNT_POINT ;then
52-
echo "Mounting FSx to $MOUNT_POINT directory successful, but mountpoint was not detected. Exiting."
53-
exit 1
66+
echo "Preparing to mount FSx..."
67+
68+
if [[ ! -d $MOUNT_POINT ]]; then
69+
sudo mkdir -p "$MOUNT_POINT" || { echo "Failed to create mount point"; exit 1; }
70+
sudo chmod 755 "$MOUNT_POINT"
5471
fi
55-
else
56-
echo "FAILED to mount, FSX to $MOUNT_POINT directory. Exiting."
57-
exit 1
58-
fi
59-
}
6072

73+
echo "Attempting to mount FSx..."
74+
if sudo mount -t lustre -o noatime,flock "${FSX_DNS_NAME}@tcp:/${FSX_MOUNTNAME}" "$MOUNT_POINT"; then
75+
if is_mounted "$MOUNT_POINT"; then
76+
echo "Mount successful:"
77+
df -h "$MOUNT_POINT"
78+
return 0
79+
else
80+
echo "Error: Mount command succeeded but mountpoint check failed"
81+
sudo dmesg | tail -n 10
82+
exit 1
83+
fi
84+
else
85+
echo "Error: Mount failed"
86+
sudo dmesg | tail -n 10
87+
exit 1
88+
fi
89+
}
6190

6291
load_lnet_modules() {
63-
modprobe -v lnet
92+
echo "Loading kernel modules..."
93+
sudo modprobe lustre || echo "Warning: loading lustre module failed"
94+
sudo modprobe lnet || { echo "Error: Failed to load LNet module"; exit 1; }
95+
sudo lctl network up || { echo "Error: Failed to bring up LNet network"; exit 1; }
6496
}
6597

66-
# create a systemd service to check mount periodically and remount FSx if necessary
67-
# To stop the service, run:
68-
# `systemctl stop check_mount.service`
69-
# To disable the service, run:
70-
# `systemctl disable check_mount.service`
7198
install_remount_service() {
72-
73-
if [[ ! -d /opt/ml/scripts ]]; then
74-
mkdir -p /opt/ml/scripts
75-
chmod 644 /opt/ml/scripts
76-
echo "Created dir /opt/ml/scripts"
77-
fi
99+
echo "Installing remount service..."
100+
if [[ ! -d /opt/ml/scripts ]]; then
101+
sudo mkdir -p /opt/ml/scripts
102+
sudo chmod 755 /opt/ml/scripts
103+
fi
78104

79-
CHECK_MOUNT_FILE=/opt/ml/scripts/check_mount_$FSX_MOUNTNAME.sh
105+
CHECK_MOUNT_FILE=/opt/ml/scripts/check_mount_${FSX_MOUNTNAME//\//_}.sh
80106

81-
cat > $CHECK_MOUNT_FILE << EOF
107+
sudo tee "$CHECK_MOUNT_FILE" > /dev/null << EOF
82108
#!/bin/bash
83109
MOUNT_POINT=$MOUNT_POINT
84-
if ! grep -qs "$MOUNT_POINT" /proc/mounts; then
85-
mount -t lustre -o noatime,flock "$FSX_DNS_NAME"@tcp:/"$FSX_MOUNTNAME" "$MOUNT_POINT"
86-
echo "Mounted FSx to $MOUNT_POINT"
110+
111+
if ! grep -qs "\$MOUNT_POINT" /proc/mounts; then
112+
modprobe lustre
113+
modprobe lnet
114+
lctl network up
115+
mount -t lustre -o noatime,flock "$FSX_DNS_NAME@tcp:/$FSX_MOUNTNAME" "\$MOUNT_POINT"
116+
echo "Mounted FSx to \$MOUNT_POINT"
87117
else
88-
echo "FSx Lustre already mounted to $MOUNT_POINT. Stopping services check_fsx_mount_$FSX_MOUNTNAME.timer and check_fsx_mount_$FSX_MOUNTNAME.service"
89-
systemctl stop check_fsx_mount_$FSX_MOUNTNAME.timer
118+
echo "FSx Lustre already mounted to \$MOUNT_POINT"
90119
fi
91120
EOF
92121

93-
chmod +x $CHECK_MOUNT_FILE
122+
sudo chmod +x "$CHECK_MOUNT_FILE"
94123

95-
cat > /etc/systemd/system/check_fsx_mount_$FSX_MOUNTNAME.service << EOF
124+
sudo tee "/etc/systemd/system/check_fsx_mount_${FSX_MOUNTNAME//\//_}.service" > /dev/null << EOF
96125
[Unit]
97126
Description=Check and remount FSx Lustre filesystems if necessary
127+
After=network-online.target
128+
Wants=network-online.target
98129
99130
[Service]
131+
Type=oneshot
100132
ExecStart=$CHECK_MOUNT_FILE
133+
RemainAfterExit=yes
134+
135+
[Install]
136+
WantedBy=multi-user.target
101137
EOF
102138

103-
cat > /etc/systemd/system/check_fsx_mount_$FSX_MOUNTNAME.timer << EOF
139+
sudo tee "/etc/systemd/system/check_fsx_mount_${FSX_MOUNTNAME//\//_}.timer" > /dev/null << EOF
104140
[Unit]
105-
Description=Run check_fsx_mount_$FSX_MOUNTNAME.service every minute
141+
Description=Run check_fsx_mount_${FSX_MOUNTNAME//\//_}.service every minute
106142
107143
[Timer]
108144
OnBootSec=1min
@@ -112,21 +148,26 @@ OnUnitActiveSec=1min
112148
WantedBy=timers.target
113149
EOF
114150

115-
systemctl daemon-reload
116-
systemctl enable --now check_fsx_mount_$FSX_MOUNTNAME.timer
151+
sudo systemctl daemon-reload
152+
sudo systemctl enable --now "check_fsx_mount_${FSX_MOUNTNAME//\//_}.timer"
117153
}
118154

119155
main() {
120-
echo "Mount_fsx called fsx_dns_name: $FSX_DNS_NAME, fsx_mountname: $FSX_MOUNTNAME"
121-
echo "Using mount_point: $MOUNT_POINT"
122-
load_lnet_modules
123-
check_already_mounted
124-
is_fsx_reachable
125-
add_to_fstab
126-
mount_fs
127-
install_remount_service
128-
echo "FSx Lustre mounted successfully to $MOUNT_POINT"
156+
echo "Starting FSx mount process..."
157+
echo "Parameters:"
158+
echo " FSX_DNS_NAME: $FSX_DNS_NAME"
159+
echo " FSX_MOUNTNAME: $FSX_MOUNTNAME"
160+
echo " MOUNT_POINT: $MOUNT_POINT"
161+
162+
load_lnet_modules
163+
check_already_mounted
164+
is_fsx_reachable
165+
add_to_fstab
166+
mount_fs
167+
install_remount_service
168+
169+
echo "FSx Lustre mount process completed successfully"
170+
df -h "$MOUNT_POINT"
129171
}
130172

131173
main "$@"
132-

1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/mount_fsx_openzfs.sh

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,42 @@ NFS_VERSION=4.2
1313
# Ansible Version
1414
ANSIBLE_VERSION="10.7.0"
1515

16+
# Retry settings
17+
MAX_ATTEMPTS=5
18+
INITIAL_BACKOFF=1
19+
20+
# Function for exponential backoff
21+
retry_with_backoff() {
    # Run a command, retrying with exponential backoff plus jitter on failure.
    #
    # Arguments:
    #   $1  - maximum number of attempts (integer >= 1)
    #   $2  - delay in seconds before the first retry (integer >= 0)
    #   $3+ - the command; all remaining arguments are joined into one string
    #         and run through eval, so callers may pass a single pre-quoted
    #         command line
    # Outputs: progress messages on stdout, argument errors on stderr
    # Returns: 0 as soon as the command succeeds; 1 when the arguments are
    #          invalid or every attempt failed
    #
    # NOTE(review): the command string is eval'd. Only pass trusted,
    # script-controlled strings; never interpolate external input into it.
    local max_attempts=$1
    local initial_backoff=$2
    local cmd="${*:3}"
    local attempt=1
    local backoff=$initial_backoff

    # Without these checks a zero or non-numeric attempt count skips the loop
    # entirely and the function would report success without running anything.
    if ! [[ "$max_attempts" =~ ^[0-9]+$ ]] || [ "$max_attempts" -lt 1 ]; then
        echo "retry_with_backoff: max_attempts must be a positive integer, got '$max_attempts'" >&2
        return 1
    fi
    if ! [[ "$initial_backoff" =~ ^[0-9]+$ ]]; then
        echo "retry_with_backoff: initial_backoff must be a non-negative integer, got '$initial_backoff'" >&2
        return 1
    fi
    if [ -z "$cmd" ]; then
        echo "retry_with_backoff: no command given" >&2
        return 1
    fi

    while [ "$attempt" -le "$max_attempts" ]; do
        if eval "$cmd"; then
            return 0
        fi

        if [ "$attempt" -eq "$max_attempts" ]; then
            echo "Failed after $attempt attempts"
            return 1
        fi

        echo "Attempt $attempt failed. Retrying in $backoff seconds..."
        sleep "$backoff"

        # Exponential backoff with jitter (0-2 extra seconds) so that many
        # nodes retrying at once do not stay in lockstep.
        backoff=$(( 10#$backoff * 2 + (RANDOM % 3) ))
        attempt=$((attempt + 1))
    done

    return 1
}
46+
1647
# Function for error handling
1748
handle_error()
1849
{
1950
local exit_code=$?
20-
echo "Error occured in command: $BASH_COMMAND"
51+
echo "Error occurred in command: $BASH_COMMAND"
2152
echo "Exit code: $exit_code"
2253
exit $exit_code
2354
}
@@ -33,32 +64,45 @@ verify_parameters()
3364
fi
3465
}
3566

36-
# Install Ansible and collections: Move to higher LCS once others start using Ansible too.
67+
# Install Ansible and collections
3768
install_ansible()
3869
{
39-
apt-get update
40-
# apt-get install -y ansible=$ANSIBLE_VERSION
41-
apt-get install -y python3-pip
42-
python3 -m pip install "ansible==${ANSIBLE_VERSION}"
43-
ansible-galaxy collection install ansible.posix
70+
retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "apt-get update"
71+
retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "apt-get install -y python3-pip"
72+
retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "python3 -m pip install 'ansible==${ANSIBLE_VERSION}'"
73+
retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "ansible-galaxy collection install ansible.posix"
4474
}
4575

4676
# Install NFS Client based on OS
4777
install_nfs_client()
4878
{
4979
if [ -f /etc/lsb-release ]; then
5080
# Ubuntu
51-
ansible localhost -b -m ansible.builtin.apt -a "name=nfs-common state=present update_cache=yes"
81+
retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "ansible localhost -b -m ansible.builtin.apt -a 'name=nfs-common state=present update_cache=yes'"
5282
elif [ -f /etc/redhat-release ]; then
5383
# CentOS/RHEL
54-
ansible localhost -b -m ansible.builtin.yum -a "name=nfs-utils state=present"
84+
retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "ansible localhost -b -m ansible.builtin.yum -a 'name=nfs-utils state=present'"
5585
fi
5686
}
5787

5888
# Mount the FSx OpenZFS file system
5989
mount_fs()
6090
{
61-
ansible localhost -b -m ansible.posix.mount -a "path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted"
91+
# Create mount point directory if it doesn't exist
92+
if [ ! -d "$OPENZFS_MOUNT_POINT" ]; then
93+
mkdir -p "$OPENZFS_MOUNT_POINT"
94+
fi
95+
96+
retry_with_backoff $MAX_ATTEMPTS $INITIAL_BACKOFF "ansible localhost -b -m ansible.posix.mount -a \"path=$OPENZFS_MOUNT_POINT src=$FSX_OPENZFS_DNS_NAME:/fsx fstype=nfs opts=nfsvers=$NFS_VERSION,_netdev,nconnect=16,x-systemd.automount,x-systemd.requires=network-online.target dump=0 passno=0 state=mounted\""
97+
}
98+
99+
# Verify mount was successful
100+
verify_mount()
{
    # Confirm that $OPENZFS_MOUNT_POINT is an active mount point.
    # Returns 0 when it is; otherwise prints a failure message and exits the
    # script with status 1.
    mountpoint -q "$OPENZFS_MOUNT_POINT" && return 0

    echo "Failed to verify mount point $OPENZFS_MOUNT_POINT"
    exit 1
}
63107

64108
main()
@@ -69,7 +113,8 @@ main()
69113
install_ansible
70114
install_nfs_client
71115
mount_fs
116+
verify_mount
72117
echo "FSx OpenZFS mounted successfully to $OPENZFS_MOUNT_POINT"
73118
}
74119

75-
main "$@"
120+
main "$@"

0 commit comments

Comments
 (0)