#!/bin/bash
#
# Mounts an FSx for Lustre filesystem, records it in /etc/fstab and installs a
# systemd timer that remounts it if it disappears.
#
# Usage: <script> <fsx_dns_name> <fsx_mountname> <mount_point>
#
# Privileged commands are invoked through sudo by the functions below.

# set -x # Debug output

# FSx Lustre endpoints, taken from the positional arguments.
FSX_DNS_NAME="$1"
FSX_MOUNTNAME="$2"
MOUNT_POINT="$3"
129
# Cleanup function: EXIT handler that prints diagnostics when the script is
# terminating with a non-zero status. It prints nothing on success and never
# alters the exit status itself.
cleanup() {
    # Must be the first statement: any later command would overwrite $?.
    local status=$?

    if [ "$status" -ne 0 ]; then
        echo "Script failed, checking logs..."
        sudo dmesg | tail -n 20
        echo "Mount status:"
        # Best effort: grep finding no lustre mount is not an error here.
        mount | grep lustre || true
        echo "LNet status:"
        # Best effort: lctl may be unavailable if module loading failed.
        sudo lctl list_nids || true
    fi
}

trap cleanup EXIT
23+
# Tests whether a directory is currently a mount point.
# Arguments: $1 - path to test
# Returns:   the exit status of `mountpoint -q` (0 when $1 is a mount point)
is_mounted() {
    sudo mountpoint -q "$1"
}
1728
# Stops the script early when something is already mounted on $MOUNT_POINT.
#
# Globals:   MOUNT_POINT, FSX_MOUNTNAME (read)
# Arguments: $1 - optional mounts table to inspect (default: /proc/mounts)
# Behaviour:
#   - nothing mounted on MOUNT_POINT -> returns 0, script continues
#   - a lustre filesystem whose device ends in ":/<FSX_MOUNTNAME>" is mounted
#     exactly on MOUNT_POINT        -> exit 0
#   - anything else mounted there   -> prints the entry and exit 1
#
# Fields are compared literally, so dots in names are not regex wildcards and
# /fsx does not match /fsx2. Only the ":/<mountname>" suffix of the device is
# compared. NOTE(review): the mounts table is assumed to possibly show a
# resolved NID (IP@tcp) instead of the DNS name - confirm on a live node.
check_already_mounted() {
    local mounts_file="${1:-/proc/mounts}"

    if ! is_mounted "$MOUNT_POINT"; then
        return 0
    fi

    if sudo awk -v mp="$MOUNT_POINT" -v suffix=":/${FSX_MOUNTNAME}" '
        $2 == mp && $3 == "lustre" {
            start = length($1) - length(suffix) + 1
            if (start >= 1 && substr($1, start) == suffix) found = 1
        }
        END { exit !found }
    ' "$mounts_file"; then
        echo "FSx Lustre already mounted to $MOUNT_POINT. Exiting."
        exit 0
    fi

    echo "Warning: $MOUNT_POINT is mounted with different filesystem:"
    sudo awk -v mp="$MOUNT_POINT" '$2 == mp' "$mounts_file"
    exit 1
}
3041
# Probes the FSx endpoint with an LNet ping.
# Globals: FSX_DNS_NAME (read)
# Returns: 0 when `lctl ping` succeeds, 1 otherwise. The failure is only
#          informational: the message states that mounting is tried anyway.
is_fsx_reachable() {
    echo "Checking FSx reachability..."

    if sudo lctl ping "$FSX_DNS_NAME"; then
        echo "FSx is reachable"
        return 0
    fi

    echo "FSx is not reachable. Will try mounting anyway."
    return 1
}
3852
# Registers the FSx mount in fstab, replacing any previous entry for the same
# mount point.
#
# Globals:   FSX_DNS_NAME, FSX_MOUNTNAME, MOUNT_POINT (read)
# Arguments: $1 - optional fstab path (default: /etc/fstab)
# Outputs:   progress message and the appended entry on stdout
# Returns:   0 on success, 1 if the backup or the rewrite fails (the file is
#            left untouched when the backup cannot be made)
add_to_fstab() {
    local fstab="${1:-/etc/fstab}"
    local entry="${FSX_DNS_NAME}@tcp:/${FSX_MOUNTNAME} ${MOUNT_POINT} lustre defaults,noatime,flock,_netdev 0 0"
    local filtered

    echo "Adding mount entry to ${fstab}..."

    # Backup existing fstab
    if ! sudo cp -- "$fstab" "${fstab}.backup.$(date +%Y%m%d_%H%M%S)"; then
        echo "Error: could not back up ${fstab}; leaving it untouched" >&2
        return 1
    fi

    # Remove any existing entries for this mount point. Only lines whose
    # second field equals MOUNT_POINT are dropped; a substring/regex delete
    # would also remove unrelated entries (/fsx would match /fsx2).
    # Comments and lines with fewer than two fields are always kept.
    filtered=$(mktemp) || return 1
    if sudo awk -v mp="$MOUNT_POINT" \
            '/^[ \t]*#/ || NF < 2 || $2 != mp' "$fstab" > "$filtered" \
        && sudo cp -- "$filtered" "$fstab"; then
        rm -f -- "$filtered"
    else
        rm -f -- "$filtered"
        echo "Error: could not rewrite ${fstab}" >&2
        return 1
    fi

    # Add new entry
    printf '%s\n' "$entry" | sudo tee -a "$fstab"
}
4364
# Creates the mount point if needed and mounts the FSx Lustre filesystem on it.
#
# Globals: FSX_DNS_NAME, FSX_MOUNTNAME, MOUNT_POINT (read)
# Returns: 0 when the mount succeeded and is_mounted confirms it.
# Exits:   1 when the directory cannot be created, the mount command fails,
#          or the mount command succeeds but the mount point is not detected.
mount_fs() {
    echo "Preparing to mount FSx..."

    if [[ ! -d "$MOUNT_POINT" ]]; then
        sudo mkdir -p "$MOUNT_POINT" || { echo "Failed to create mount point"; exit 1; }
        sudo chmod 755 "$MOUNT_POINT"
    fi

    echo "Attempting to mount FSx..."
    if ! sudo mount -t lustre -o noatime,flock \
            "${FSX_DNS_NAME}@tcp:/${FSX_MOUNTNAME}" "$MOUNT_POINT"; then
        echo "Error: Mount failed"
        sudo dmesg | tail -n 10
        exit 1
    fi

    # Double-check: a zero exit status from mount is not taken as proof.
    if ! is_mounted "$MOUNT_POINT"; then
        echo "Error: Mount command succeeded but mountpoint check failed"
        sudo dmesg | tail -n 10
        exit 1
    fi

    echo "Mount successful:"
    df -h "$MOUNT_POINT"
    return 0
}
6190
# Loads the Lustre/LNet kernel modules and brings the LNet network up.
# A failure to load `lustre` only produces a warning; a failure to load `lnet`
# or to bring the network up terminates the script with status 1.
load_lnet_modules() {
    echo "Loading kernel modules..."

    if ! sudo modprobe lustre; then
        echo "Warning: loading lustre module failed"
    fi

    if ! sudo modprobe lnet; then
        echo "Error: Failed to load LNet module"
        exit 1
    fi

    if ! sudo lctl network up; then
        echo "Error: Failed to bring up LNet network"
        exit 1
    fi
}
6597
# Creates a systemd service plus timer that checks the mount every minute and
# remounts FSx if necessary.
#
# Files written (slashes in FSX_MOUNTNAME are replaced by underscores):
#   /opt/ml/scripts/check_mount_<name>.sh
#   /etc/systemd/system/check_fsx_mount_<name>.service
#   /etc/systemd/system/check_fsx_mount_<name>.timer
#
# To stop the periodic check, run:
#   `systemctl stop check_fsx_mount_<name>.timer`
# To disable it, run:
#   `systemctl disable check_fsx_mount_<name>.timer`
#
# Globals: FSX_DNS_NAME, FSX_MOUNTNAME, MOUNT_POINT (read),
#          CHECK_MOUNT_FILE (written)
install_remount_service() {
    local unit_suffix="${FSX_MOUNTNAME//\//_}"
    local unit_name="check_fsx_mount_${unit_suffix}"
    local quoted_mount_point quoted_device

    # Shell-quote the values embedded in the generated script so that paths
    # containing spaces or metacharacters survive being re-parsed by bash.
    printf -v quoted_mount_point '%q' "$MOUNT_POINT"
    printf -v quoted_device '%q' "${FSX_DNS_NAME}@tcp:/${FSX_MOUNTNAME}"

    echo "Installing remount service..."
    if [[ ! -d /opt/ml/scripts ]]; then
        sudo mkdir -p /opt/ml/scripts
        sudo chmod 755 /opt/ml/scripts
    fi

    CHECK_MOUNT_FILE="/opt/ml/scripts/check_mount_${unit_suffix}.sh"

    # Unquoted heredoc: ${...} is expanded now, \$... is left for run time.
    sudo tee "$CHECK_MOUNT_FILE" > /dev/null << EOF
#!/bin/bash
MOUNT_POINT=${quoted_mount_point}

if ! mountpoint -q "\$MOUNT_POINT"; then
    modprobe lustre
    modprobe lnet
    lctl network up
    if mount -t lustre -o noatime,flock ${quoted_device} "\$MOUNT_POINT"; then
        echo "Mounted FSx to \$MOUNT_POINT"
    else
        echo "Failed to mount FSx to \$MOUNT_POINT" >&2
        exit 1
    fi
else
    echo "FSx Lustre already mounted to \$MOUNT_POINT"
fi
EOF

    sudo chmod +x "$CHECK_MOUNT_FILE"

    # No RemainAfterExit=yes here: a oneshot unit that stays "active" after
    # exiting is only activated once, so the timer could not re-run it.
    sudo tee "/etc/systemd/system/${unit_name}.service" > /dev/null << EOF
[Unit]
Description=Check and remount FSx Lustre filesystems if necessary
After=network-online.target
Wants=network-online.target

[Service]
Type=oneshot
ExecStart=$CHECK_MOUNT_FILE

[Install]
WantedBy=multi-user.target
EOF

    sudo tee "/etc/systemd/system/${unit_name}.timer" > /dev/null << EOF
[Unit]
Description=Run ${unit_name}.service every minute

[Timer]
OnBootSec=1min
OnUnitActiveSec=1min

[Install]
WantedBy=timers.target
EOF

    sudo systemctl daemon-reload
    sudo systemctl enable --now "${unit_name}.timer"
}
118154
# Entry point: runs the whole mount workflow in order.
#
# Globals: FSX_DNS_NAME, FSX_MOUNTNAME, MOUNT_POINT (read)
# Exits:   2 when any of the three parameters is empty; otherwise the called
#          steps decide (several of them exit 0/1 themselves).
main() {
    # Refuse to touch fstab, systemd or mounts with incomplete parameters.
    if [[ -z "$FSX_DNS_NAME" || -z "$FSX_MOUNTNAME" || -z "$MOUNT_POINT" ]]; then
        echo "Usage: ${0##*/} <fsx_dns_name> <fsx_mountname> <mount_point>" >&2
        exit 2
    fi

    echo "Starting FSx mount process..."
    echo "Parameters:"
    echo "  FSX_DNS_NAME: $FSX_DNS_NAME"
    echo "  FSX_MOUNTNAME: $FSX_MOUNTNAME"
    echo "  MOUNT_POINT: $MOUNT_POINT"

    load_lnet_modules
    check_already_mounted
    # Reachability is informational only; mounting is attempted regardless.
    is_fsx_reachable
    add_to_fstab
    mount_fs
    install_remount_service

    echo "FSx Lustre mount process completed successfully"
    df -h "$MOUNT_POINT"
}
130172
# Run the workflow, forwarding the script's arguments unchanged.
main "$@"
132-
0 commit comments