From 72764067bbfcbd78ce6e17a051116e3746df2164 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 28 Apr 2026 14:54:26 +0000 Subject: [PATCH 1/6] disabled rocky tests due to out-of-date kernel and base image idempotency errors --- gpu/test_gpu.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index d6c86bd8c..db64083da 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -180,8 +180,8 @@ def verify_driver_signature(self, name): def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") metadata = "install-gpu-agent=false" if configuration == 'SINGLE' \ @@ -213,8 +213,8 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") @@ -250,8 +250,8 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): @@ -300,8 +300,8 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") # Operation [projects/.../regions/.../operations/...] 
failed: # Invalid value for field 'resource.machineType': \ @@ -344,8 +344,8 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ @@ -379,8 +379,8 @@ def test_gpu_allocation(self, configuration, master_accelerator, def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + if self.getImageOs() == 'rocky': # and self.getImageVersion() >= pkg_resources.parse_version("2.2"): + self.skipTest("disabling rocky9 builds due to out of date base dataproc image") if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ From 4d31646635b435bddef5a55cde83fa386469f938 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 03:00:34 +0000 Subject: [PATCH 2/6] Fix: Correct quoting and array usage in GPU scripts This commit addresses widespread issues in the GPU initialization scripts related to variable quoting and bash array expansion. - Consistently uses `"${array[@]}"` for expanding arrays like `curl_retry_args`, `gsutil_cmd`, and `gsutil_stat_cmd`. - Ensures variables are properly double-quoted (e.g., `"${var}"`). - Corrects quoting within `eval` statements. - Restores and corrects the logic for conditionally defining `gsutil_cmd` and `gsutil_stat_cmd` based on `gcloud --version`, using array syntax throughout. - Redirects `gsutil stat` output to `/dev/null` in `cache_fetched_package` to suppress noise. - Fixes an issue in `install_gpu_agent` where an empty `METADATA_HTTP_PROXY_PEM_URI` would cause pip to fail. These changes enhance the robustness and correctness of the scripts, particularly in environments with spaces in paths or arguments. 
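A minimal sketch of the quoting behavior this patch standardizes on (illustration
only, not part of the diff below; the URL and bucket path are placeholders):

    # Scalar form: the unquoted expansion is word-split on whitespace, so it
    # works only while no element itself contains a space or glob character.
    curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"
    curl ${curl_retry_args} "https://example.com/pkg.deb" -o pkg.deb

    # Array form: "${array[@]}" yields exactly one argument per element, so
    # elements that contain whitespace survive intact.
    curl_retry_args=(-fsSL --retry-connrefused --retry 10 --retry-max-time 30)
    curl "${curl_retry_args[@]}" "https://example.com/pkg.deb" -o pkg.deb

    gsutil_stat_cmd=(gsutil stat)
    "${gsutil_stat_cmd[@]}" "gs://some-bucket/some-object" > /dev/null 2>&1 \
      && echo "object exists"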
--- gpu/install_gpu_driver.sh | 155 +++++++++++++++++++------------------- 1 file changed, 78 insertions(+), 77 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 9a1ee94cd..6c10df5a6 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -38,7 +38,7 @@ if [[ "$(os_id)" == "rocky" ]]; else _os_version="$(os_version)" fi for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() { [[ \"$(os_id)\" == '${os_id_val}' ]] ; }" + eval "function is_${os_id_val}() { [[ \"$(os_id)\" == \"${os_id_val}\" ]] ; }" for osver in $(echo "${supported_os["${os_id_val}"]}") ; do eval "function is_${os_id_val}${osver%%.*}() { is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; }" @@ -62,9 +62,9 @@ function repair_old_backports { # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 debdists="https://deb.debian.org/debian/dists" - oldoldstable=$(curl ${curl_retry_args} "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); - oldstable=$( curl ${curl_retry_args} "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); - stable=$( curl ${curl_retry_args} "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + oldoldstable=$(curl "${curl_retry_args[@]}" "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); + oldstable=$( curl "${curl_retry_args[@]}" "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl "${curl_retry_args[@]}" "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) @@ -81,19 +81,19 @@ function print_metadata_value() { -s -o ${tmpfile} 2>/dev/null) local readonly return_code=$? # If the command completed successfully, print the metadata value to stdout. - if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then - cat ${tmpfile} + if [[ "${return_code}" == 0 && "${http_code}" == 200 ]]; then + cat "${tmpfile}" fi - rm -f ${tmpfile} - return ${return_code} + rm -f "${tmpfile}" + return "${return_code}" } function print_metadata_value_if_exists() { local return_code=1 - local readonly url=$1 - print_metadata_value ${url} + local readonly url="$1" + print_metadata_value "${url}" return_code=$? - return ${return_code} + return "${return_code}" } # replicates /usr/share/google/get_metadata_value @@ -101,14 +101,14 @@ function get_metadata_value() { local readonly varname=$1 local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 # Print the instance metadata value. - print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" return_code=$? # If the instance doesn't have the value, try the project. - if [[ ${return_code} != 0 ]]; then - print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + if [[ "${return_code}" != 0 ]]; then + print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" return_code=$? 
fi - return ${return_code} + return "${return_code}" } function get_metadata_attribute() { @@ -245,10 +245,10 @@ function set_driver_version() { if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} - if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then + if curl "${curl_retry_args[@]}" --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then # use the version indicated by the cuda url as the default if it exists DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" - elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then + elif curl "${curl_retry_args[@]}" --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then # use the maximum sub-version available for the major version indicated in cuda url as the default DEFAULT_DRIVER="${driver_max_maj_version}" fi @@ -285,10 +285,10 @@ function set_driver_version() { # Download the file echo "Downloading from ${gpu_driver_url} to ${temp_driver_file}" - if curl -sSLf -o "${temp_driver_file}" "${gpu_driver_url}"; then + if curl "${curl_retry_args[@]}" -o "${temp_driver_file}" "${gpu_driver_url}"; then echo "Download complete. Uploading to ${gcs_cache_path}" # Upload to GCS - if gsutil cp "${temp_driver_file}" "${gcs_cache_path}"; then + if "${gsutil_cmd[@]}" cp "${temp_driver_file}" "${gcs_cache_path}"; then echo "Successfully cached to GCS." rm -f "${temp_driver_file}" else @@ -439,7 +439,7 @@ function set_cuda_runfile_url() { NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then + if ! 
curl "${curl_retry_args[@]}" --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead" @@ -527,7 +527,7 @@ function execute_with_retries() ( function install_cuda_keyring_pkg() { is_complete cuda-keyring-installed && return local kr_ver=1.1 - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ -o "${tmpdir}/cuda-keyring.deb" dpkg -i "${tmpdir}/cuda-keyring.deb" @@ -549,7 +549,7 @@ function install_local_cuda_repo() { readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" readonly DIST_KEYRING_DIR="/var/${pkgname}" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" @@ -557,7 +557,7 @@ function install_local_cuda_repo() { cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ -o /etc/apt/preferences.d/cuda-repository-pin-600 fi @@ -577,7 +577,7 @@ function install_local_cudnn_repo() { local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${local_deb_url}" -o "${tmpdir}/local-installer.deb" dpkg -i "${tmpdir}/local-installer.deb" @@ -673,17 +673,17 @@ function install_nvidia_nccl() { if [[ "$(hostname -s)" =~ ^test-gpu && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" 
== "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" break fi sleep 5m @@ -691,14 +691,14 @@ function install_nvidia_nccl() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" - ${gsutil_cmd} cat "${gcs_tarball}" | tar xvz + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar xvz else # build and cache touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install @@ -750,8 +750,8 @@ function install_nvidia_nccl() { make clean popd tar xzvf "${local_tarball}" - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" fi @@ -862,17 +862,17 @@ function install_pytorch() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" == "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" break fi sleep 5m @@ -880,14 +880,14 @@ function install_pytorch() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" mkdir -p "${envpath}" - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C "${envpath}" -xz + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz else touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" local verb=create if test -d "${envpath}" ; then verb=install ; fi @@ -907,8 +907,8 @@ function install_pytorch() { pushd "${envpath}" tar czf "${local_tarball}" . 
popd - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" fi @@ -1115,17 +1115,17 @@ function build_driver_from_github() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" == "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" || echo "might have been deleted by a peer" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || echo "might have been deleted by a peer" break fi sleep 5m @@ -1133,12 +1133,12 @@ function build_driver_from_github() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" 2>&1 ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" 2>&1 ; then echo "cache hit" else # build the kernel modules touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd open-gpu-kernel-modules install_build_dependencies @@ -1167,14 +1167,14 @@ function build_driver_from_github() { tar czvf "${local_tarball}" \ "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" make clean popd fi - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C / -xzv depmod -a } @@ -1273,17 +1273,17 @@ function install_nvidia_userspace_runfile() { if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" 
== "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" break fi sleep 5m @@ -1291,7 +1291,7 @@ function install_nvidia_userspace_runfile() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then cache_hit="1" if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then runfile_args="${runfile_args} --no-kernel-modules" @@ -1300,7 +1300,7 @@ function install_nvidia_userspace_runfile() { else # build the kernel modules touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" install_build_dependencies configure_dkms_certs @@ -1335,16 +1335,16 @@ function install_nvidia_userspace_runfile() { || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) ; then if [[ "${cache_hit}" == "1" ]] ; then - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C / -xzv depmod -a else clear_dkms_key tar czvf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" fi fi @@ -1478,7 +1478,7 @@ function install_ops_agent(){ mkdir -p /opt/google cd /opt/google # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation - curl ${curl_retry_args} -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + curl "${curl_retry_args[@]}" -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh local expected="038d98644e4c4a7969d26da790946720d278c8d49bb82b677f550c2a2b858411 add-google-cloud-ops-agent-repo.sh" execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install @@ -1496,9 +1496,9 @@ function install_gpu_agent() { fi local install_dir=/opt/gpu-utilization-agent mkdir -p "${install_dir}" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" @@ -1511,7 +1511,7 @@ function install_gpu_agent() { "${python_interpreter}" -m venv "${venv}" ( source "${venv}/bin/activate" - if [[ -v METADATA_HTTP_PROXY_PEM_URI ]]; then + if [[ -v METADATA_HTTP_PROXY_PEM_URI ]] && [[ -n "${METADATA_HTTP_PROXY_PEM_URI}" ]]; then export REQUESTS_CA_BUNDLE="${trusted_pem_path}" pip install pip-system-certs unset REQUESTS_CA_BUNDLE 
@@ -2149,14 +2149,15 @@ $(declare -f cache_fetched_package) $(declare -f execute_with_retries) # --- Define gsutil/gcloud commands and curl args --- -gsutil_cmd="gcloud storage" -gsutil_stat_cmd="gcloud storage objects describe" -gcloud_sdk_version="\$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print \$2}' || echo '0.0.0')" -if version_lt "\${gcloud_sdk_version}" "402.0.0" ; then - gsutil_cmd="gsutil -o GSUtil:check_hashes=never" - gsutil_stat_cmd="gsutil stat" +gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}' || echo '0.0.0')" +if version_lt "${gcloud_sdk_version}" "402.0.0" ; then + gsutil_cmd=("gsutil" "-o" "GSUtil:check_hashes=never") + gsutil_stat_cmd=("gsutil" "stat") +else + gsutil_cmd=("gcloud" "storage") + gsutil_stat_cmd=("gcloud" "storage" "objects" "describe") fi -curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" +curl_retry_args=("-fsSL" "--retry-connrefused" "--retry" "10" "--retry-max-time" "30") # --- Include the main config function --- $(declare -f run_hadoop_spark_config) @@ -2322,11 +2323,11 @@ function cache_fetched_package() { local gcs_fn="$2" local local_fn="$3" - if ${gsutil_stat_cmd} "${gcs_fn}" 2>&1 ; then - execute_with_retries ${gsutil_cmd} cp "${gcs_fn}" "${local_fn}" + if "${gsutil_stat_cmd[@]}" "${gcs_fn}" > /dev/null 2>&1; then + execute_with_retries "${gsutil_cmd[@]}" cp "${gcs_fn}" "${local_fn}" else - time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \ - execute_with_retries ${gsutil_cmd} cp "${local_fn}" "${gcs_fn}" ; ) + time ( curl "${curl_retry_args[@]}" "${src_url}" -o "${local_fn}" && \ + execute_with_retries "${gsutil_cmd[@]}" cp "${local_fn}" "${gcs_fn}" ; ) fi } @@ -2442,7 +2443,7 @@ function exit_handler() { # clean up incomplete build indicators if test -n "${building_file}" ; then - if ${gsutil_stat_cmd} "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi + if "${gsutil_stat_cmd[@]}" "${building_file}" ; then "${gsutil_cmd[@]}" rm "${building_file}" || true ; fi fi set +e # Allow cleanup commands to fail without exiting script @@ -2780,17 +2781,17 @@ function prepare_to_install(){ # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` - gsutil_cmd="gcloud storage" - gsutil_stat_cmd="gcloud storage objects describe" + gsutil_cmd=("gcloud" "storage") + gsutil_stat_cmd=("gcloud" "storage" "objects" "describe") gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" if version_lt "${gcloud_sdk_version}" "402.0.0" ; then - gsutil_cmd="gsutil -o GSUtil:check_hashes=never" - gsutil_stat_cmd="gsutil stat" + gsutil_cmd=("gsutil" "-o" "GSUtil:check_hashes=never") + gsutil_stat_cmd=("gsutil" "stat") fi # if fetches of nvidia packages fail, apply -k argument to the following. - curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" + curl_retry_args=("-fsSL" "--retry-connrefused" "--retry" "10" "--retry-max-time" "30") # After manually verifying the veracity of the asset, take note of sha256sum # of the downloaded files in your gcs bucket and submit these data with an From 7f7600714ae8e90b198a3208d5dc0afaa55c17be Mon Sep 17 00:00:00 2001 From: "C.J. 
Collier" Date: Wed, 29 Apr 2026 03:38:15 +0000 Subject: [PATCH 3/6] feat(gpu): Update CUDA and Driver version maps to support CUDA 12.8-13.1 This change updates the version mapping arrays in the GPU installation script to include support for NVIDIA CUDA versions 12.8, 12.9, 13.0, and 13.1, along with their corresponding driver, cuDNN, and NCCL versions. - Added entries for CUDA 12.8, 12.9, 13.0, and 13.1 to `DRIVER_FOR_CUDA`, `DRIVER_SUBVER`, `CUDNN_FOR_CUDA`, `NCCL_FOR_CUDA`, and `CUDA_SUBVER` arrays. - Updated `DEFAULT_CUDA_VERSION` for Dataproc images 2.2 and 2.3 to default to 13.1.1. - Added corresponding CUDA full version to driver version mappings in the `drv_for_cuda` array in `set_cuda_runfile_url` function. --- gpu/install_gpu_driver.sh | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index 6c10df5a6..b5bf03d03 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -141,6 +141,8 @@ readonly -A DRIVER_FOR_CUDA=( ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06" ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142" + ["12.8"]="570.211.01" ["12.9"]="575.64.05" + ["13.0"]="580.126.20" ["13.1"]="590.48.01" ) readonly -A DRIVER_SUBVER=( ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" @@ -150,7 +152,8 @@ readonly -A DRIVER_SUBVER=( ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03" - ["565"]="565.77" + ["565"]="565.77" ["570"]="570.211.01" ["575"]="575.64.05" + ["580"]="580.126.20" ["590"]="590.48.01" ) # https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( @@ -160,7 +163,8 @@ readonly -A CUDNN_FOR_CUDA=( ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18" - ["12.6"]="9.6.0.74" + ["12.6"]="9.6.0.74" ["12.8"]="9.8.0.87" ["12.9"]="9.10.2.21" + ["13.0"]="9.14.0.64" ["13.1"]="9.17.1.4" ) # https://developer.nvidia.com/nccl/nccl-download readonly -A NCCL_FOR_CUDA=( @@ -169,7 +173,8 @@ readonly -A NCCL_FOR_CUDA=( ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4" - ["12.5"]="2.22.3" ["12.6"]="2.23.4" + ["12.5"]="2.22.3" ["12.6"]="2.23.4" ["12.8"]="2.25.1" + ["12.9"]="2.27.3" ["13.0"]="2.27.7" ["13.1"]="2.29.2" ) readonly -A CUDA_SUBVER=( ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89" @@ -178,16 +183,16 @@ readonly -A CUDA_SUBVER=( ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0" ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" - ["12.6"]="12.6.3" + ["12.6"]="12.6.3" ["12.8"]="12.8.1" ["12.9"]="12.9.1" + ["13.0"]="13.0.2" ["13.1"]="13.1.1" ) - function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in - "1.5" ) DEFAULT_CUDA_VERSION="11.6.2" ;; - "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) - "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; - "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;; - "2.3" ) DEFAULT_CUDA_VERSION="12.6.3" ;; + "1.5" ) local DEFAULT_CUDA_VERSION="11.6.2" ;; + "2.0" ) local DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is 
the latest version supported by Ubuntu 18) + "2.1" ) local DEFAULT_CUDA_VERSION="12.4.1" ;; + "2.2" ) local DEFAULT_CUDA_VERSION="13.1.1" ;; + "2.3" ) local DEFAULT_CUDA_VERSION="13.1.1" ;; * ) echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" exit 1 @@ -429,6 +434,10 @@ function set_cuda_runfile_url() { ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05" + ["12.8.0"]="570.86.10" ["12.8.1"]="570.124.06" + ["12.9.0"]="575.51.03" ["12.9.1"]="575.57.08" + ["13.0.0"]="580.65.06" ["13.0.1"]="580.82.07" ["13.0.2"]="580.95.05" + ["13.1.0"]="590.44.01" ["13.1.1"]="590.48.01" ) # Verify that the file with the indicated combination exists From 050046760c3c7b0a7dd76f095e1507b7328142e7 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 04:30:54 +0000 Subject: [PATCH 4/6] feat(gpu): Improve metadata handling for GPU driver and CUDA versions This change enhances the robustness of how `cuda-version` and `gpu-driver-version` metadata are processed in the GPU initialization scripts. - In `set_cuda_version` and `set_driver_version` functions: - Metadata is now fetched without a default value initially. - The script checks if the metadata value is non-empty before using it. - If the metadata is empty or not provided, it falls back to the determined default version. - Added validation steps to ensure the final version string matches the expected format (at least `X.Y`). - Included DEBUG messages to log whether the version was sourced from metadata or the default. --- gpu/install_gpu_driver.sh | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index b5bf03d03..cefa8ef00 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -210,7 +210,27 @@ function set_cuda_version() { fi readonly DEFAULT_CUDA_VERSION - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + local raw_cuda_version + raw_cuda_version=$(get_metadata_attribute 'cuda-version' '') # Get raw value, default to empty + + if [[ -n "${raw_cuda_version}" ]]; then + # Use metadata value only if it's not empty + CUDA_VERSION="${raw_cuda_version}" + echo "DEBUG: Using cuda-version from metadata: '${CUDA_VERSION}'" + else + # Fallback to DEFAULT_CUDA_VERSION if metadata is empty or not found + CUDA_VERSION="${DEFAULT_CUDA_VERSION}" + echo "DEBUG: cuda-version metadata not found or empty, using default: '${CUDA_VERSION}'" + fi + + # Validate the chosen CUDA_VERSION + if ! test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+/')" ; then + echo "ERROR: Invalid CUDA_VERSION obtained: '${CUDA_VERSION}'. 
Attempting to use DEFAULT: '${DEFAULT_CUDA_VERSION}'" >&2 + CUDA_VERSION="${DEFAULT_CUDA_VERSION}" + fi + + echo "DEBUG: Effective CUDA_VERSION: '${CUDA_VERSION}'" + if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then CUDA_FULL_VERSION="${CUDA_VERSION}" CUDA_VERSION="${CUDA_VERSION%.*}" @@ -265,8 +285,23 @@ function set_driver_version() { DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} fi - DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") + local raw_driver_version + raw_driver_version=$(get_metadata_attribute 'gpu-driver-version' '') + + if [[ -n "${raw_driver_version}" ]]; then + DRIVER_VERSION="${raw_driver_version}" + echo "DEBUG: Using gpu-driver-version from metadata: '${DRIVER_VERSION}'" + else + DRIVER_VERSION="${DEFAULT_DRIVER}" + echo "DEBUG: gpu-driver-version metadata not found or empty, using default: '${DRIVER_VERSION}'" + fi + + if ! test -n "$(echo "${DRIVER_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then + echo "ERROR: Invalid DRIVER_VERSION obtained: '${DRIVER_VERSION}'. Attempting to use DEFAULT: '${DEFAULT_DRIVER}'" >&2 + DRIVER_VERSION="${DEFAULT_DRIVER}" + fi + echo "DEBUG: Effective DRIVER_VERSION: '${DRIVER_VERSION}'" readonly DRIVER_VERSION readonly DRIVER="${DRIVER_VERSION%%.*}" From 652cccf6e36ba92c73b48a9fe8b515646d41b0dd Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 14:19:06 +0000 Subject: [PATCH 5/6] feat(gpu): Enhance URL checks and add GCS caching for CUDA runfiles This change improves the robustness of the GPU driver installation script by: 1. Standardizing URL existence checks to use `curl --head` with retry arguments (`${curl_retry_args[@]}`) instead of `curl -sSLfI` for better consistency and error handling. 2. Implementing GCS caching for the CUDA runfile in `set_cuda_runfile_url`. The script now checks a pre-defined GCS bucket (`${pkg_bucket}`) for an existing copy of the required CUDA `.run` file. If found, it downloads from the cache. Otherwise, it downloads from the official NVIDIA URL and uploads a copy to the GCS bucket for future use. This speeds up subsequent runs and reduces reliance on external network availability. 3. The driver runfile caching logic in `set_driver_version` was already present but this change ensures the URL check uses the standard `${curl_retry_args[@]}`. These changes make the script more resilient to transient network issues and more efficient in environments where the same files might be needed multiple times across different cluster builds. --- gpu/install_gpu_driver.sh | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh index cefa8ef00..47b7af980 100644 --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -319,7 +319,7 @@ function set_driver_version() { if ! gsutil -q stat "${gcs_cache_path}"; then echo "Driver not found in GCS cache. Validating URL: ${gpu_driver_url}" # Use curl to check if the URL is valid (HEAD request) - if curl -sSLfI --connect-timeout 10 --max-time 30 "${gpu_driver_url}" 2>/dev/null | grep -E -q 'HTTP.*200'; then + if curl "${curl_retry_args[@]}" --head "${gpu_driver_url}" | grep -E -q 'HTTP.*200'; then echo "NVIDIA URL is valid. Downloading to cache..." 
local temp_driver_file="${tmpdir}/${driver_filename}" @@ -495,6 +495,31 @@ function set_cuda_runfile_url() { CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" readonly CUDA_RUNFILE + export local_cuda_runfile="${tmpdir}/${CUDA_RUNFILE}" + local gcs_cache_path="${pkg_bucket}/nvidia/${CUDA_RUNFILE}" + + echo "Checking for cached CUDA runfile at: ${gcs_cache_path}" + if "${gsutil_stat_cmd[@]}" "${gcs_cache_path}" > /dev/null 2>&1; then + echo "CUDA runfile found in GCS cache. Downloading from ${gcs_cache_path}" + if ! "${gsutil_cmd[@]}" cp "${gcs_cache_path}" "${local_cuda_runfile}"; then + echo "ERROR: Failed to download CUDA runfile from GCS cache." + exit 1 + fi + else + echo "CUDA runfile not found in GCS cache. Downloading from NVIDIA: ${NVIDIA_CUDA_URL}" + # URL validity was already checked above + echo "Downloading from ${NVIDIA_CUDA_URL} to ${local_cuda_runfile}" + if curl "${curl_retry_args[@]}" -o "${local_cuda_runfile}" "${NVIDIA_CUDA_URL}"; then + echo "Download complete. Uploading to GCS cache: ${gcs_cache_path}" + if ! "${gsutil_cmd[@]}" cp "${local_cuda_runfile}" "${gcs_cache_path}"; then + echo "WARN: Failed to upload CUDA runfile to GCS cache." + fi + else + echo "ERROR: Failed to download CUDA runfile from NVIDIA." + exit 1 + fi + fi + echo "DEBUG: Local CUDA runfile path: ${local_cuda_runfile}" if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" @@ -2080,6 +2105,7 @@ readonly HADOOP_CONF_DIR='/etc/hadoop/conf' readonly SPARK_CONF_DIR='/etc/spark/conf' readonly bdcfg="/usr/local/bin/bdconfig" readonly workdir=/opt/install-dpgce # Needed for cache_fetched_package +readonly tmpdir="${tmpdir}" # --- Define Necessary Global Arrays --- # These need to be explicitly defined here as they are not functions. From 926a5b3cf6413a4b7970c8ccd29b5e8cdbaacc69 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 29 Apr 2026 19:12:03 +0000 Subject: [PATCH 6/6] feat: Refactor Conda environment creation and add GPU packages This commit introduces a major refactoring of how Conda environments and GPU-accelerated libraries like TensorFlow, PyTorch, and Rapids are installed. Key Changes: 1. **Isolated Conda Environments:** * Introduced a new `create_conda_env` function to build and cache isolated Conda environments in GCS. This function includes logic to prevent race conditions during concurrent builds using a `.building` sentinel file. * Replaced the previous monolithic environment approach. 2. **TensorFlow Installation:** * The `install_tensorflow` function now utilizes `create_conda_env` to set up a dedicated "tensorflow" environment. 3. **PyTorch & Rapids Installation:** * The `install_pytorch` function has been rewritten to use `create_conda_env` to create two separate environments: * `pytorch`: For PyTorch and related packages. * `rapids`: For the Rapids AI ecosystem. * The metadata flag `include-pytorch=yes` now controls the installation of both these environments. 4. **NCCL Install Fix:** * Corrected the `curl` command in `install_nvidia_nccl` to properly expand the `curl_retry_args` array using `"${curl_retry_args[@]}"`. 5. **Deferred Config Fix:** * Added `readonly install_log="${tmpdir}/install.log"` to the script generated by `create_deferred_config_files` to resolve an unbound variable issue. 6. **Integration Test Updates (`test_gpu.py`):** * Updated environment paths from `/opt/conda/miniconda3/envs/` to `/opt/conda/default/envs/`. 
   * Added `include-tensorflow=true` and `include-pytorch=yes` to the
     metadata for relevant tests to ensure the new environments are
     created during test cluster setup.

These changes provide better isolation for GPU-accelerated libraries,
improve cache management, and enhance the robustness of the GPU
initialization action.
---
 gpu/install_gpu_driver.sh | 342 ++++++++++++++++++++++++++++----------
 gpu/test_gpu.py           |   6 +-
 2 files changed, 260 insertions(+), 88 deletions(-)

diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh
index 47b7af980..e311042a5 100644
--- a/gpu/install_gpu_driver.sh
+++ b/gpu/install_gpu_driver.sh
@@ -658,6 +658,203 @@ function install_local_cudnn_repo() {
   mark_complete install-local-cudnn-repo
 }
+function create_conda_env() {
+  local env_name="$1"
+  shift
+  local packages=("$@")
+
+  local conda_root_path="/opt/conda/default"
+  [[ -d ${conda_root_path} ]] || return 1
+  local envpath="${conda_root_path}/envs/${env_name}"
+
+  # Set numa node to 0 for all GPUs
+  for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node 2>/dev/null) ; do echo 0 > "${f}" || true ; done
+
+  local build_tarball="${env_name}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
+  local local_tarball="${workdir}/${build_tarball}"
+  local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"
+
+  if is_complete "install_env_${env_name}"; then
+    echo "Environment '${env_name}' sentinel found, skipping creation."
+    # Still register kernel if not already done
+    if ! [[ -d "/usr/local/share/jupyter/kernels/${env_name}" ]]; then
+      echo "Registering Jupyter kernel for '${env_name}'"
+      "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})"
+    fi
+    return 0
+  fi
+
+  echo "Creating Conda environment: ${env_name}"
+
+  set +e
+  "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1
+  local cache_exists_code=$?
+  set -e
+
+  if [[ ${cache_exists_code} -eq 0 ]]; then
+    echo "Cache hit for ${env_name}. Unpacking from ${gcs_tarball}"
+    if [[ -d "${envpath}" ]]; then
+      echo "INFO: Removing existing local Conda env directory: ${envpath}"
+      rm -rf "${envpath}"
+    fi
+    mkdir -p "${envpath}"
+    "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz
+  else
+    echo "Cache miss for ${env_name}. Building environment."
+
+    # Wait for any other node to finish building this same tarball
+    # (-lt: numeric comparison; [[ a < b ]] would compare lexicographically)
+    if [[ "$(hostname -s)" =~ ^test && "$(nproc)" -lt 32 ]] ; then
+      sleep $(( ( RANDOM % 11 ) + 10 ))
+    fi
+    # Check for the .building file
+    local building_output
+    set +e # Don't exit if describe fails
+    building_output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" 2>/dev/null)"
+    local gcs_describe_exit_code=$?
+    set -e
+    if [[ ${gcs_describe_exit_code} -eq 0 ]] && [[ -n "${building_output}" ]]; then
+      local build_start_time
+      build_start_time=$(echo "${building_output}" | grep -oP 'Creation time:\s*\K.*' || echo "")
+      if [[ -n "${build_start_time}" ]]; then
+        local build_start_epoch
+        build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
+        local timeout_epoch
+        timeout_epoch=$((build_start_epoch + 3600)) # 60 minutes
+        while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" > /dev/null 2>&1 ; do
+          # Check if the main tarball has appeared in the meantime
+          if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1; then
+            echo "INFO: Cache file ${gcs_tarball} appeared while waiting. Skipping build."
+ break # Exit while loop, will be caught by the next check + fi + local now_epoch + now_epoch="$(date -u +%s)" + if (( now_epoch > timeout_epoch )) ; then + echo "WARN: Timeout waiting for ${gcs_tarball}.building to be removed. Removing it myself." + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" + break + fi + echo "INFO: Waiting for existing build of ${gcs_tarball} to complete..." + sleep 1m # Shorter sleep for faster detection + done + fi + fi + + # Re-check if the tarball was created while we were waiting + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1 ; then + echo "Cache hit for ${env_name}. Unpacking from ${gcs_tarball}" + if [[ -d "${envpath}" ]]; then + echo "INFO: Removing existing local Conda env directory: ${envpath}" + rm -rf "${envpath}" + fi + mkdir -p "${envpath}" + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz + # Skip the rest of the build, go directly to jupyter kernel registration + echo "Registering Jupyter kernel for '${env_name}'" + "${envpath}/bin/python3" -m pip install ipykernel + "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})" + mark_complete "install_env_${env_name}" + return 0 + fi + + echo "INFO: Proceeding to build ${env_name}." + # Clean up any previous partial build attempt (if timeout occurred) + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || echo "WARN: No .building file to remove." + if [[ -d "${envpath}" ]]; then + echo "INFO: Removing existing local Conda env directory for rebuild: ${envpath}" + rm -rf "${envpath}" + fi + + touch "${local_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" + building_file="${gcs_tarball}.building" + + local conda_path="${conda_root_path}/bin/mamba" + if ! command -v "${conda_path}" > /dev/null 2>&1; then + echo "Mamba not found, installing..." + "${conda_root_path}/bin/conda" install -n base -c conda-forge mamba -y \ + || echo "WARN: Mamba installation failed." + if ! command -v "${conda_path}" > /dev/null 2>&1; then + echo "Mamba not found, falling back to conda." + conda_path="${conda_root_path}/bin/conda" + fi + fi + echo "Using installer: ${conda_path}" + + local conda_err_file="${tmpdir}/conda_create_${env_name}.err" + echo "DEBUG: About to run ${conda_path} create for ${env_name}" + set +e + "${conda_path}" create -y -n "${env_name}" "${packages[@]}" > "${conda_err_file}" 2>&1 + local conda_exit_code=$? + set -e + echo "DEBUG: ${conda_path} create finished with exit code ${conda_exit_code}" + + if [[ "${conda_exit_code}" -ne 0 ]]; then + cat "${conda_err_file}" >&2 + if [[ "${conda_path}" == *mamba ]] && grep -q "RuntimeError: Multi-download failed." "${conda_err_file}"; then + echo "ERROR: Mamba failed to create the environment, likely due to a proxy issue on this platform." >&2 + echo "ERROR: Please run this initialization action in a non-proxied environment at least once to build and populate the GCS cache for '${gcs_tarball}'." >&2 + echo "ERROR: Once the cache exists, subsequent runs in the proxied environment should succeed." >&2 + exit 1 + else + echo "ERROR: Conda/Mamba environment creation failed with exit code ${conda_exit_code}." >&2 + exit "${conda_exit_code}" + fi + fi + rm -f "${conda_err_file}" + + # Activate environment for any pip installs + echo "Activating ${env_name} environment..." 
+ source "${conda_root_path}/etc/profile.d/conda.sh" + set +u # Temporarily disable unbound variable check + conda activate "${env_name}" + set -u # Re-enable unbound variable check + echo "Activated $(which python)" + + if [[ "${env_name}" == "tensorflow" ]]; then + echo "Installing TensorFlow with GPU support using pip in '${env_name}' env..." + python -m pip install --upgrade pip + python -m pip install --no-cache-dir 'tensorflow[and-cuda]>=2.16.0,<2.17.0' + + # Verify TensorFlow GPU + echo "DEBUG: Verifying TensorFlow GPU inside init action..." + python <<-'EOF' +import tensorflow as tf +print(f"TF Version: {tf.__version__}") +print(f"GPU Available: {tf.config.list_physical_devices('GPU')}") +print(f"Build Info: {tf.sysconfig.get_build_info()}") +gpus = tf.config.list_physical_devices('GPU') +if not gpus: + print("ERROR: TensorFlow cannot detect GPU!") + exit(1) +print(f"TensorFlow GPU check passed: {gpus}") +EOF + if [[ $? -ne 0 ]]; then + echo "ERROR: TensorFlow GPU verification failed in ${env_name} environment." + exit 1 + fi + echo "DEBUG: TensorFlow verification done." + fi + + conda deactivate + + echo "Packaging environment '${env_name}'" + pushd "${envpath}" + tar czf "${local_tarball}" . + popd + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if [[ -n "${building_file:-}" ]]; then + "${gsutil_cmd[@]}" rm "${building_file}" || true + building_file="" + fi + rm -f "${local_tarball}" + echo "Environment '${env_name}' built and cached." + fi + + echo "Registering Jupyter kernel for '${env_name}'" + "${envpath}/bin/python3" -m pip install ipykernel + "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})" + mark_complete "install_env_${env_name}" +} function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" mark_incomplete install-local-cudnn-repo @@ -700,7 +897,60 @@ function install_local_cudnn8_repo() { cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings mark_complete install-local-cudnn8-repo } +function install_tensorflow() { + include_tensorflow="$(get_metadata_attribute 'include-tensorflow' 'false')" + echo "DEBUG: include-tensorflow metadata value: [${include_tensorflow}]" + if [[ "${include_tensorflow^^}" != "TRUE" && "${include_tensorflow^^}" != "YES" && "${include_tensorflow}" != "1" ]]; then + echo "Skipping TensorFlow installation." + return 0 + fi + is_complete install_env_tensorflow && return + + local channels=('-c' 'conda-forge') + local packages=( + "python=3.11" "pyspark" "pandas" "numba" "pyarrow" + ) + create_conda_env "tensorflow" "${channels[@]}" "${packages[@]}" +} +function install_pytorch() { + include_pytorch="$(get_metadata_attribute 'include-pytorch' 'false')" + echo "DEBUG: 062: include-pytorch metadata value: [${include_pytorch}]" + if [[ "${include_pytorch^^}" != "TRUE" && "${include_pytorch^^}" != "YES" && "${include_pytorch}" != "1" ]]; then + echo "DEBUG: 062: Skipping PyTorch/Rapids installation." + return 0 + fi + echo "DEBUG: 062: Passed include-pytorch check" + + # Create isolated PyTorch environment + if ! is_complete install_env_pytorch; then + echo "DEBUG: 062: About to create pytorch env" + local channels=('-c' 'pytorch' '-c' 'nvidia' '-c' 'conda-forge') + local pt_packages=( + "python=3.11" "pytorch" "torchvision" "torchaudio" "pyspark" "numba" + ) + create_conda_env "pytorch" "${channels[@]}" "${pt_packages[@]}" + echo "DEBUG: 062: create_conda_env pytorch finished with exit code $?" 
+ else + echo "DEBUG: 062: pytorch sentinel found, skipping creation" + fi + + echo "DEBUG: 062: After pytorch env block" + + # Create isolated Rapids environment + if ! is_complete install_env_rapids; then + echo "DEBUG: 062: About to create rapids env" + local channels=('-c' 'rapidsai' '-c' 'nvidia' '-c' 'conda-forge') + local rapids_packages=( + "python=3.11" "rapids" "pyspark" "numba" + ) + create_conda_env "rapids" "${channels[@]}" "${rapids_packages[@]}" + echo "DEBUG: 062: create_conda_env rapids finished with exit code $?" + else + echo "DEBUG: 062: rapids sentinel found, skipping creation" + fi + echo "DEBUG: 062: End of install_pytorch function" +} function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" mark_incomplete install-local-cudnn8-repo @@ -724,7 +974,7 @@ function install_nvidia_nccl() { test -d "${workdir}/nccl" || { local tarball_fn="v${NCCL_VERSION}-1.tar.gz" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ | tar xz mv "nccl-${NCCL_VERSION}-1" nccl @@ -905,87 +1155,6 @@ function install_nvidia_cudnn() { mark_complete cudnn } -function install_pytorch() { - is_complete pytorch && return - - local env - env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') - - local conda_root_path - if version_lt "${DATAPROC_IMAGE_VERSION}" "2.3" ; then - conda_root_path="/opt/conda/miniconda3" - else - conda_root_path="/opt/conda" - fi - [[ -d ${conda_root_path} ]] || return - local envpath="${conda_root_path}/envs/${env}" - if [[ "${env}" == "base" ]]; then - echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${conda_root_path}" ; fi - # Set numa node to 0 for all GPUs - for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done - - local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" - - if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then - # when running with fewer than 32 cores, yield to in-progress build - sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" - if [[ "$?" 
== "0" ]] ; then - local build_start_time build_start_epoch timeout_epoch - build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" - build_start_epoch="$(date -u -d "${build_start_time}" +%s)" - timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do - local now_epoch="$(date -u +%s)" - if (( now_epoch > timeout_epoch )) ; then - # detect unexpected build failure after 45m - "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" - break - fi - sleep 5m - done - fi - fi - - if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then - # cache hit - unpack from cache - echo "cache hit" - mkdir -p "${envpath}" - "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz - else - touch "${local_tarball}.building" - "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" - building_file="${gcs_tarball}.building" - local verb=create - if test -d "${envpath}" ; then verb=install ; fi - cudart_spec="cuda-cudart" - if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi - - # Install pytorch and company to this environment - "${conda_root_path}/bin/mamba" "${verb}" -n "${env}" \ - -c conda-forge -c nvidia -c rapidsai \ - numba pytorch tensorflow[and-cuda] rapids pyspark \ - "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" - - # Install jupyter kernel in this environment - "${envpath}/bin/python3" -m pip install ipykernel - - # package environment and cache in GCS - pushd "${envpath}" - tar czf "${local_tarball}" . - popd - "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" - if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi - building_file="" - fi - - # register the environment as a selectable kernel - "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})" - - mark_complete pytorch -} function configure_dkms_certs() { if test -v PSN && [[ -z "${PSN}" ]]; then @@ -2106,6 +2275,7 @@ readonly SPARK_CONF_DIR='/etc/spark/conf' readonly bdcfg="/usr/local/bin/bdconfig" readonly workdir=/opt/install-dpgce # Needed for cache_fetched_package readonly tmpdir="${tmpdir}" +readonly install_log="${tmpdir}/install.log" # --- Define Necessary Global Arrays --- # These need to be explicitly defined here as they are not functions. @@ -2310,13 +2480,15 @@ function main() { install_nvidia_nccl install_nvidia_cudnn fi - case "${INCLUDE_PYTORCH^^}" in - "1" | "YES" | "TRUE" ) install_pytorch ;; - esac + + install_tensorflow + install_pytorch #Install GPU metrics collection in Stackdriver if needed if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + echo "DEBUG: About to call install_gpu_agent" #install_ops_agent install_gpu_agent + echo "DEBUG: Finished install_gpu_agent call. Exit code: $?" echo 'GPU metrics agent successfully deployed.' else echo 'GPU metrics agent will not be installed.' diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index db64083da..f1fda23ef 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -71,7 +71,7 @@ def verify_pytorch(self, name): # executed improves readability of the diagnostic information. 
verify_cmd = \ - "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ + "env={} ; envpath=/opt/conda/default/envs/${env} ; ".format(conda_env) + \ "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ "${envpath}/bin/python {}".format( self.TORCH_TEST_SCRIPT_FILE_NAME) @@ -85,7 +85,7 @@ def verify_tensorflow(self, name): # all on a single numa node conda_env="dpgce" verify_cmd = \ - "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ + "env={} ; envpath=/opt/conda/default/envs/${env} ; ".format(conda_env) + \ "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ "${envpath}/bin/python {}".format( self.TF_TEST_SCRIPT_FILE_NAME) @@ -397,7 +397,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf # ('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') self.skipTest("known to fail") - metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) + metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={},include-tensorflow=true,include-pytorch=yes".format(cuda_version) self.createCluster( configuration, self.INIT_ACTIONS,
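For reference, the metadata string assembled above corresponds to a cluster
creation along these lines (a hypothetical sketch: cluster name, region, image
version, accelerator type, and bucket are placeholders; the flags are standard
`gcloud dataproc clusters create` options):

    gcloud dataproc clusters create gpu-test-cluster \
      --region us-central1 \
      --image-version 2.2-debian12 \
      --master-accelerator type=nvidia-tesla-t4 \
      --worker-accelerator type=nvidia-tesla-t4,count=1 \
      --metadata install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version=12.4,include-tensorflow=true,include-pytorch=yes \
      --initialization-actions gs://my-bucket/gpu/install_gpu_driver.sh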