diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index 643e53974a..69689ac7a6 100644 --- a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -10,10 +10,40 @@ #include #include "common.h" +#include "device.h" #include "neighbor_list.h" namespace deepmd { +/** + * @brief Select the per-rank GPU before PyTorch can create a default context. + * + * Some PyTorch/CUDA queries and the torch custom-op library loader may create a + * CUDA/HIP context on the current runtime device. In MPI jobs the runtime + * default is usually GPU 0, so selecting the rank-local GPU first avoids every + * rank leaving a small, unused context on GPU 0. + * Only GOOGLE_CUDA/TENSORFLOW_USE_ROCM builds with a GPU call DPSetDevice(). + * @param[in] gpu_rank Rank-local GPU index passed by the caller (assumed >= 0). + * @param[out] gpu_id gpu_rank modulo the visible GPU count, or 0 if none. + * @param[out] gpu_enabled Value returned by torch::cuda::is_available(). + */ +inline void preselect_torch_device(const int& gpu_rank, + int& gpu_id, + bool& gpu_enabled) { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + int gpu_num = 0; + DPGetDeviceCount(gpu_num); + gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0; + if (gpu_num > 0) { + DPErrcheck(DPSetDevice(gpu_id)); + } +#else + int gpu_num = torch::cuda::device_count(); + gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0; +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + gpu_enabled = torch::cuda::is_available(); +} + /** * @brief Build comm_dict tensors from sendlist/sendnum/recvnum buffers. * diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc index 67265415ce..66a2b7dd81 100644 --- a/source/api_cc/src/DeepPotPT.cc +++ b/source/api_cc/src/DeepPotPT.cc @@ -53,18 +53,13 @@ void DeepPotPT::init(const std::string& model, << std::endl; return; } + preselect_torch_device(gpu_rank, gpu_id, gpu_enabled); deepmd::load_op_library(); - int gpu_num = torch::cuda::device_count(); - gpu_id = (gpu_num > 0) ? 
(gpu_rank % gpu_num) : 0; - gpu_enabled = torch::cuda::is_available(); torch::Device device(torch::kCUDA, gpu_id); if (!gpu_enabled) { device = torch::Device(torch::kCPU); std::cout << "load model from: " << model << " to cpu " << std::endl; } else { -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - DPErrcheck(DPSetDevice(gpu_id)); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM std::cout << "load model from: " << model << " to gpu " << gpu_id << std::endl; } diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 96033fcab4..4a642b3b31 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -64,9 +64,11 @@ void DeepPotPTExpt::init(const std::string& model, return; } + preselect_torch_device(gpu_rank, gpu_id, gpu_enabled); + // Load libdeepmd_op_pt.so so its TORCH_LIBRARY_FRAGMENT entries // (deepmd::*, deepmd_export::*) are visible to torch's dispatcher - // before the AOTI module loads. Without this, multi-rank GNN .pt2 + // before the AOTI module loads. Without this, multi-rank GNN .pt2 // archives fail at pair_style time with // ``Could not find schema for deepmd_export::border_op``. deepmd::load_op_library(); @@ -77,18 +79,11 @@ void DeepPotPTExpt::init(const std::string& model, "Please provide a file path instead."); } - int gpu_num = torch::cuda::device_count(); - gpu_id = (gpu_num > 0) ? 
(gpu_rank % gpu_num) : 0; - gpu_enabled = torch::cuda::is_available(); - std::string device_str; if (!gpu_enabled) { device_str = "cpu"; std::cout << "load model from: " << model << " to cpu" << std::endl; } else { -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - DPErrcheck(DPSetDevice(gpu_id)); -#endif device_str = "cuda:" + std::to_string(gpu_id); std::cout << "load model from: " << model << " to gpu " << gpu_id << std::endl; diff --git a/source/api_cc/src/DeepSpinPT.cc b/source/api_cc/src/DeepSpinPT.cc index aa4e05591d..f1fca5e067 100644 --- a/source/api_cc/src/DeepSpinPT.cc +++ b/source/api_cc/src/DeepSpinPT.cc @@ -52,22 +52,13 @@ void DeepSpinPT::init(const std::string& model, << std::endl; return; } + preselect_torch_device(gpu_rank, gpu_id, gpu_enabled); deepmd::load_op_library(); - int gpu_num = torch::cuda::device_count(); - if (gpu_num > 0) { - gpu_id = gpu_rank % gpu_num; - } else { - gpu_id = 0; - } torch::Device device(torch::kCUDA, gpu_id); - gpu_enabled = torch::cuda::is_available(); if (!gpu_enabled) { device = torch::Device(torch::kCPU); std::cout << "load model from: " << model << " to cpu " << std::endl; } else { -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - DPErrcheck(DPSetDevice(gpu_id)); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM std::cout << "load model from: " << model << " to gpu " << gpu_id << std::endl; } diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc index f5870247f4..97f886a7db 100644 --- a/source/api_cc/src/DeepSpinPTExpt.cc +++ b/source/api_cc/src/DeepSpinPTExpt.cc @@ -64,8 +64,10 @@ void DeepSpinPTExpt::init(const std::string& model, return; } + preselect_torch_device(gpu_rank, gpu_id, gpu_enabled); + // Load libdeepmd_op_pt.so so deepmd_export::* schemas are visible - // to torch's dispatcher before the AOTI module loads. See + // to torch's dispatcher before the AOTI module loads. See // DeepPotPTExpt::init for the full rationale. 
deepmd::load_op_library(); @@ -75,18 +77,11 @@ void DeepSpinPTExpt::init(const std::string& model, "Please provide a file path instead."); } - int gpu_num = torch::cuda::device_count(); - gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0; - gpu_enabled = torch::cuda::is_available(); - std::string device_str; if (!gpu_enabled) { device_str = "cpu"; std::cout << "load model from: " << model << " to cpu" << std::endl; } else { -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - DPErrcheck(DPSetDevice(gpu_id)); -#endif device_str = "cuda:" + std::to_string(gpu_id); std::cout << "load model from: " << model << " to gpu " << gpu_id << std::endl; diff --git a/source/api_cc/src/DeepTensorPT.cc b/source/api_cc/src/DeepTensorPT.cc index 1636f3af95..3036217ef7 100644 --- a/source/api_cc/src/DeepTensorPT.cc +++ b/source/api_cc/src/DeepTensorPT.cc @@ -9,6 +9,7 @@ #include #include "common.h" +#include "commonPT.h" #include "device.h" #include "errors.h" @@ -74,22 +75,13 @@ void DeepTensorPT::init(const std::string& model, return; } name_scope = name_scope_; + preselect_torch_device(gpu_rank, gpu_id, gpu_enabled); deepmd::load_op_library(); - int gpu_num = torch::cuda::device_count(); - if (gpu_num > 0) { - gpu_id = gpu_rank % gpu_num; - } else { - gpu_id = 0; - } torch::Device device(torch::kCUDA, gpu_id); - gpu_enabled = torch::cuda::is_available(); if (!gpu_enabled) { device = torch::Device(torch::kCPU); std::cout << "load model from: " << model << " to cpu " << std::endl; } else { -#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM - DPErrcheck(DPSetDevice(gpu_id)); -#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM std::cout << "load model from: " << model << " to gpu " << gpu_id << std::endl; }