Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions source/api_cc/include/commonPT.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,40 @@
#include <vector>

#include "common.h"
#include "device.h"
#include "neighbor_list.h"

namespace deepmd {

/**
 * @brief Bind this rank to its GPU before PyTorch can open a default context.
 *
 * PyTorch/CUDA queries and the torch custom-op library loader may create a
 * CUDA/HIP context on whichever device is current for the runtime. In MPI
 * jobs that is usually GPU 0, so the rank-local device is selected up front
 * to keep every rank from leaving a small, unused context on GPU 0.
 *
 * The chosen index is `gpu_rank` modulo the number of visible devices, or 0
 * when no device is visible. In builds with GOOGLE_CUDA or
 * TENSORFLOW_USE_ROCM the device is also made current through DPSetDevice;
 * in other builds only the index is computed and no device is set here.
 *
 * @param[in] gpu_rank Rank-local GPU index passed by the caller.
 * @param[out] gpu_id Visible GPU selected for this rank.
 * @param[out] gpu_enabled Whether PyTorch reports CUDA/HIP availability.
 */
inline void preselect_torch_device(const int& gpu_rank,
                                   int& gpu_id,
                                   bool& gpu_enabled) {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  // Count devices through the native runtime wrapper rather than through
  // torch, so the device can be chosen before any torch CUDA query runs.
  int visible_gpus = 0;
  DPGetDeviceCount(visible_gpus);
  if (visible_gpus > 0) {
    gpu_id = gpu_rank % visible_gpus;
    DPErrcheck(DPSetDevice(gpu_id));
  } else {
    gpu_id = 0;
  }
#else
  // Without the native runtime wrappers, only compute the index from the
  // device count reported by torch.
  int visible_gpus = torch::cuda::device_count();
  if (visible_gpus > 0) {
    gpu_id = gpu_rank % visible_gpus;
  } else {
    gpu_id = 0;
  }
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  // Must stay last: this torch query runs only after the device selection
  // above has happened.
  gpu_enabled = torch::cuda::is_available();
}

/**
* @brief Build comm_dict tensors from sendlist/sendnum/recvnum buffers.
*
Expand Down
7 changes: 1 addition & 6 deletions source/api_cc/src/DeepPotPT.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,13 @@ void DeepPotPT::init(const std::string& model,
<< std::endl;
return;
}
preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);
deepmd::load_op_library();
int gpu_num = torch::cuda::device_count();
gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;
gpu_enabled = torch::cuda::is_available();
torch::Device device(torch::kCUDA, gpu_id);
if (!gpu_enabled) {
device = torch::Device(torch::kCPU);
std::cout << "load model from: " << model << " to cpu " << std::endl;
} else {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
DPErrcheck(DPSetDevice(gpu_id));
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
std::cout << "load model from: " << model << " to gpu " << gpu_id
<< std::endl;
}
Expand Down
11 changes: 3 additions & 8 deletions source/api_cc/src/DeepPotPTExpt.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,11 @@ void DeepPotPTExpt::init(const std::string& model,
return;
}

preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);

// Load libdeepmd_op_pt.so so its TORCH_LIBRARY_FRAGMENT entries
// (deepmd::*, deepmd_export::*) are visible to torch's dispatcher
// before the AOTI module loads. Without this, multi-rank GNN .pt2
// before the AOTI module loads. Without this, multi-rank GNN .pt2
// archives fail at pair_style time with
// ``Could not find schema for deepmd_export::border_op``.
deepmd::load_op_library();
Expand All @@ -77,18 +79,11 @@ void DeepPotPTExpt::init(const std::string& model,
"Please provide a file path instead.");
}

int gpu_num = torch::cuda::device_count();
gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;
gpu_enabled = torch::cuda::is_available();

std::string device_str;
if (!gpu_enabled) {
device_str = "cpu";
std::cout << "load model from: " << model << " to cpu" << std::endl;
} else {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
DPErrcheck(DPSetDevice(gpu_id));
#endif
device_str = "cuda:" + std::to_string(gpu_id);
std::cout << "load model from: " << model << " to gpu " << gpu_id
<< std::endl;
Expand Down
11 changes: 1 addition & 10 deletions source/api_cc/src/DeepSpinPT.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,22 +52,13 @@ void DeepSpinPT::init(const std::string& model,
<< std::endl;
return;
}
preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);
deepmd::load_op_library();
int gpu_num = torch::cuda::device_count();
if (gpu_num > 0) {
gpu_id = gpu_rank % gpu_num;
} else {
gpu_id = 0;
}
torch::Device device(torch::kCUDA, gpu_id);
gpu_enabled = torch::cuda::is_available();
if (!gpu_enabled) {
device = torch::Device(torch::kCPU);
std::cout << "load model from: " << model << " to cpu " << std::endl;
} else {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
DPErrcheck(DPSetDevice(gpu_id));
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
std::cout << "load model from: " << model << " to gpu " << gpu_id
<< std::endl;
}
Expand Down
11 changes: 3 additions & 8 deletions source/api_cc/src/DeepSpinPTExpt.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@ void DeepSpinPTExpt::init(const std::string& model,
return;
}

preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);

// Load libdeepmd_op_pt.so so deepmd_export::* schemas are visible
// to torch's dispatcher before the AOTI module loads. See
// to torch's dispatcher before the AOTI module loads. See
// DeepPotPTExpt::init for the full rationale.
deepmd::load_op_library();

Expand All @@ -75,18 +77,11 @@ void DeepSpinPTExpt::init(const std::string& model,
"Please provide a file path instead.");
}

int gpu_num = torch::cuda::device_count();
gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;
gpu_enabled = torch::cuda::is_available();

std::string device_str;
if (!gpu_enabled) {
device_str = "cpu";
std::cout << "load model from: " << model << " to cpu" << std::endl;
} else {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
DPErrcheck(DPSetDevice(gpu_id));
#endif
device_str = "cuda:" + std::to_string(gpu_id);
std::cout << "load model from: " << model << " to gpu " << gpu_id
<< std::endl;
Expand Down
12 changes: 2 additions & 10 deletions source/api_cc/src/DeepTensorPT.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <sstream>

#include "common.h"
#include "commonPT.h"
#include "device.h"
#include "errors.h"

Expand Down Expand Up @@ -74,22 +75,13 @@ void DeepTensorPT::init(const std::string& model,
return;
}
name_scope = name_scope_;
preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);
deepmd::load_op_library();
int gpu_num = torch::cuda::device_count();
if (gpu_num > 0) {
gpu_id = gpu_rank % gpu_num;
} else {
gpu_id = 0;
}
torch::Device device(torch::kCUDA, gpu_id);
gpu_enabled = torch::cuda::is_available();
if (!gpu_enabled) {
device = torch::Device(torch::kCPU);
std::cout << "load model from: " << model << " to cpu " << std::endl;
} else {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
DPErrcheck(DPSetDevice(gpu_id));
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
std::cout << "load model from: " << model << " to gpu " << gpu_id
<< std::endl;
}
Expand Down
Loading