diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index 643e53974a..69689ac7a6 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -10,10 +10,40 @@
 #include <vector>
 
 #include "common.h"
+#include "device.h"
 #include "neighbor_list.h"
 
 namespace deepmd {
 
+/**
+ * @brief Select the per-rank GPU before PyTorch can create a default context.
+ *
+ * Some PyTorch/CUDA queries and the torch custom-op library loader may create a
+ * CUDA/HIP context on the current runtime device. In MPI jobs the runtime
+ * default is usually GPU 0, so selecting the rank-local GPU first avoids every
+ * rank leaving a small, unused context on GPU 0. The device is only bound in
+ * GOOGLE_CUDA / TENSORFLOW_USE_ROCM builds that count at least one device.
+ * @param[in] gpu_rank Rank-local GPU index passed by the caller.
+ * @param[out] gpu_id gpu_rank modulo the device count, or 0 with no device.
+ * @param[out] gpu_enabled Result of torch::cuda::is_available().
+ */
+inline void preselect_torch_device(const int& gpu_rank,
+                                   int& gpu_id,
+                                   bool& gpu_enabled) {
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  int gpu_num = 0;  // expected to be overwritten by DPGetDeviceCount
+  DPGetDeviceCount(gpu_num);
+  gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;  // wrap rank onto devices
+  if (gpu_num > 0) {  // nothing to bind when no device was counted
+    DPErrcheck(DPSetDevice(gpu_id));
+  }
+#else
+  int gpu_num = torch::cuda::device_count();
+  gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;  // index only, no binding
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
+  gpu_enabled = torch::cuda::is_available();  // deliberately queried last
+}
+
 /**
  * @brief Build comm_dict tensors from sendlist/sendnum/recvnum buffers.
  *
diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc
index 67265415ce..66a2b7dd81 100644
--- a/source/api_cc/src/DeepPotPT.cc
+++ b/source/api_cc/src/DeepPotPT.cc
@@ -53,18 +53,13 @@ void DeepPotPT::init(const std::string& model,
               << std::endl;
     return;
   }
+  preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);
   deepmd::load_op_library();
-  int gpu_num = torch::cuda::device_count();
-  gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;
-  gpu_enabled = torch::cuda::is_available();
   torch::Device device(torch::kCUDA, gpu_id);
   if (!gpu_enabled) {
     device = torch::Device(torch::kCPU);
     std::cout << "load model from: " << model << " to cpu " << std::endl;
   } else {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-    DPErrcheck(DPSetDevice(gpu_id));
-#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     std::cout << "load model from: " << model << " to gpu " << gpu_id
               << std::endl;
   }
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 96033fcab4..4a642b3b31 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -64,9 +64,11 @@ void DeepPotPTExpt::init(const std::string& model,
     return;
   }
 
+  preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);
+
   // Load libdeepmd_op_pt.so so its TORCH_LIBRARY_FRAGMENT entries
   // (deepmd::*, deepmd_export::*) are visible to torch's dispatcher
-  // before the AOTI module loads.  Without this, multi-rank GNN .pt2
+  // before the AOTI module loads. Without this, multi-rank GNN .pt2
   // archives fail at pair_style time with
   // ``Could not find schema for deepmd_export::border_op``.
   deepmd::load_op_library();
@@ -77,18 +79,11 @@ void DeepPotPTExpt::init(const std::string& model,
         "Please provide a file path instead.");
   }
 
-  int gpu_num = torch::cuda::device_count();
-  gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;
-  gpu_enabled = torch::cuda::is_available();
-
   std::string device_str;
   if (!gpu_enabled) {
     device_str = "cpu";
     std::cout << "load model from: " << model << " to cpu" << std::endl;
   } else {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-    DPErrcheck(DPSetDevice(gpu_id));
-#endif
     device_str = "cuda:" + std::to_string(gpu_id);
     std::cout << "load model from: " << model << " to gpu " << gpu_id
               << std::endl;
diff --git a/source/api_cc/src/DeepSpinPT.cc b/source/api_cc/src/DeepSpinPT.cc
index aa4e05591d..f1fca5e067 100644
--- a/source/api_cc/src/DeepSpinPT.cc
+++ b/source/api_cc/src/DeepSpinPT.cc
@@ -52,22 +52,13 @@ void DeepSpinPT::init(const std::string& model,
               << std::endl;
     return;
   }
+  preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);
   deepmd::load_op_library();
-  int gpu_num = torch::cuda::device_count();
-  if (gpu_num > 0) {
-    gpu_id = gpu_rank % gpu_num;
-  } else {
-    gpu_id = 0;
-  }
   torch::Device device(torch::kCUDA, gpu_id);
-  gpu_enabled = torch::cuda::is_available();
   if (!gpu_enabled) {
     device = torch::Device(torch::kCPU);
     std::cout << "load model from: " << model << " to cpu " << std::endl;
   } else {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-    DPErrcheck(DPSetDevice(gpu_id));
-#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     std::cout << "load model from: " << model << " to gpu " << gpu_id
               << std::endl;
   }
diff --git a/source/api_cc/src/DeepSpinPTExpt.cc b/source/api_cc/src/DeepSpinPTExpt.cc
index f5870247f4..97f886a7db 100644
--- a/source/api_cc/src/DeepSpinPTExpt.cc
+++ b/source/api_cc/src/DeepSpinPTExpt.cc
@@ -64,8 +64,10 @@ void DeepSpinPTExpt::init(const std::string& model,
     return;
   }
 
+  preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);
+
   // Load libdeepmd_op_pt.so so deepmd_export::* schemas are visible
-  // to torch's dispatcher before the AOTI module loads.  See
+  // to torch's dispatcher before the AOTI module loads. See
   // DeepPotPTExpt::init for the full rationale.
   deepmd::load_op_library();
 
@@ -75,18 +77,11 @@ void DeepSpinPTExpt::init(const std::string& model,
         "Please provide a file path instead.");
   }
 
-  int gpu_num = torch::cuda::device_count();
-  gpu_id = (gpu_num > 0) ? (gpu_rank % gpu_num) : 0;
-  gpu_enabled = torch::cuda::is_available();
-
   std::string device_str;
   if (!gpu_enabled) {
     device_str = "cpu";
     std::cout << "load model from: " << model << " to cpu" << std::endl;
   } else {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-    DPErrcheck(DPSetDevice(gpu_id));
-#endif
     device_str = "cuda:" + std::to_string(gpu_id);
     std::cout << "load model from: " << model << " to gpu " << gpu_id
               << std::endl;
diff --git a/source/api_cc/src/DeepTensorPT.cc b/source/api_cc/src/DeepTensorPT.cc
index 1636f3af95..3036217ef7 100644
--- a/source/api_cc/src/DeepTensorPT.cc
+++ b/source/api_cc/src/DeepTensorPT.cc
@@ -9,6 +9,7 @@
 #include <sstream>
 
 #include "common.h"
+#include "commonPT.h"
 #include "device.h"
 #include "errors.h"
 
@@ -74,22 +75,13 @@ void DeepTensorPT::init(const std::string& model,
     return;
   }
   name_scope = name_scope_;
+  preselect_torch_device(gpu_rank, gpu_id, gpu_enabled);
   deepmd::load_op_library();
-  int gpu_num = torch::cuda::device_count();
-  if (gpu_num > 0) {
-    gpu_id = gpu_rank % gpu_num;
-  } else {
-    gpu_id = 0;
-  }
   torch::Device device(torch::kCUDA, gpu_id);
-  gpu_enabled = torch::cuda::is_available();
   if (!gpu_enabled) {
     device = torch::Device(torch::kCPU);
     std::cout << "load model from: " << model << " to cpu " << std::endl;
   } else {
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-    DPErrcheck(DPSetDevice(gpu_id));
-#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     std::cout << "load model from: " << model << " to gpu " << gpu_id
               << std::endl;
   }