From 5d687c288d0dd510e128f589009a47e22cab4013 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C5=A9=20Ho=C3=A0ng=20Nh=E1=BA=ADt=20Tr=C6=B0=E1=BB=9Dng?=
Date: Wed, 15 Apr 2026 11:48:14 +0000
Subject: [PATCH 1/2] Fix: Move Tensor from CPU to GPU when using FusedLAMB
 Optimizer

---
 colossalai/utils/multi_tensor_apply/multi_tensor_apply.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
index 750c2a32da34..79c2ddb1e220 100644
--- a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
+++ b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
@@ -4,6 +4,7 @@ class MultiTensorApply(object):
     """
     Apply an operation to a list of tensors efficiently.
+    Move tensors to CUDA if they are on CPU.
 
     Args:
         chunk_size (int): Size of a chunk.
 
@@ -32,4 +33,11 @@ def check_avail(self):
 
     def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
         self.check_avail()
+        # Move tensors to GPU if not already on GPU
+        for i, tensor_list in enumerate(tensor_lists):
+            for j, tensor in enumerate(tensor_list):
+                if tensor.device.type == "cpu":
+                    tensor_lists[i][j] = tensor.to("cuda")
+
         return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
+

From eca76f67c2c4b8451bdc43dd62c9092e218f0809 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 15 Apr 2026 12:01:11 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/utils/multi_tensor_apply/multi_tensor_apply.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
index 79c2ddb1e220..3431bf706373 100644
--- a/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
+++ b/colossalai/utils/multi_tensor_apply/multi_tensor_apply.py
@@ -40,4 +40,3 @@ def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
                     tensor_lists[i][j] = tensor.to("cuda")
 
         return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
-
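
---
Note: below is a minimal standalone sketch of the behaviour patch 1/2 adds,
assuming PyTorch with a CUDA device available. `move_lists_to_cuda` is a
hypothetical helper name used only for illustration; the real entry point is
MultiTensorApply.__call__, which FusedLAMB invokes with a fused CUDA op that
expects every tensor in tensor_lists to already live on the GPU.

    import torch

    def move_lists_to_cuda(tensor_lists):
        # Mirror the patch: rewrite CPU entries in place, so the caller's
        # lists reference the CUDA copies after the call returns.
        for i, tensor_list in enumerate(tensor_lists):
            for j, tensor in enumerate(tensor_list):
                if tensor.device.type == "cpu":
                    tensor_lists[i][j] = tensor.to("cuda")
        return tensor_lists

    if torch.cuda.is_available():
        # One list on CPU, one already on GPU -- mixed placement is the
        # failure mode the patch guards against.
        lists = [[torch.zeros(4)], [torch.ones(4, device="cuda")]]
        move_lists_to_cuda(lists)
        assert all(t.device.type == "cuda" for tl in lists for t in tl)

Rewriting the entries in place (rather than building new lists) keeps the
optimizer's own references to tensor_lists pointing at the migrated tensors.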