diff --git a/mova/engine/trainer/accelerate/accelerate_trainer.py b/mova/engine/trainer/accelerate/accelerate_trainer.py index 82ec87a..f996012 100644 --- a/mova/engine/trainer/accelerate/accelerate_trainer.py +++ b/mova/engine/trainer/accelerate/accelerate_trainer.py @@ -9,6 +9,7 @@ import os, re import torch +import torch_npu from tqdm import tqdm try: @@ -411,6 +412,8 @@ def train(self): loss = loss_dict["loss"] self.accelerator.backward(loss) + # Synchronize NPU stream to avoid async ordering issues with DP replicate training. + torch_npu.npu.current_stream().synchronize() if self.gradient_clip_norm > 0: if self.accelerator.sync_gradients: