OpenMOSS · danieldale2026 · May 11, 2026
diff --git a/mova/engine/trainer/accelerate/accelerate_trainer.py b/mova/engine/trainer/accelerate/accelerate_trainer.py
@@ -9,6 +9,7 @@
 
 import os, re
 import torch
+import torch_npu
 from tqdm import tqdm
 
 try:
@@ -411,6 +412,8 @@ def train(self):
                 loss = loss_dict["loss"]
 
                 self.accelerator.backward(loss)
+                # Block until the current NPU stream has finished the queued backward work; avoids async ordering issues in data-parallel (DP) replicate training.
+                torch_npu.npu.current_stream().synchronize()
 
                 if self.gradient_clip_norm > 0:
                     if self.accelerator.sync_gradients: