src/ATen/native/xpu/sycl/CopyKernel.cpp (10 additions, 0 deletions)

@@ -25,6 +25,16 @@ struct CastScalarFunc {
  }
};

// TODO: Avoid using sycl::half to prevent the fp16->fp32->fp8 fusion
// from incorrectly converting -0.0 to NaN. This temporary fix should
// be removed once the compiler/driver error is resolved.
template <typename Float8DataType>
struct CastScalarFunc<Half, Float8DataType> {
  C10_HOST_DEVICE Float8DataType operator()(Half src_val) const {
    return Float8DataType(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
  }
};

void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
  ScalarType dtype = iter.dtype(0);
  ScalarType other_dtype = iter.dtype(1);
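The TODO above describes the motivating failure: with the fused sycl::half path, casting a Half -0.0 tensor to a float8 dtype on XPU could produce NaN. A minimal repro sketch at the Python level (an illustration only, assuming an XPU-enabled PyTorch build; torch.float8_e4m3fn stands in for any float8 target dtype):

import torch

# Sketch only: exercises the fp16 -> fp8 cast the workaround targets.
src = torch.full((5, 6), -0.0, dtype=torch.float16, device="xpu")
dst = src.to(torch.float8_e4m3fn)

# Expected: all elements read back as -0.0; before the fix, the fused
# fp16->fp32->fp8 conversion could yield NaN instead.
print(dst.to(torch.float32).cpu())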
test/regressions/test_conversion.py (41 additions, 0 deletions)

@@ -0,0 +1,41 @@
# Owner(s): ["module: intel"]
import torch
from torch.testing._internal.common_device_type import (
    dtypes,
    instantiate_device_type_tests,
)
from torch.testing._internal.common_dtype import float8_types
from torch.testing._internal.common_utils import run_tests, TestCase

cpu_device = torch.device("cpu")
xpu_device = torch.device("xpu")


class TestSimpleConversion(TestCase):
    def _compare_convert_with_cpu(self, src_cpu, dtype):
        src_xpu = src_cpu.to(xpu_device)
        dst_cpu = src_cpu.to(dtype)
        dst_xpu = src_xpu.to(dtype)
        self.assertEqual(dst_xpu.to(cpu_device), dst_cpu)

    @dtypes(*float8_types())
    def test_half_zero(self, dtype):
        pos_zero_fp16_cpu = torch.zeros((5, 6), dtype=torch.float16)
        self._compare_convert_with_cpu(pos_zero_fp16_cpu, dtype)

        neg_zero_fp16_cpu = torch.full((5, 6), -0.0, dtype=torch.float16)
        self._compare_convert_with_cpu(neg_zero_fp16_cpu, dtype)

    @dtypes(*float8_types())
    def test_half_nonzero(self, dtype):
        x_fp16_cpu = torch.arange(-100.0, 101.0, dtype=torch.float16)
        self._compare_convert_with_cpu(x_fp16_cpu, dtype)


instantiate_device_type_tests(
    TestSimpleConversion, globals(), only_for="xpu", allow_xpu=True
)


if __name__ == "__main__":
    run_tests()
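For reference, instantiate_device_type_tests with only_for="xpu" generates an XPU-specific test class (TestSimpleConversionXPU) with one variant per float8 dtype, so on an XPU-enabled build the regression file can be run directly, e.g. python test/regressions/test_conversion.py.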