From ac60ff0b720a862149e2b03efbbd95397bd1ea63 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Fri, 21 Nov 2025 16:28:57 +0800
Subject: [PATCH 1/8] Update CopyKernel.cpp

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 51 ++++++++++---------------
 1 file changed, 20 insertions(+), 31 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index ca159164c3..92409aae10 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -5,9 +5,8 @@
 #include
 #include
-#include
-
 #include
+#include
 
 namespace at::native::xpu {
 
 template <typename SrcType, typename DstType>
@@ -25,6 +24,20 @@ struct CastScalarFunc {
   }
 };
 
+template <>
+struct CastScalarFunc<Half, Float8_e4m3fn> {
+  C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
+    float f_val = static_cast<float>(src_val);
+    uint16_t half_bits;
+    std::memcpy(&half_bits, &src_val, sizeof(uint16_t));
+
+    if (half_bits == 0x8000) {
+      return Float8_e4m3fn(-0.0f);
+    }
+    return Float8_e4m3fn(f_val);
+  }
+};
+
 void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   ScalarType dtype = iter.dtype(0);
   ScalarType other_dtype = iter.dtype(1);
@@ -88,21 +101,6 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
         gpu_kernel(iter, CopyScalarFunc<Float8_e5m2fnuz>());
         break;
     }
-  } else if (dtype == kFloat8_e8m0fnu) {
-    switch (other_dtype) {
-      case kFloat:
-        gpu_kernel_nocast(iter, CastScalarFunc<float, Float8_e8m0fnu>());
-        break;
-      case kHalf:
-        gpu_kernel_nocast(iter, CastScalarFunc<Half, Float8_e8m0fnu>());
-        break;
-      case kBFloat16:
-        gpu_kernel_nocast(iter, CastScalarFunc<BFloat16, Float8_e8m0fnu>());
-        break;
-      default:
-        gpu_kernel(iter, CopyScalarFunc<Float8_e8m0fnu>());
-        break;
-    }
   } else {
     TORCH_CHECK(
         false,
@@ -111,16 +109,6 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   }
 }
 
-void float4_copy_kernel_xpu(TensorIteratorBase& iter) {
-  ScalarType src_dtype = iter.dtype(1);
-
-  if (src_dtype == kFloat4_e2m1fn_x2) {
-    gpu_kernel_nocast(iter, CopyScalarFunc<Float4_e2m1fn_x2>());
-  } else {
-    TORCH_CHECK(false, "Copy from ", src_dtype, " to Float4_e2m1fn_x2 has not been supported.");
-  }
-}
-
 void copy_kernel(TensorIteratorBase& iter) {
   ScalarType dtype = iter.common_dtype();
   if (isQIntType(dtype)) {
@@ -129,8 +117,6 @@ void copy_kernel(TensorIteratorBase& iter) {
     });
   } else if (isFloat8Type(iter.dtype(0))) {
     float8_copy_kernel_xpu(iter);
-  } else if (iter.dtype(0) == kFloat4_e2m1fn_x2) {
-    float4_copy_kernel_xpu(iter);
   } else {
     AT_DISPATCH_V2(
         dtype,
@@ -141,8 +127,11 @@ void copy_kernel(TensorIteratorBase& iter) {
         kBool,
         kBFloat16,
         kComplexHalf,
-        AT_EXPAND(AT_FLOAT8_TYPES),
-        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+        kFloat8_e4m3fn,
+        kFloat8_e5m2,
+        kFloat8_e4m3fnuz,
+        kFloat8_e5m2fnuz);
   }
 }

From 3bb86ae3f5bd83d8ba8638a3ad8f0e9c9d73fba2 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Fri, 21 Nov 2025 16:39:26 +0800
Subject: [PATCH 2/8] Update CopyKernel.cpp

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 37 +++++++++++++++++++++----
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index 92409aae10..920cdf71b6 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -5,9 +5,10 @@
 #include
 #include
-#include
 #include
+#include
+
 namespace at::native::xpu {
 
 template <typename SrcType, typename DstType>
@@ -101,6 +102,21 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
         gpu_kernel(iter, CopyScalarFunc<Float8_e5m2fnuz>());
         break;
     }
+  } else if (dtype == kFloat8_e8m0fnu) {
+    switch (other_dtype) {
+      case kFloat:
+        gpu_kernel_nocast(iter, CastScalarFunc<float, Float8_e8m0fnu>());
+        break;
+      case kHalf:
+        gpu_kernel_nocast(iter, CastScalarFunc<Half, Float8_e8m0fnu>());
+        break;
+      case kBFloat16:
+        gpu_kernel_nocast(iter, CastScalarFunc<BFloat16, Float8_e8m0fnu>());
+        break;
+      default:
+        gpu_kernel(iter, CopyScalarFunc<Float8_e8m0fnu>());
+        break;
+    }
   } else {
     TORCH_CHECK(
         false,
@@ -109,6 +125,16 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   }
 }
 
+void float4_copy_kernel_xpu(TensorIteratorBase& iter) {
+  ScalarType src_dtype = iter.dtype(1);
+
+  if (src_dtype == kFloat4_e2m1fn_x2) {
+    gpu_kernel_nocast(iter, CopyScalarFunc<Float4_e2m1fn_x2>());
+  } else {
+    TORCH_CHECK(false, "Copy from ", src_dtype, " to Float4_e2m1fn_x2 has not been supported.");
+  }
+}
+
 void copy_kernel(TensorIteratorBase& iter) {
   ScalarType dtype = iter.common_dtype();
   if (isQIntType(dtype)) {
@@ -117,6 +143,8 @@ void copy_kernel(TensorIteratorBase& iter) {
     });
   } else if (isFloat8Type(iter.dtype(0))) {
     float8_copy_kernel_xpu(iter);
+  } else if (iter.dtype(0) == kFloat4_e2m1fn_x2) {
+    float4_copy_kernel_xpu(iter);
   } else {
     AT_DISPATCH_V2(
         dtype,
@@ -127,11 +155,8 @@ void copy_kernel(TensorIteratorBase& iter) {
         kBool,
         kBFloat16,
         kComplexHalf,
-        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
-        kFloat8_e4m3fn,
-        kFloat8_e5m2,
-        kFloat8_e4m3fnuz,
-        kFloat8_e5m2fnuz);
+        AT_EXPAND(AT_FLOAT8_TYPES),
+        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
   }
 }
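
Patches 1 and 2 shuffle the float8/float4 dispatch back and forth; the change that survives them is the Half -> Float8_e4m3fn specialization, which detects fp16 negative zero by its bit pattern rather than by value. Under IEEE-754 rules -0.0 compares equal to +0.0, so a value comparison cannot tell the two zeros apart; reading the raw bits can. The sketch below is a minimal standalone illustration of that check, using a plain uint16_t buffer in place of c10::Half (an assumption made for portability, not the kernel's actual types):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // IEEE fp16 negative zero is the bit pattern 0x8000: sign bit set, all
    // exponent and mantissa bits clear. Copying the raw bits out of the
    // half-precision value distinguishes -0.0 from +0.0, which an ordinary
    // floating-point comparison cannot do.
    bool is_fp16_negative_zero(const void* half_storage) {
      std::uint16_t bits;
      std::memcpy(&bits, half_storage, sizeof(bits));
      return bits == 0x8000;
    }

    int main() {
      std::uint16_t neg_zero = 0x8000; // -0.0 in fp16
      std::uint16_t pos_zero = 0x0000; // +0.0 in fp16
      std::printf("-0.0 detected: %d\n", is_fp16_negative_zero(&neg_zero));
      std::printf("+0.0 detected: %d\n", is_fp16_negative_zero(&pos_zero));
    }
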
From d0d732778eb015f0d101bc4f9307ff3021abb173 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Thu, 27 Nov 2025 16:35:47 +0800
Subject: [PATCH 3/8] method2

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index 920cdf71b6..9e509014e1 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -28,14 +28,11 @@ struct CastScalarFunc {
 template <>
 struct CastScalarFunc<Half, Float8_e4m3fn> {
   C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
-    float f_val = static_cast<float>(src_val);
-    uint16_t half_bits;
-    std::memcpy(&half_bits, &src_val, sizeof(uint16_t));
-
-    if (half_bits == 0x8000) {
-      return Float8_e4m3fn(-0.0f);
-    }
-    return Float8_e4m3fn(f_val);
+    // TODO(Temporarily): Avoid using sycl::half to prevent the fp16 -> fp32 ->
+    // fp8
+    // fusion from incorrectly converting -0.0 to NaN. This temporary fix should
+    // be removed once the compiler error is resolved.
+    return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val));
   }
 };
 

From cb52f9304d4f65ef8df21fb053811c1c727ad847 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Thu, 27 Nov 2025 16:45:40 +0800
Subject: [PATCH 4/8] format

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index 9e509014e1..1c20190c17 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -28,8 +28,7 @@ struct CastScalarFunc {
 template <>
 struct CastScalarFunc<Half, Float8_e4m3fn> {
   C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
-    // TODO(Temporarily): Avoid using sycl::half to prevent the fp16 -> fp32 ->
-    // fp8
+    // TODO(Temporarily): Avoid using sycl::half to prevent the fp16->fp32->fp8
     // fusion from incorrectly converting -0.0 to NaN. This temporary fix should
     // be removed once the compiler error is resolved.
     return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val));
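
Patches 3 and 4 drop the bit test in favor of c10::detail::fp16_ieee_to_fp32_value, which decodes the fp16 bit pattern in integer arithmetic, so the sycl::half hardware conversion that mangles -0.0 never runs. Below is a minimal sketch of such a software decode; it covers only zeros and normal numbers (the real c10 helper also handles subnormals, infinities, and NaN), and the function name is illustrative rather than the c10 implementation:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Software fp16 -> fp32 decode for zeros and normal numbers. The
    // conversion is pure integer bit manipulation, so the sign bit of
    // -0.0 survives untouched: it is simply shifted into place.
    float fp16_bits_to_fp32(std::uint16_t h) {
      std::uint32_t sign = static_cast<std::uint32_t>(h & 0x8000u) << 16;
      std::uint32_t exp = (h >> 10) & 0x1Fu;
      std::uint32_t mant = h & 0x3FFu;
      std::uint32_t f32;
      if (exp == 0 && mant == 0) {
        f32 = sign; // +/-0.0: only the sign bit is kept
      } else {
        f32 = sign | ((exp + 112u) << 23) | (mant << 13); // rebias 15 -> 127
      }
      float out;
      std::memcpy(&out, &f32, sizeof(out));
      return out;
    }

    int main() {
      std::printf("%f\n", fp16_bits_to_fp32(0x8000)); // -0.000000
      std::printf("%f\n", fp16_bits_to_fp32(0x3C00)); // 1.000000
      std::printf("%f\n", fp16_bits_to_fp32(0xC000)); // -2.000000
    }
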
From 8f621523b99fb48ceb1a099537d67989aa58dd1a Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng"
Date: Sun, 30 Nov 2025 21:49:15 -0800
Subject: [PATCH 5/8] Fix FP16 to FP32 conversion

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 29 +++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index 1c20190c17..e9b79a3a22 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -25,13 +25,34 @@ struct CastScalarFunc {
   }
 };
 
+// TODO: Avoid using sycl::half to prevent the fp16->fp32->fp8 fusion
+// from incorrectly converting -0.0 to NaN. This temporary fix should
+// be removed once the compiler/driver error is resolved.
 template <>
 struct CastScalarFunc<Half, Float8_e4m3fn> {
   C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
-    // TODO(Temporarily): Avoid using sycl::half to prevent the fp16->fp32->fp8
-    // fusion from incorrectly converting -0.0 to NaN. This temporary fix should
-    // be removed once the compiler error is resolved.
-    return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val));
+    return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  }
+};
+
+template <>
+struct CastScalarFunc<Half, Float8_e4m3fnuz> {
+  C10_HOST_DEVICE Float8_e4m3fnuz operator()(Half src_val) const {
+    return Float8_e4m3fnuz(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  }
+};
+
+template <>
+struct CastScalarFunc<Half, Float8_e5m2> {
+  C10_HOST_DEVICE Float8_e5m2 operator()(Half src_val) const {
+    return Float8_e5m2(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  }
+};
+
+template <>
+struct CastScalarFunc<Half, Float8_e5m2fnuz> {
+  C10_HOST_DEVICE Float8_e5m2fnuz operator()(Half src_val) const {
+    return Float8_e5m2fnuz(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
   }
 };
 

From 42de29b8e161cf38c26a5a677e6af7301bfbd859 Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng"
Date: Sun, 30 Nov 2025 21:49:35 -0800
Subject: [PATCH 6/8] Add test cases

---
 test/regressions/test_conversion.py | 41 +++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 test/regressions/test_conversion.py

diff --git a/test/regressions/test_conversion.py b/test/regressions/test_conversion.py
new file mode 100644
index 0000000000..3b27f84964
--- /dev/null
+++ b/test/regressions/test_conversion.py
@@ -0,0 +1,41 @@
+# Owner(s): ["module: intel"]
+import torch
+from torch.testing._internal.common_device_type import (
+    dtypes,
+    instantiate_device_type_tests,
+)
+from torch.testing._internal.common_dtype import float8_types
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+cpu_device = torch.device("cpu")
+xpu_device = torch.device("xpu")
+
+
+class TestSimpleConversion(TestCase):
+    def _compare_convert_with_cpu(self, src_cpu, dtype):
+        src_xpu = src_cpu.to(xpu_device)
+        dst_cpu = src_cpu.to(dtype)
+        dst_xpu = src_xpu.to(dtype)
+        self.assertEqual(dst_xpu.to(cpu_device), dst_cpu)
+
+    @dtypes(*float8_types())
+    def test_half_zero(self, dtype):
+        pos_zero_fp16_cpu = torch.zeros((5, 6), dtype=torch.float16)
+        self._compare_convert_with_cpu(pos_zero_fp16_cpu, dtype)
+
+        neg_zero_fp16_cpu = torch.full((5, 6), -0.0, dtype=torch.float16)
+        self._compare_convert_with_cpu(neg_zero_fp16_cpu, dtype)
+
+    @dtypes(*float8_types())
+    def test_half_nonzero(self, dtype):
+        x_fp16_cpu = torch.arange(-100.0, 101.0, dtype=torch.float16)
+        self._compare_convert_with_cpu(x_fp16_cpu, dtype)
+
+
+instantiate_device_type_tests(
+    TestSimpleConversion, globals(), only_for="xpu", allow_xpu=True
+)
+
+
+if __name__ == "__main__":
+    run_tests()
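
The regression test added in patch 6 checks XPU against CPU for both signed zeros and a sweep of finite values across every float8 dtype; thanks to the run_tests() entry point it can also be run directly as python test/regressions/test_conversion.py. The -0.0 case is the delicate one because of how the float8 formats encode zero. The classifier below sketches the e4m3fn side, assuming the OCP-style layout c10 uses (1 sign, 4 exponent, 3 mantissa bits, with S.1111.111 reserved as NaN); the fnuz variants instead reuse the 0x80 pattern as their only NaN and have no negative zero at all:

    #include <cstdint>
    #include <cstdio>

    // Classify a float8 e4m3fn byte. The bug under test made fp16 -0.0
    // (bits 0x8000) come back from the device-side cast as a NaN byte
    // instead of the negative-zero byte 0x80.
    const char* classify_e4m3fn(std::uint8_t b) {
      if ((b & 0x7F) == 0x7F) return "NaN"; // S.1111.111
      if (b == 0x00) return "+0.0";
      if (b == 0x80) return "-0.0";
      return "finite nonzero";
    }

    int main() {
      const std::uint8_t samples[] = {0x00, 0x80, 0x7F, 0xFF, 0x3C};
      for (std::uint8_t b : samples) {
        std::printf("0x%02X -> %s\n", b, classify_e4m3fn(b));
      }
    }
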
From 58f54b597f96bce15315bfd8242093f79c3afafd Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng"
Date: Mon, 1 Dec 2025 05:13:31 -0800
Subject: [PATCH 7/8] Simplify template

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 29 ++++---------------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index e9b79a3a22..ef0dd74e3f 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -28,31 +28,10 @@ struct CastScalarFunc {
 // TODO: Avoid using sycl::half to prevent the fp16->fp32->fp8 fusion
 // from incorrectly converting -0.0 to NaN. This temporary fix should
 // be removed once the compiler/driver error is resolved.
-template <>
-struct CastScalarFunc<Half, Float8_e4m3fn> {
-  C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
-    return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
-  }
-};
-
-template <>
-struct CastScalarFunc<Half, Float8_e4m3fnuz> {
-  C10_HOST_DEVICE Float8_e4m3fnuz operator()(Half src_val) const {
-    return Float8_e4m3fnuz(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
-  }
-};
-
-template <>
-struct CastScalarFunc<Half, Float8_e5m2> {
-  C10_HOST_DEVICE Float8_e5m2 operator()(Half src_val) const {
-    return Float8_e5m2(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
-  }
-};
-
-template <>
-struct CastScalarFunc<Half, Float8_e5m2fnuz> {
-  C10_HOST_DEVICE Float8_e5m2fnuz operator()(Half src_val) const {
-    return Float8_e5m2fnuz(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+template <typename Float8DataType>
+struct CastScalarFunc<Half, Float8DataType> {
+  C10_HOST_DEVICE Float8DataType operator()(Half src_val) const {
+    return Float8DataType(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
   }
 };

From ddf4e882f99ada6dd33789c0a8aef3d1f11364e2 Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng"
Date: Tue, 2 Dec 2025 06:54:32 -0800
Subject: [PATCH 8/8] Refine fixing

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index ef0dd74e3f..96ea87b581 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -30,8 +30,9 @@ struct CastScalarFunc {
 // be removed once the compiler/driver error is resolved.
 template <typename Float8DataType>
 struct CastScalarFunc<Half, Float8DataType> {
-  C10_HOST_DEVICE Float8DataType operator()(Half src_val) const {
-    return Float8DataType(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  Float8DataType operator()(Half src_val) const {
+    Half val = src_val == Half(-0.0) ? Half(0.0) : src_val;
+    return Float8DataType(val);
   }
 };
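
The final form of the fix gives up on routing around the hardware conversion and instead rewrites -0.0 before the cast. Because IEEE comparison treats the two zeros as equal, src_val == Half(-0.0) matches +0.0 as well, so both zeros are canonicalized to Half(0.0), while every other value, NaN included, falls through unchanged (NaN compares unequal to everything). The float-based sketch below shows the same idiom; note the trade-off that the sign of zero is not preserved, which the copy accepts in exchange for never producing a spurious NaN:

    #include <cstdio>

    // Canonicalize both zeros to +0.0 before a narrowing cast. IEEE-754
    // comparison cannot distinguish -0.0 from +0.0, so the test matches
    // both zeros, while NaN never matches and passes through unchanged.
    float canonicalize_zero(float src_val) {
      return src_val == -0.0f ? 0.0f : src_val;
    }

    int main() {
      const float values[] = {-0.0f, 0.0f, -1.5f, 2.0f};
      for (float v : values) {
        std::printf("%g -> %g\n", v, canonicalize_zero(v));
      }
    }
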