From ac60ff0b720a862149e2b03efbbd95397bd1ea63 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Fri, 21 Nov 2025 16:28:57 +0800
Subject: [PATCH 1/8] Update CopyKernel.cpp

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 51 ++++++++++---------------
 1 file changed, 20 insertions(+), 31 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index ca159164c3..92409aae10 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -5,9 +5,8 @@
 #include
 #include
-#include
-
 #include
+#include
 
 namespace at::native::xpu {
 
 template <typename SrcType, typename DstType>
@@ -25,6 +24,20 @@ struct CastScalarFunc {
   }
 };
 
+template <>
+struct CastScalarFunc<Half, Float8_e4m3fn> {
+  C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
+    float f_val = static_cast<float>(src_val);
+    uint16_t half_bits;
+    std::memcpy(&half_bits, &src_val, sizeof(uint16_t));
+
+    if (half_bits == 0x8000) {
+      return Float8_e4m3fn(-0.0f);
+    }
+    return Float8_e4m3fn(f_val);
+  }
+};
+
 void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   ScalarType dtype = iter.dtype(0);
   ScalarType other_dtype = iter.dtype(1);
@@ -88,21 +101,6 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
         gpu_kernel(iter, CopyScalarFunc<Float8_e5m2fnuz>());
         break;
     }
-  } else if (dtype == kFloat8_e8m0fnu) {
-    switch (other_dtype) {
-      case kFloat:
-        gpu_kernel_nocast(iter, CastScalarFunc<float, Float8_e8m0fnu>());
-        break;
-      case kHalf:
-        gpu_kernel_nocast(iter, CastScalarFunc<Half, Float8_e8m0fnu>());
-        break;
-      case kBFloat16:
-        gpu_kernel_nocast(iter, CastScalarFunc<BFloat16, Float8_e8m0fnu>());
-        break;
-      default:
-        gpu_kernel(iter, CopyScalarFunc<Float8_e8m0fnu>());
-        break;
-    }
   } else {
     TORCH_CHECK(
         false,
@@ -111,16 +109,6 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   }
 }
 
-void float4_copy_kernel_xpu(TensorIteratorBase& iter) {
-  ScalarType src_dtype = iter.dtype(1);
-
-  if (src_dtype == kFloat4_e2m1fn_x2) {
-    gpu_kernel_nocast(iter, CopyScalarFunc<Float4_e2m1fn_x2>());
-  } else {
-    TORCH_CHECK(false, "Copy from ", src_dtype, " to Float4_e2m1fn_x2 has not been supported.");
-  }
-}
-
 void copy_kernel(TensorIteratorBase& iter) {
   ScalarType dtype = iter.common_dtype();
   if (isQIntType(dtype)) {
@@ -129,8 +117,6 @@ void copy_kernel(TensorIteratorBase& iter) {
     });
   } else if (isFloat8Type(iter.dtype(0))) {
     float8_copy_kernel_xpu(iter);
-  } else if (iter.dtype(0) == kFloat4_e2m1fn_x2) {
-    float4_copy_kernel_xpu(iter);
   } else {
     AT_DISPATCH_V2(
         dtype,
@@ -141,8 +127,11 @@ void copy_kernel(TensorIteratorBase& iter) {
         kBool,
         kBFloat16,
         kComplexHalf,
-        AT_EXPAND(AT_FLOAT8_TYPES),
-        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
+        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
+        kFloat8_e4m3fn,
+        kFloat8_e5m2,
+        kFloat8_e4m3fnuz,
+        kFloat8_e5m2fnuz);
   }
 }

From 3bb86ae3f5bd83d8ba8638a3ad8f0e9c9d73fba2 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Fri, 21 Nov 2025 16:39:26 +0800
Subject: [PATCH 2/8] Update CopyKernel.cpp

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 37 +++++++++++++++++++++----
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index 92409aae10..920cdf71b6 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -5,9 +5,10 @@
 #include
 #include
-#include
 #include
+#include
+
 namespace at::native::xpu {
 
 template <typename SrcType, typename DstType>
@@ -101,6 +102,21 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
         gpu_kernel(iter, CopyScalarFunc<Float8_e5m2fnuz>());
         break;
     }
+  } else if (dtype == kFloat8_e8m0fnu) {
+    switch (other_dtype) {
+      case kFloat:
+        gpu_kernel_nocast(iter, CastScalarFunc<float, Float8_e8m0fnu>());
+        break;
+      case kHalf:
+        gpu_kernel_nocast(iter, CastScalarFunc<Half, Float8_e8m0fnu>());
+        break;
+      case kBFloat16:
+        gpu_kernel_nocast(iter, CastScalarFunc<BFloat16, Float8_e8m0fnu>());
+        break;
+      default:
+        gpu_kernel(iter, CopyScalarFunc<Float8_e8m0fnu>());
+        break;
+    }
   } else {
     TORCH_CHECK(
         false,
@@ -109,6 +125,16 @@ void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   }
 }
 
+void float4_copy_kernel_xpu(TensorIteratorBase& iter) {
+  ScalarType src_dtype = iter.dtype(1);
+
+  if (src_dtype == kFloat4_e2m1fn_x2) {
+    gpu_kernel_nocast(iter, CopyScalarFunc<Float4_e2m1fn_x2>());
+  } else {
+    TORCH_CHECK(false, "Copy from ", src_dtype, " to Float4_e2m1fn_x2 has not been supported.");
+  }
+}
+
 void copy_kernel(TensorIteratorBase& iter) {
   ScalarType dtype = iter.common_dtype();
   if (isQIntType(dtype)) {
@@ -117,6 +143,8 @@ void copy_kernel(TensorIteratorBase& iter) {
     });
   } else if (isFloat8Type(iter.dtype(0))) {
     float8_copy_kernel_xpu(iter);
+  } else if (iter.dtype(0) == kFloat4_e2m1fn_x2) {
+    float4_copy_kernel_xpu(iter);
   } else {
     AT_DISPATCH_V2(
         dtype,
@@ -127,11 +155,8 @@ void copy_kernel(TensorIteratorBase& iter) {
         kBool,
         kBFloat16,
         kComplexHalf,
-        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
-        kFloat8_e4m3fn,
-        kFloat8_e5m2,
-        kFloat8_e4m3fnuz,
-        kFloat8_e5m2fnuz);
+        AT_EXPAND(AT_FLOAT8_TYPES),
+        AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
   }
 }
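
Patches 1 and 2 shuffle the float8/float4 dispatch back and forth; the change that survives them is the Half -> Float8_e4m3fn specialization, which detects fp16 negative zero by its bit pattern rather than by value. Under IEEE-754 rules -0.0 compares equal to +0.0, so a value comparison cannot tell the two zeros apart; reading the raw bits can. The sketch below is a minimal standalone illustration of that check, using a plain uint16_t buffer in place of c10::Half (an assumption made for portability, not the kernel's actual types):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // IEEE fp16 negative zero is the bit pattern 0x8000: sign bit set, all
    // exponent and mantissa bits clear. Copying the raw bits out of the
    // half-precision value distinguishes -0.0 from +0.0, which an ordinary
    // floating-point comparison cannot do.
    bool is_fp16_negative_zero(const void* half_storage) {
      std::uint16_t bits;
      std::memcpy(&bits, half_storage, sizeof(bits));
      return bits == 0x8000;
    }

    int main() {
      std::uint16_t neg_zero = 0x8000; // -0.0 in fp16
      std::uint16_t pos_zero = 0x0000; // +0.0 in fp16
      std::printf("-0.0 detected: %d\n", is_fp16_negative_zero(&neg_zero));
      std::printf("+0.0 detected: %d\n", is_fp16_negative_zero(&pos_zero));
    }
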
From d0d732778eb015f0d101bc4f9307ff3021abb173 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Thu, 27 Nov 2025 16:35:47 +0800
Subject: [PATCH 3/8] method2

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index 920cdf71b6..9e509014e1 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -28,14 +28,11 @@ struct CastScalarFunc {
 template <>
 struct CastScalarFunc<Half, Float8_e4m3fn> {
   C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
-    float f_val = static_cast<float>(src_val);
-    uint16_t half_bits;
-    std::memcpy(&half_bits, &src_val, sizeof(uint16_t));
-
-    if (half_bits == 0x8000) {
-      return Float8_e4m3fn(-0.0f);
-    }
-    return Float8_e4m3fn(f_val);
+    // TODO(Temporarily): Avoid using sycl::half to prevent the fp16 -> fp32 ->
+    // fp8
+    // fusion from incorrectly converting -0.0 to NaN. This temporary fix should
+    // be removed once the compiler error is resolved.
+    return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val));
   }
 };
 

From cb52f9304d4f65ef8df21fb053811c1c727ad847 Mon Sep 17 00:00:00 2001
From: yucai-intel <108388355+yucai-intel@users.noreply.github.com>
Date: Thu, 27 Nov 2025 16:45:40 +0800
Subject: [PATCH 4/8] format

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index 9e509014e1..1c20190c17 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -28,8 +28,7 @@ struct CastScalarFunc {
 template <>
 struct CastScalarFunc<Half, Float8_e4m3fn> {
   C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
-    // TODO(Temporarily): Avoid using sycl::half to prevent the fp16 -> fp32 ->
-    // fp8
+    // TODO(Temporarily): Avoid using sycl::half to prevent the fp16->fp32->fp8
     // fusion from incorrectly converting -0.0 to NaN. This temporary fix should
     // be removed once the compiler error is resolved.
     return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val));
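
Patches 3 and 4 drop the bit test in favor of c10::detail::fp16_ieee_to_fp32_value, which decodes the fp16 bit pattern in integer arithmetic, so the sycl::half hardware conversion that mangles -0.0 never runs. Below is a minimal sketch of such a software decode; it covers only zeros and normal numbers (the real c10 helper also handles subnormals, infinities, and NaN), and the function name is illustrative rather than the c10 implementation:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Software fp16 -> fp32 decode for zeros and normal numbers. The
    // conversion is pure integer bit manipulation, so the sign bit of
    // -0.0 survives untouched: it is simply shifted into place.
    float fp16_bits_to_fp32(std::uint16_t h) {
      std::uint32_t sign = static_cast<std::uint32_t>(h & 0x8000u) << 16;
      std::uint32_t exp = (h >> 10) & 0x1Fu;
      std::uint32_t mant = h & 0x3FFu;
      std::uint32_t f32;
      if (exp == 0 && mant == 0) {
        f32 = sign; // +/-0.0: only the sign bit is kept
      } else {
        f32 = sign | ((exp + 112u) << 23) | (mant << 13); // rebias 15 -> 127
      }
      float out;
      std::memcpy(&out, &f32, sizeof(out));
      return out;
    }

    int main() {
      std::printf("%f\n", fp16_bits_to_fp32(0x8000)); // -0.000000
      std::printf("%f\n", fp16_bits_to_fp32(0x3C00)); // 1.000000
      std::printf("%f\n", fp16_bits_to_fp32(0xC000)); // -2.000000
    }
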
From 8f621523b99fb48ceb1a099537d67989aa58dd1a Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng"
Date: Sun, 30 Nov 2025 21:49:15 -0800
Subject: [PATCH 5/8] Fix FP16 to FP32 conversion

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 29 +++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index 1c20190c17..e9b79a3a22 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -25,13 +25,34 @@ struct CastScalarFunc {
   }
 };
 
+// TODO: Avoid using sycl::half to prevent the fp16->fp32->fp8 fusion
+// from incorrectly converting -0.0 to NaN. This temporary fix should
+// be removed once the compiler/driver error is resolved.
 template <>
 struct CastScalarFunc<Half, Float8_e4m3fn> {
   C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
-    // TODO(Temporarily): Avoid using sycl::half to prevent the fp16->fp32->fp8
-    // fusion from incorrectly converting -0.0 to NaN. This temporary fix should
-    // be removed once the compiler error is resolved.
-    return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val));
+    return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  }
+};
+
+template <>
+struct CastScalarFunc<Half, Float8_e4m3fnuz> {
+  C10_HOST_DEVICE Float8_e4m3fnuz operator()(Half src_val) const {
+    return Float8_e4m3fnuz(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  }
+};
+
+template <>
+struct CastScalarFunc<Half, Float8_e5m2> {
+  C10_HOST_DEVICE Float8_e5m2 operator()(Half src_val) const {
+    return Float8_e5m2(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  }
+};
+
+template <>
+struct CastScalarFunc<Half, Float8_e5m2fnuz> {
+  C10_HOST_DEVICE Float8_e5m2fnuz operator()(Half src_val) const {
+    return Float8_e5m2fnuz(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
   }
 };
 

From 42de29b8e161cf38c26a5a677e6af7301bfbd859 Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng"
Date: Sun, 30 Nov 2025 21:49:35 -0800
Subject: [PATCH 6/8] Add test cases

---
 test/regressions/test_conversion.py | 41 +++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 test/regressions/test_conversion.py

diff --git a/test/regressions/test_conversion.py b/test/regressions/test_conversion.py
new file mode 100644
index 0000000000..3b27f84964
--- /dev/null
+++ b/test/regressions/test_conversion.py
@@ -0,0 +1,41 @@
+# Owner(s): ["module: intel"]
+import torch
+from torch.testing._internal.common_device_type import (
+    dtypes,
+    instantiate_device_type_tests,
+)
+from torch.testing._internal.common_dtype import float8_types
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+cpu_device = torch.device("cpu")
+xpu_device = torch.device("xpu")
+
+
+class TestSimpleConversion(TestCase):
+    def _compare_convert_with_cpu(self, src_cpu, dtype):
+        src_xpu = src_cpu.to(xpu_device)
+        dst_cpu = src_cpu.to(dtype)
+        dst_xpu = src_xpu.to(dtype)
+        self.assertEqual(dst_xpu.to(cpu_device), dst_cpu)
+
+    @dtypes(*float8_types())
+    def test_half_zero(self, dtype):
+        pos_zero_fp16_cpu = torch.zeros((5, 6), dtype=torch.float16)
+        self._compare_convert_with_cpu(pos_zero_fp16_cpu, dtype)
+
+        neg_zero_fp16_cpu = torch.full((5, 6), -0.0, dtype=torch.float16)
+        self._compare_convert_with_cpu(neg_zero_fp16_cpu, dtype)
+
+    @dtypes(*float8_types())
+    def test_half_nonzero(self, dtype):
+        x_fp16_cpu = torch.arange(-100.0, 101.0, dtype=torch.float16)
+        self._compare_convert_with_cpu(x_fp16_cpu, dtype)
+
+
+instantiate_device_type_tests(
+    TestSimpleConversion, globals(), only_for="xpu", allow_xpu=True
+)
+
+
+if __name__ == "__main__":
+    run_tests()
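
The regression test added in patch 6 checks XPU against CPU for both signed zeros and a sweep of finite values across every float8 dtype; thanks to the run_tests() entry point it can also be run directly as python test/regressions/test_conversion.py. The -0.0 case is the delicate one because of how the float8 formats encode zero. The classifier below sketches the e4m3fn side, assuming the OCP-style layout c10 uses (1 sign, 4 exponent, 3 mantissa bits, with S.1111.111 reserved as NaN); the fnuz variants instead reuse the 0x80 pattern as their only NaN and have no negative zero at all:

    #include <cstdint>
    #include <cstdio>

    // Classify a float8 e4m3fn byte. The bug under test made fp16 -0.0
    // (bits 0x8000) come back from the device-side cast as a NaN byte
    // instead of the negative-zero byte 0x80.
    const char* classify_e4m3fn(std::uint8_t b) {
      if ((b & 0x7F) == 0x7F) return "NaN"; // S.1111.111
      if (b == 0x00) return "+0.0";
      if (b == 0x80) return "-0.0";
      return "finite nonzero";
    }

    int main() {
      const std::uint8_t samples[] = {0x00, 0x80, 0x7F, 0xFF, 0x3C};
      for (std::uint8_t b : samples) {
        std::printf("0x%02X -> %s\n", b, classify_e4m3fn(b));
      }
    }
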
From 58f54b597f96bce15315bfd8242093f79c3afafd Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng"
Date: Mon, 1 Dec 2025 05:13:31 -0800
Subject: [PATCH 7/8] Simplify template

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 29 ++++---------------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index e9b79a3a22..ef0dd74e3f 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -28,31 +28,10 @@ struct CastScalarFunc {
 // TODO: Avoid using sycl::half to prevent the fp16->fp32->fp8 fusion
 // from incorrectly converting -0.0 to NaN. This temporary fix should
 // be removed once the compiler/driver error is resolved.
-template <>
-struct CastScalarFunc<Half, Float8_e4m3fn> {
-  C10_HOST_DEVICE Float8_e4m3fn operator()(Half src_val) const {
-    return Float8_e4m3fn(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
-  }
-};
-
-template <>
-struct CastScalarFunc<Half, Float8_e4m3fnuz> {
-  C10_HOST_DEVICE Float8_e4m3fnuz operator()(Half src_val) const {
-    return Float8_e4m3fnuz(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
-  }
-};
-
-template <>
-struct CastScalarFunc<Half, Float8_e5m2> {
-  C10_HOST_DEVICE Float8_e5m2 operator()(Half src_val) const {
-    return Float8_e5m2(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
-  }
-};
-
-template <>
-struct CastScalarFunc<Half, Float8_e5m2fnuz> {
-  C10_HOST_DEVICE Float8_e5m2fnuz operator()(Half src_val) const {
-    return Float8_e5m2fnuz(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+template <typename Float8DataType>
+struct CastScalarFunc<Half, Float8DataType> {
+  C10_HOST_DEVICE Float8DataType operator()(Half src_val) const {
+    return Float8DataType(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
   }
 };

From ddf4e882f99ada6dd33789c0a8aef3d1f11364e2 Mon Sep 17 00:00:00 2001
From: "Cui, Yifeng"
Date: Tue, 2 Dec 2025 06:54:32 -0800
Subject: [PATCH 8/8] Refine fixing

---
 src/ATen/native/xpu/sycl/CopyKernel.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index ef0dd74e3f..96ea87b581 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -30,8 +30,9 @@ struct CastScalarFunc {
 // be removed once the compiler/driver error is resolved.
 template <typename Float8DataType>
 struct CastScalarFunc<Half, Float8DataType> {
-  C10_HOST_DEVICE Float8DataType operator()(Half src_val) const {
-    return Float8DataType(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  Float8DataType operator()(Half src_val) const {
+    Half val = src_val == Half(-0.0) ? Half(0.0) : src_val;
+    return Float8DataType(val);
   }
 };
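
The final form of the fix gives up on routing around the hardware conversion and instead rewrites -0.0 before the cast. Because IEEE comparison treats the two zeros as equal, src_val == Half(-0.0) matches +0.0 as well, so both zeros are canonicalized to Half(0.0), while every other value, NaN included, falls through unchanged (NaN compares unequal to everything). The float-based sketch below shows the same idiom; note the trade-off that the sign of zero is not preserved, which the copy accepts in exchange for never producing a spurious NaN:

    #include <cstdio>

    // Canonicalize both zeros to +0.0 before a narrowing cast. IEEE-754
    // comparison cannot distinguish -0.0 from +0.0, so the test matches
    // both zeros, while NaN never matches and passes through unchanged.
    float canonicalize_zero(float src_val) {
      return src_val == -0.0f ? 0.0f : src_val;
    }

    int main() {
      const float values[] = {-0.0f, 0.0f, -1.5f, 2.0f};
      for (float v : values) {
        std::printf("%g -> %g\n", v, canonicalize_zero(v));
      }
    }
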