Work around the fp16 -> float8 copy turning -0.0 into NaN on XPU.

The Half -> float8 cast functor is specialized so that the input is
widened to fp32 with c10::detail::fp16_ieee_to_fp32_value(src_val.x)
before the float8 value is constructed. A regression test compares the
XPU conversion against the CPU conversion for +0.0, -0.0 and the fp16
values -100..100, for every dtype returned by float8_types().

NOTE(review): this patch was recovered from a flattened copy in which
line breaks and the angle-bracket template lists were lost. The lists
on the new CastScalarFunc specialization are reconstructed from the
names its body uses; the argument order assumes the primary template is
CastScalarFunc<source, destination>. Confirm against the upstream
commit before applying.

diff --git a/src/ATen/native/xpu/sycl/CopyKernel.cpp b/src/ATen/native/xpu/sycl/CopyKernel.cpp
index ca159164c..ef0dd74e3 100644
--- a/src/ATen/native/xpu/sycl/CopyKernel.cpp
+++ b/src/ATen/native/xpu/sycl/CopyKernel.cpp
@@ -25,6 +25,16 @@ struct CastScalarFunc {
   }
 };
 
+// TODO: Avoid using sycl::half to prevent the fp16->fp32->fp8 fusion
+// from incorrectly converting -0.0 to NaN. This temporary fix should
+// be removed once the compiler/driver error is resolved.
+template <typename Float8DataType>
+struct CastScalarFunc<Half, Float8DataType> {
+  C10_HOST_DEVICE Float8DataType operator()(Half src_val) const {
+    return Float8DataType(c10::detail::fp16_ieee_to_fp32_value(src_val.x));
+  }
+};
+
 void float8_copy_kernel_xpu(TensorIteratorBase& iter) {
   ScalarType dtype = iter.dtype(0);
   ScalarType other_dtype = iter.dtype(1);
diff --git a/test/regressions/test_conversion.py b/test/regressions/test_conversion.py
new file mode 100644
index 000000000..3b27f8496
--- /dev/null
+++ b/test/regressions/test_conversion.py
@@ -0,0 +1,41 @@
+# Owner(s): ["module: intel"]
+import torch
+from torch.testing._internal.common_device_type import (
+    dtypes,
+    instantiate_device_type_tests,
+)
+from torch.testing._internal.common_dtype import float8_types
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+cpu_device = torch.device("cpu")
+xpu_device = torch.device("xpu")
+
+
+class TestSimpleConversion(TestCase):
+    def _compare_convert_with_cpu(self, src_cpu, dtype):
+        src_xpu = src_cpu.to(xpu_device)
+        dst_cpu = src_cpu.to(dtype)
+        dst_xpu = src_xpu.to(dtype)
+        self.assertEqual(dst_xpu.to(cpu_device), dst_cpu)
+
+    @dtypes(*float8_types())
+    def test_half_zero(self, dtype):
+        pos_zero_fp16_cpu = torch.zeros((5, 6), dtype=torch.float16)
+        self._compare_convert_with_cpu(pos_zero_fp16_cpu, dtype)
+
+        neg_zero_fp16_cpu = torch.full((5, 6), -0.0, dtype=torch.float16)
+        self._compare_convert_with_cpu(neg_zero_fp16_cpu, dtype)
+
+    @dtypes(*float8_types())
+    def test_half_nonzero(self, dtype):
+        x_fp16_cpu = torch.arange(-100.0, 101.0, dtype=torch.float16)
+        self._compare_convert_with_cpu(x_fp16_cpu, dtype)
+
+
+instantiate_device_type_tests(
+    TestSimpleConversion, globals(), only_for="xpu", allow_xpu=True
+)
+
+
+if __name__ == "__main__":
+    run_tests()