From 752a5bb0316dd8ec5c7f836abef5f8a3bb5aeff3 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Tue, 26 May 2026 14:41:44 +0800 Subject: [PATCH 1/2] This PR updates the Stable Diffusion IP-Adapter integration tests to keep them passing across PyTorch versions. These cases pass on PyTorch 2.11, but fail on PyTorch 2.12 because torch.randn no longer produces the same values for the same seed, which changes the generated latents and breaks the expected image slices. Signed-off-by: Wang, Yi --- .../test_ip_adapter_stable_diffusion.py | 294 ++++++++++-------- 1 file changed, 161 insertions(+), 133 deletions(-) diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index 32590111cdf3..1073ef06bf2c 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -15,6 +15,7 @@ import gc import unittest +from unittest import mock import numpy as np import torch @@ -53,6 +54,15 @@ class IPAdapterNightlyTestsMixin(unittest.TestCase): dtype = torch.float16 + _SD_PIPELINE_RANDN_TENSOR_TARGETS = { + StableDiffusionPipeline: "diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.randn_tensor", + StableDiffusionImg2ImgPipeline: "diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.randn_tensor", + StableDiffusionInpaintPipeline: "diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.randn_tensor", + StableDiffusionXLPipeline: "diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.randn_tensor", + StableDiffusionXLImg2ImgPipeline: "diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.randn_tensor", + StableDiffusionXLInpaintPipeline: "diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint.randn_tensor", + } + def setUp(self): # clean up the VRAM before each test super().setUp() @@ -65,6 +75,22 @@ def tearDown(self): gc.collect() backend_empty_cache(torch_device) + def get_fixed_noise(self, shape=(1, 4, 64, 64), seed=33): + return torch.from_numpy(np.random.RandomState(seed).standard_normal(shape)).to(torch.float32) + + def get_fixed_randn_tensor_patch(self, pipeline, shape=(1, 4, 64, 64), seed=33): + fixed_noise = self.get_fixed_noise(shape=shape, seed=seed) + + def fake_randn_tensor(requested_shape, generator=None, device=None, dtype=None, layout=None): + self.assertEqual(tuple(requested_shape), tuple(fixed_noise.shape)) + return fixed_noise.to(device=device, dtype=dtype) + + for pipeline_cls, target in self._SD_PIPELINE_RANDN_TENSOR_TARGETS.items(): + if isinstance(pipeline, pipeline_cls): + return mock.patch(target, side_effect=fake_randn_tensor) + + self.fail(f"No fixed randn_tensor patch target configured for pipeline type {type(pipeline)}") + def get_image_encoder(self, repo_id, subfolder): image_encoder = CLIPVisionModelWithProjection.from_pretrained( repo_id, subfolder=subfolder, torch_dtype=self.dtype @@ -182,10 +208,11 @@ def test_text_to_image(self): pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") inputs = self.get_dummy_inputs() - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array([0.80810547, 0.88183594, 0.9296875, 0.9189453, 0.9848633, 1.0, 0.97021484, 1.0, 1.0]) + expected_slice = np.array([0.3291, 0.2964, 0.2742, 0.3010, 0.2698, 0.2507, 0.2917, 0.2671, 0.2478]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -193,12 +220,11 @@ def test_text_to_image(self): pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin") inputs = self.get_dummy_inputs() - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array( - [0.30444336, 0.26513672, 0.22436523, 0.2758789, 0.25585938, 0.20751953, 0.25390625, 0.24633789, 0.21923828] - ) + expected_slice = np.array([0.1238, 0.0579, 0.0312, 0.0493, 0.0010, 0.0, 0.0188, 0.0, 0.0]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -214,26 +240,30 @@ def test_image_to_image(self): pipeline.to(torch_device) pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") - inputs = self.get_dummy_inputs(for_image_to_image=True) - images = pipeline(**inputs).images - image_slice = images[0, :3, :3, -1].flatten() + with self.get_fixed_randn_tensor_patch(pipeline): + inputs = self.get_dummy_inputs(for_image_to_image=True) + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array( - [0.22167969, 0.21875, 0.21728516, 0.22607422, 0.21948242, 0.23925781, 0.22387695, 0.25268555, 0.2722168] - ) + expected_slice = np.array([0.1492, 0.1294, 0.1123, 0.1504, 0.1328, 0.0923, 0.1428, 0.1479, 0.1370]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin") - inputs = self.get_dummy_inputs(for_image_to_image=True) - images = pipeline(**inputs).images - image_slice = images[0, :3, :3, -1].flatten() + with self.get_fixed_randn_tensor_patch(pipeline): + inputs = self.get_dummy_inputs(for_image_to_image=True) + images = pipeline(**inputs).images + image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array( - [0.35913086, 0.265625, 0.26367188, 0.24658203, 0.19750977, 0.39990234, 0.15258789, 0.20336914, 0.5517578] - ) + expected_slice = Expectations( + { + ("cuda", None): np.array([0.0493, 0.0059, 0.0, 0.0166, 0.0056, 0.0027, 0.0139, 0.0090, 0.0129]), + ("xpu", None): np.array([0.0513, 0.0083, 0.0, 0.0183, 0.0073, 0.0039, 0.0159, 0.0100, 0.0142]), + (None, None): np.array([0.0493, 0.0059, 0.0, 0.0166, 0.0056, 0.0027, 0.0139, 0.0090, 0.0129]), + } + ).get_expectation() max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -250,12 +280,11 @@ def test_inpainting(self): pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin") inputs = self.get_dummy_inputs(for_inpainting=True) - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array( - [0.27148438, 0.24047852, 0.22167969, 0.23217773, 0.21118164, 0.21142578, 0.21875, 0.20751953, 0.20019531] - ) + expected_slice = np.array([0.2766, 0.2437, 0.2246, 0.2354, 0.2126, 0.2119, 0.2207, 0.2075, 0.1992]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -263,9 +292,12 @@ def test_inpainting(self): pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin") inputs = self.get_dummy_inputs(for_inpainting=True) - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() + expected_slice = np.array([0.3042, 0.2739, 0.2532, 0.2666, 0.2434, 0.2351, 0.2507, 0.2358, 0.2217]) + max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -281,11 +313,13 @@ def test_text_to_image_model_cpu_offload(self): pipeline.to(torch_device) inputs = self.get_dummy_inputs() - output_without_offload = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + output_without_offload = pipeline(**inputs).images pipeline.enable_model_cpu_offload(device=torch_device) inputs = self.get_dummy_inputs() - output_with_offload = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + output_with_offload = pipeline(**inputs).images max_diff = np.abs(output_with_offload - output_without_offload).max() self.assertLess(max_diff, 1e-3, "CPU offloading should not affect the inference results") @@ -312,9 +346,10 @@ def test_text_to_image_full_face(self): pipeline.set_ip_adapter_scale(0.7) inputs = self.get_dummy_inputs() - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array([0.1704, 0.1296, 0.1272, 0.2212, 0.1514, 0.1479, 0.4172, 0.4263, 0.4360]) + expected_slice = np.array([0.4033, 0.3989, 0.3992, 0.4006, 0.3879, 0.4355, 0.4192, 0.4333, 0.4753]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -358,9 +393,10 @@ def test_multi(self): inputs = self.get_dummy_inputs() ip_adapter_image = inputs["ip_adapter_image"] inputs["ip_adapter_image"] = [ip_adapter_image, [ip_adapter_image] * 2] - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array([0.5234, 0.5352, 0.5625, 0.5713, 0.5947, 0.6206, 0.5786, 0.6187, 0.6494]) + expected_slice = np.array([0.2783, 0.2302, 0.1921, 0.2354, 0.1934, 0.1528, 0.2207, 0.1902, 0.1526]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -386,10 +422,10 @@ def test_text_to_image_face_id(self): id_embeds = id_embeds.reshape((2, 1, 1, 512)) inputs["ip_adapter_image_embeds"] = [id_embeds] inputs["ip_adapter_image"] = None - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - - expected_slice = np.array([0.3237, 0.3186, 0.3406, 0.3154, 0.2942, 0.3220, 0.3188, 0.3528, 0.3242]) + expected_slice = np.array([0.4780, 0.5117, 0.5103, 0.5044, 0.4922, 0.4932, 0.5029, 0.4954, 0.4802]) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -411,23 +447,23 @@ def test_text_to_image_sdxl(self): pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") inputs = self.get_dummy_inputs() - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 128, 128)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() expected_slice = np.array( [ - 0.09630299, - 0.09551358, - 0.08480701, - 0.09070173, - 0.09437338, - 0.09264627, - 0.08883232, - 0.09287417, - 0.09197289, + 0.15138859, + 0.15170279, + 0.14246401, + 0.15483627, + 0.15317351, + 0.15564519, + 0.14952978, + 0.15584505, + 0.14940351, ] ) - max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -447,11 +483,23 @@ def test_text_to_image_sdxl(self): ) inputs = self.get_dummy_inputs() - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 128, 128)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array([0.0596, 0.0539, 0.0459, 0.0580, 0.0560, 0.0548, 0.0501, 0.0563, 0.0500]) - + expected_slice = np.array( + [ + 0.09022659, + 0.08629113, + 0.07586601, + 0.09006533, + 0.08684656, + 0.08665657, + 0.08367643, + 0.08839294, + 0.08377907, + ] + ) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -469,23 +517,13 @@ def test_image_to_image_sdxl(self): pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") inputs = self.get_dummy_inputs(for_image_to_image=True) - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 64, 64)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() expected_slice = np.array( - [ - 0.06513795, - 0.07009393, - 0.07234055, - 0.07426041, - 0.07002589, - 0.06415862, - 0.07827643, - 0.07962808, - 0.07411247, - ] + [0.05107406, 0.05074775, 0.00099546, 0.05845362, 0.05587912, 0.0, 0.06056768, 0.05724522, 0.0648115] ) - assert np.allclose(image_slice, expected_slice, atol=1e-3) image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder") @@ -505,21 +543,11 @@ def test_image_to_image_sdxl(self): ) inputs = self.get_dummy_inputs(for_image_to_image=True) - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 64, 64)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array( - [ - 0.07126552, - 0.07025367, - 0.07348302, - 0.07580167, - 0.07467338, - 0.06918576, - 0.07480252, - 0.08279955, - 0.08547315, - ] + [0.05652112, 0.05557555, 0.00392720, 0.06261870, 0.06117940, 0.0, 0.05906063, 0.06035855, 0.06263199] ) assert np.allclose(image_slice, expected_slice, atol=1e-3) @@ -538,12 +566,21 @@ def test_inpainting_sdxl(self): pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin") inputs = self.get_dummy_inputs(for_inpainting=True) - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 128, 128)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - image_slice.tolist() - expected_slice = np.array( - [0.14181179, 0.1493012, 0.14283323, 0.14602411, 0.14915377, 0.15015268, 0.14725655, 0.15009224, 0.15164584] + [ + 0.14227295, + 0.14525282, + 0.14307272, + 0.15040666, + 0.14928216, + 0.14794737, + 0.14742243, + 0.15273672, + 0.15166444, + ] ) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) @@ -566,11 +603,22 @@ def test_inpainting_sdxl(self): ) inputs = self.get_dummy_inputs(for_inpainting=True) - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 128, 128)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - image_slice.tolist() - - expected_slice = np.array([0.1398, 0.1476, 0.1407, 0.1442, 0.1470, 0.1480, 0.1449, 0.1481, 0.1494]) + expected_slice = np.array( + [ + 0.14031684, + 0.14346808, + 0.14132470, + 0.14918229, + 0.14789128, + 0.14650577, + 0.14599693, + 0.15143514, + 0.15061957, + ] + ) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -594,12 +642,22 @@ def test_ip_adapter_mask(self): mask = processor.preprocess(mask) inputs["cross_attention_kwargs"]["ip_adapter_masks"] = mask inputs["ip_adapter_image"] = inputs["ip_adapter_image"][0] - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 128, 128)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() expected_slice = np.array( - [0.7307304, 0.73450166, 0.73731124, 0.7377061, 0.7318013, 0.73720926, 0.74746597, 0.7409929, 0.74074936] + [ + 0.47833657, + 0.50273246, + 0.49865803, + 0.46196738, + 0.51376355, + 0.49931064, + 0.45902768, + 0.55391037, + 0.50260746, + ] ) - max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -621,10 +679,11 @@ def test_ip_adapter_multiple_masks(self): processor = IPAdapterMaskProcessor() masks = processor.preprocess(masks) inputs["cross_attention_kwargs"]["ip_adapter_masks"] = masks - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 128, 128)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() expected_slice = np.array( - [0.79474676, 0.7977683, 0.8013954, 0.7988008, 0.7970615, 0.8029355, 0.80614823, 0.8050743, 0.80627424] + [0.3578991, 0.39458388, 0.43545875, 0.35710996, 0.3885604, 0.43619853, 0.37826842, 0.39264038, 0.45008034] ) max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) @@ -663,54 +722,23 @@ def test_instant_style_multiple_masks(self): masks2 = processor.preprocess(masks2, height=1024, width=1024) masks2 = masks2.reshape(1, masks2.shape[0], masks2.shape[2], masks2.shape[3]) inputs["cross_attention_kwargs"]["ip_adapter_masks"] = [masks1, masks2] - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 128, 128)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slices = Expectations( - { - ("xpu", 3): np.array( - [ - 0.2520, - 0.1050, - 0.1510, - 0.0997, - 0.0893, - 0.0019, - 0.0000, - 0.0000, - 0.0210, - ] - ), - ("cuda", 7): np.array( - [ - 0.2323, - 0.1026, - 0.1338, - 0.0638, - 0.0662, - 0.0000, - 0.0000, - 0.0000, - 0.0199, - ] - ), - ("cuda", 8): np.array( - [ - 0.2518, - 0.1059, - 0.1553, - 0.0977, - 0.0852, - 0.0000, - 0.0000, - 0.0000, - 0.0220, - ] - ), - } + expected_slice = np.array( + [ + 0.0271, + 0.0004, + 0.0000, + 0.0011, + 0.0000, + 0.0000, + 0.0000, + 0.0000, + 0.0037, + ] ) - expected_slice = expected_slices.get_expectation() - max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 @@ -735,11 +763,11 @@ def test_ip_adapter_multiple_masks_one_adapter(self): inputs["cross_attention_kwargs"]["ip_adapter_masks"] = [masks] ip_images = inputs["ip_adapter_image"] inputs["ip_adapter_image"] = [[image[0] for image in ip_images]] - images = pipeline(**inputs).images + with self.get_fixed_randn_tensor_patch(pipeline, shape=(1, 4, 128, 128)): + images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() expected_slice = np.array( - [0.79474676, 0.7977683, 0.8013954, 0.7988008, 0.7970615, 0.8029355, 0.80614823, 0.8050743, 0.80627424] + [0.35761628, 0.39357206, 0.43524706, 0.3571607, 0.38741112, 0.43580052, 0.37814528, 0.3915079, 0.44959208] ) - max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4 From d66805a7593a507508cb67f7b514cb1ae48145b5 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Tue, 26 May 2026 15:21:06 +0800 Subject: [PATCH 2/2] adjust xpu expectation Signed-off-by: Wang, Yi --- .../ip_adapters/test_ip_adapter_stable_diffusion.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py index 1073ef06bf2c..6689526e8b13 100644 --- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py +++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py @@ -224,8 +224,15 @@ def test_text_to_image(self): images = pipeline(**inputs).images image_slice = images[0, :3, :3, -1].flatten() - expected_slice = np.array([0.1238, 0.0579, 0.0312, 0.0493, 0.0010, 0.0, 0.0188, 0.0, 0.0]) - + expected_slice = Expectations( + { + ("cuda", None): np.array([0.1238, 0.0579, 0.0312, 0.0493, 0.0010, 0.0, 0.0188, 0.0, 0.0]), + ("xpu", None): np.array( + [0.11938477, 0.05249023, 0.02490234, 0.04370117, 0.0, 0.0, 0.01342773, 0.0, 0.0] + ), + (None, None): np.array([0.1238, 0.0579, 0.0312, 0.0493, 0.0010, 0.0, 0.0188, 0.0, 0.0]), + } + ).get_expectation() max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice) assert max_diff < 5e-4