diff --git a/examples/diffusers/cache_diffusion/benchmarks.py b/examples/diffusers/cache_diffusion/benchmarks.py
deleted file mode 100644
index 1942d3c5c..000000000
--- a/examples/diffusers/cache_diffusion/benchmarks.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import time
-from pathlib import Path
-
-import torch
-from cache_diffusion import cachify
-from cache_diffusion.utils import SD3_DEFAULT_CONFIG, SDXL_DEFAULT_CONFIG
-from diffusers import DiffusionPipeline, StableDiffusion3Pipeline
-from pipeline.deploy import compile, teardown
-
-MODEL_IDS = {
-    "sdxl": "stabilityai/stable-diffusion-xl-base-1.0",
-    "sd3-medium": "stabilityai/stable-diffusion-3-medium-diffusers",
-}
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-id", type=str, default="sdxl", choices=["sdxl", "sd3-medium"])
-    parser.add_argument("--batch-size", type=int, default=2)
-    parser.add_argument("--num-inference-steps", type=int, default=30)
-    parser.add_argument("--num-iter", type=int, default=8)
-    parser.add_argument("--output-dir", type=str, default=".")
-    args = parser.parse_args()
-    for key, value in vars(args).items():
-        if value is not None:
-            print(f"Parsed args -- {key}: {value}")
-    return args
-
-
-def main(args):
-    if args.model_id == "sdxl":
-        pipe = DiffusionPipeline.from_pretrained(
-            MODEL_IDS[args.model_id],
-            torch_dtype=torch.float16,
-            variant="fp16",
-            use_safetensors=True,
-        )
-    else:
-        pipe = StableDiffusion3Pipeline.from_pretrained(
-            MODEL_IDS[args.model_id], torch_dtype=torch.float16
-        )
-    pipe = pipe.to("cuda")
-
-    prompt = "A dog"
-
-    compile(
-        pipe,
-        args.model_id,
-        onnx_path=Path(args.output_dir, "onnx"),
-        engine_path=Path(args.output_dir, "engine"),
-        batch_size=args.batch_size,
-    )
-
-    cachify.prepare(pipe, SDXL_DEFAULT_CONFIG if args.model_id else SD3_DEFAULT_CONFIG)
-
-    generator = torch.Generator(device="cuda").manual_seed(2946901)
-    total_time = 0
-    cachify.disable(pipe)
-    for _ in range(args.num_iter):
-        with torch.autocast("cuda"):
-            start_time = time.time()
-            _ = pipe(
-                prompt=[prompt] * args.batch_size,
-                num_inference_steps=args.num_inference_steps,
-                generator=generator,
-            )
-            end_time = time.time()
-        total_time += end_time - start_time
-    total_time = total_time / args.num_iter
-    latency = total_time / args.batch_size
-    print(f"TRT Disabled Cache: {latency}")
-
-    generator = torch.Generator(device="cuda").manual_seed(2946901)
-    total_time = 0
-    cachify.enable(pipe)
-    for _ in range(args.num_iter):
-        with torch.autocast("cuda"):
-            start_time = time.time()
-            _ = pipe(
-                prompt=[prompt] * args.batch_size,
-                num_inference_steps=args.num_inference_steps,
-                generator=generator,
-            )
-            end_time = time.time()
-        cachify.reset_status(pipe)
-        total_time += end_time - start_time
-    total_time = total_time / args.num_iter
-    latency = total_time / args.batch_size
-    print(f"TRT Enabled Cache: {latency}")
-    teardown(pipe)
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    main(args)
diff --git a/examples/diffusers/cache_diffusion/pipeline/config.py b/examples/diffusers/cache_diffusion/pipeline/config.py
deleted file mode 100644
index c6ef07e17..000000000
--- a/examples/diffusers/cache_diffusion/pipeline/config.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel
-from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
-
-sd3_common_transformer_block_config = {
-    "dummy_input": {
-        "hidden_states": (2, 4096, 1536),
-        "encoder_hidden_states": (2, 333, 1536),
-        "temb": (2, 1536),
-    },
-    "output_names": ["encoder_hidden_states_out", "hidden_states_out"],
-    "dynamic_axes": {
-        "hidden_states": {0: "batch_size"},
-        "encoder_hidden_states": {0: "batch_size"},
-        "temb": {0: "steps"},
-    },
-}
-
-ONNX_CONFIG = {
-    UNet2DConditionModel: {
-        "down_blocks.0": {
-            "dummy_input": {
-                "hidden_states": (2, 320, 128, 128),
-                "temb": (2, 1280),
-            },
-            "output_names": ["sample", "res_samples_0", "res_samples_1", "res_samples_2"],
-            "dynamic_axes": {
-                "hidden_states": {0: "batch_size"},
-                "temb": {0: "steps"},
-            },
-        },
-        "down_blocks.1": {
-            "dummy_input": {
-                "hidden_states": (2, 320, 64, 64),
-                "temb": (2, 1280),
-                "encoder_hidden_states": (2, 77, 2048),
-            },
-            "output_names": ["sample", "res_samples_0", "res_samples_1", "res_samples_2"],
-            "dynamic_axes": {
-                "hidden_states": {0: "batch_size"},
-                "temb": {0: "steps"},
-                "encoder_hidden_states": {0: "batch_size"},
-            },
-        },
-        "down_blocks.2": {
-            "dummy_input": {
-                "hidden_states": (2, 640, 32, 32),
-                "temb": (2, 1280),
-                "encoder_hidden_states": (2, 77, 2048),
-            },
-            "output_names": ["sample", "res_samples_0", "res_samples_1"],
-            "dynamic_axes": {
-                "hidden_states": {0: "batch_size"},
-                "temb": {0: "steps"},
-                "encoder_hidden_states": {0: "batch_size"},
-            },
-        },
-        "mid_block": {
-            "dummy_input": {
-                "hidden_states": (2, 1280, 32, 32),
-                "temb": (2, 1280),
-                "encoder_hidden_states": (2, 77, 2048),
-            },
-            "output_names": ["sample"],
-            "dynamic_axes": {
-                "hidden_states": {0: "batch_size"},
-                "temb": {0: "steps"},
-                "encoder_hidden_states": {0: "batch_size"},
-            },
-        },
-        "up_blocks.0": {
-            "dummy_input": {
-                "hidden_states": (2, 1280, 32, 32),
-                "res_hidden_states_0": (2, 640, 32, 32),
-                "res_hidden_states_1": (2, 1280, 32, 32),
-                "res_hidden_states_2": (2, 1280, 32, 32),
-                "temb": (2, 1280),
-                "encoder_hidden_states": (2, 77, 2048),
-            },
-            "output_names": ["sample"],
-            "dynamic_axes": {
-                "hidden_states": {0: "batch_size"},
-                "temb": {0: "steps"},
-                "encoder_hidden_states": {0: "batch_size"},
-                "res_hidden_states_0": {0: "batch_size"},
-                "res_hidden_states_1": {0: "batch_size"},
-                "res_hidden_states_2": {0: "batch_size"},
-            },
-        },
-        "up_blocks.1": {
-            "dummy_input": {
-                "hidden_states": (2, 1280, 64, 64),
-                "res_hidden_states_0": (2, 320, 64, 64),
-                "res_hidden_states_1": (2, 640, 64, 64),
-                "res_hidden_states_2": (2, 640, 64, 64),
-                "temb": (2, 1280),
-                "encoder_hidden_states": (2, 77, 2048),
-            },
-            "output_names": ["sample"],
-            "dynamic_axes": {
-                "hidden_states": {0: "batch_size"},
-                "temb": {0: "steps"},
-                "encoder_hidden_states": {0: "batch_size"},
-                "res_hidden_states_0": {0: "batch_size"},
-                "res_hidden_states_1": {0: "batch_size"},
-                "res_hidden_states_2": {0: "batch_size"},
-            },
-        },
-        "up_blocks.2": {
-            "dummy_input": {
-                "hidden_states": (2, 640, 128, 128),
-                "res_hidden_states_0": (2, 320, 128, 128),
-                "res_hidden_states_1": (2, 320, 128, 128),
-                "res_hidden_states_2": (2, 320, 128, 128),
-                "temb": (2, 1280),
-            },
-            "output_names": ["sample"],
-            "dynamic_axes": {
-                "hidden_states": {0: "batch_size"},
-                "temb": {0: "steps"},
-                "res_hidden_states_0": {0: "batch_size"},
-                "res_hidden_states_1": {0: "batch_size"},
-                "res_hidden_states_2": {0: "batch_size"},
-            },
-        },
-    },
-    SD3Transformer2DModel: {
-        **{f"transformer_blocks.{i}": sd3_common_transformer_block_config for i in range(23)},
-        "transformer_blocks.23": {
-            "dummy_input": {
-                "hidden_states": (2, 4096, 1536),
-                "encoder_hidden_states": (2, 333, 1536),
-                "temb": (2, 1536),
-            },
-            "output_names": ["hidden_states_out"],
-            "dynamic_axes": {
-                "hidden_states": {0: "batch_size"},
-                "encoder_hidden_states": {0: "batch_size"},
-                "temb": {0: "steps"},
-            },
-        },
-    },
-}
diff --git a/examples/diffusers/cache_diffusion/pipeline/deploy.py b/examples/diffusers/cache_diffusion/pipeline/deploy.py
deleted file mode 100644
index bf55ff309..000000000
--- a/examples/diffusers/cache_diffusion/pipeline/deploy.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import types
-from pathlib import Path
-
-import tensorrt as trt
-import torch
-from cache_diffusion.cachify import CACHED_PIPE, get_model
-from cuda import cudart
-from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel
-from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
-from pipeline.config import ONNX_CONFIG
-from pipeline.models.sd3 import sd3_forward
-from pipeline.models.sdxl import (
-    cachecrossattnupblock2d_forward,
-    cacheunet_forward,
-    cacheupblock2d_forward,
-)
-from polygraphy.backend.trt import (
-    CreateConfig,
-    Profile,
-    engine_from_network,
-    network_from_onnx_path,
-    save_engine,
-)
-from torch.onnx import export as onnx_export
-
-from .utils import Engine
-
-
-def replace_new_forward(backbone):
-    if backbone.__class__ == UNet2DConditionModel:
-        backbone.forward = types.MethodType(cacheunet_forward, backbone)
-        for upsample_block in backbone.up_blocks:
-            if (
-                hasattr(upsample_block, "has_cross_attention")
-                and upsample_block.has_cross_attention
-            ):
-                upsample_block.forward = types.MethodType(
-                    cachecrossattnupblock2d_forward, upsample_block
-                )
-            else:
-                upsample_block.forward = types.MethodType(cacheupblock2d_forward, upsample_block)
-    elif backbone.__class__ == SD3Transformer2DModel:
-        backbone.forward = types.MethodType(sd3_forward, backbone)
-
-
-def get_input_info(dummy_dict, info: str | None = None, batch_size: int = 1):
-    return_val = [] if info in {"profile_shapes", "input_names"} else {}
-
-    def collect_leaf_keys(d):
-        for key, value in d.items():
-            if isinstance(value, dict):
-                collect_leaf_keys(value)
-            else:
-                value = (value[0] * batch_size, *value[1:])
-                if info == "profile_shapes":
-                    return_val.append((key, value))
-                elif info == "profile_shapes_dict":
-                    return_val[key] = value  # type: ignore[index]
-                elif info == "dummy_input":
-                    return_val[key] = torch.ones(value).half().cuda()  # type: ignore[index]
-                elif info == "input_names":
-                    return_val.append(key)
-
-    collect_leaf_keys(dummy_dict)
-    return return_val
-
-
-def compile2trt(cls, onnx_path: Path, engine_path: Path, batch_size: int = 1):
-    subdirs = [f for f in onnx_path.iterdir() if f.is_dir()]
-    for subdir in subdirs:
-        if subdir.name not in ONNX_CONFIG[cls]:
-            continue
-        model_path = subdir / "model.onnx"
-        plan_path = engine_path / f"{subdir.name}.plan"
-        if not plan_path.exists():
-            print(f"Building {model_path!s}")
-            build_profile = Profile()
-            profile_shapes = get_input_info(
-                ONNX_CONFIG[cls][subdir.name]["dummy_input"], "profile_shapes", batch_size
-            )
-            for input_name, input_shape in profile_shapes:
-                min_input_shape = (2, *input_shape[1:])
-                build_profile.add(input_name, min_input_shape, input_shape, input_shape)
-            block_network = network_from_onnx_path(
-                str(model_path), flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM], strongly_typed=True
-            )
-            build_config = CreateConfig(
-                builder_optimization_level=4,
-                profiles=[build_profile],
-            )
-            engine = engine_from_network(
-                block_network,
-                config=build_config,
-            )
-            save_engine(engine, path=plan_path)
-        else:
-            print(f"{model_path!s} already exists!")
-
-
-def get_total_device_memory(backbone):
-    max_device_memory = 0
-    for engine in backbone.engines.values():
-        max_device_memory = max(max_device_memory, engine.engine.device_memory_size)
-    return max_device_memory
-
-
-def load_engines(backbone, engine_path: Path, batch_size: int = 1):
-    backbone.engines = {}
-    for f in engine_path.iterdir():
-        if f.is_file():
-            eng = Engine()
-            eng.load(str(f))
-            backbone.engines[f"{f.stem}"] = eng
-    _, shared_device_memory = cudart.cudaMalloc(get_total_device_memory(backbone))
-    for engine in backbone.engines.values():
-        engine.activate(shared_device_memory)
-    backbone.cuda_stream = cudart.cudaStreamCreate()[1]
-    for block_name in backbone.engines:
-        backbone.engines[block_name].allocate_buffers(
-            shape_dict=get_input_info(
-                ONNX_CONFIG[backbone.__class__][block_name]["dummy_input"],
-                "profile_shapes_dict",
-                batch_size,
-            ),
-            device=backbone.device,
-            batch_size=batch_size,
-        )
-
-
-def free_memory(model_id, backbone):
-    if model_id == "sd3-medium":
-        for block in backbone.transformer_blocks:
-            block.to_empty(device="cpu")
-    else:
-        backbone.mid_block.to_empty(device="cpu")
-        backbone.down_blocks.to_empty(device="cpu")
-        backbone.up_blocks.to_empty(device="cpu")
-        torch.cuda.empty_cache()
-
-
-def export_onnx(backbone, onnx_path: Path):
-    for name, module in backbone.named_modules():
-        if isinstance(module, CACHED_PIPE[backbone.__class__]):
-            _onnx_dir = onnx_path.joinpath(f"{name}")
-            _onnx_file = _onnx_dir.joinpath("model.onnx")
-            if not _onnx_file.exists():
-                _onnx_dir.mkdir(parents=True, exist_ok=True)
-                dummy_input = get_input_info(
-                    ONNX_CONFIG[backbone.__class__][f"{name}"]["dummy_input"], "dummy_input"
-                )
-                input_names = get_input_info(
-                    ONNX_CONFIG[backbone.__class__][f"{name}"]["dummy_input"], "input_names"
-                )
-                output_names = ONNX_CONFIG[backbone.__class__][f"{name}"]["output_names"]
-                onnx_export(
-                    module,
-                    args=tuple(dummy_input.values())
-                    if isinstance(dummy_input, dict)
-                    else dummy_input,
-                    f=_onnx_file.as_posix(),
-                    input_names=input_names,
-                    output_names=output_names,
-                    dynamic_axes=ONNX_CONFIG[backbone.__class__][f"{name}"]["dynamic_axes"],
-                    do_constant_folding=True,
-                    opset_version=17,
-                    dynamo=False,
-                )
-            else:
-                print(f"{_onnx_file!s} already exists!")
-
-
-def warm_up(backbone, batch_size: int = 1):
-    print("Warming-up TensorRT engines...")
-    for name, engine in backbone.engines.items():
-        dummy_input = get_input_info(
-            ONNX_CONFIG[backbone.__class__][name]["dummy_input"], "dummy_input", batch_size
-        )
-        _ = engine(dummy_input, backbone.cuda_stream)
-
-
-def teardown(pipe):
-    backbone = get_model(pipe)
-    for engine in backbone.engines.values():
-        del engine
-
-    cudart.cudaStreamDestroy(backbone.cuda_stream)
-    del backbone.cuda_stream
-
-
-def compile(pipe, model_id: str, onnx_path: Path, engine_path: Path, batch_size: int = 1):
-    backbone = get_model(pipe)
-    onnx_path.mkdir(parents=True, exist_ok=True)
-    engine_path.mkdir(parents=True, exist_ok=True)
-
-    replace_new_forward(backbone)
-    export_onnx(backbone, onnx_path)
-    compile2trt(backbone.__class__, onnx_path, engine_path, batch_size)
-    load_engines(backbone, engine_path, batch_size)
-    free_memory(model_id, backbone)
-    warm_up(backbone, batch_size)
-    backbone.use_trt_infer = True
diff --git a/examples/diffusers/cache_diffusion/pipeline/models/sd3.py b/examples/diffusers/cache_diffusion/pipeline/models/sd3.py
deleted file mode 100755
index f4adc1b46..000000000
--- a/examples/diffusers/cache_diffusion/pipeline/models/sd3.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Any
-
-import torch
-from diffusers.models.modeling_outputs import Transformer2DModelOutput
-from diffusers.utils import (
-    USE_PEFT_BACKEND,
-    is_torch_version,
-    scale_lora_layers,
-    unscale_lora_layers,
-)
-
-
-def sd3_forward(
-    self,
-    hidden_states: torch.FloatTensor,
-    encoder_hidden_states: torch.FloatTensor = None,
-    pooled_projections: torch.FloatTensor = None,
-    timestep: torch.LongTensor = None,
-    block_controlnet_hidden_states: list | None = None,
-    joint_attention_kwargs: dict | None = None,
-    return_dict: bool = True,
-) -> torch.FloatTensor | Transformer2DModelOutput:
-    """
-    The [`SD3Transformer2DModel`] forward method.
-
-    Args:
-        hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
-            Input `hidden_states`.
-        encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
-            Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-        pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
-            from the embeddings of input conditions.
-        timestep ( `torch.LongTensor`):
-            Used to indicate denoising step.
-        block_controlnet_hidden_states: (`list` of `torch.Tensor`):
-            A list of tensors that if specified are added to the residuals of transformer blocks.
-        joint_attention_kwargs (`dict`, *optional*):
-            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-            `self.processor` in
-            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-        return_dict (`bool`, *optional*, defaults to `True`):
-            Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
-            tuple.
-
-    Returns:
-        If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
-        `tuple` where the first element is the sample tensor.
-    """
-    if joint_attention_kwargs is not None:
-        joint_attention_kwargs = joint_attention_kwargs.copy()
-        lora_scale = joint_attention_kwargs.pop("scale", 1.0)
-    else:
-        lora_scale = 1.0
-
-    if USE_PEFT_BACKEND:
-        # weight the lora layers by setting `lora_scale` for each PEFT layer
-        scale_lora_layers(self, lora_scale)
-
-    height, width = hidden_states.shape[-2:]
-
-    hidden_states = self.pos_embed(hidden_states)  # takes care of adding positional embeddings too.
-    temb = self.time_text_embed(timestep, pooled_projections)
-    encoder_hidden_states = self.context_embedder(encoder_hidden_states)
-
-    for index_block, block in enumerate(self.transformer_blocks):
-        if self.training and self.gradient_checkpointing:
-
-            def create_custom_forward(module, return_dict=None):
-                def custom_forward(*inputs):
-                    if return_dict is not None:
-                        return module(*inputs, return_dict=return_dict)
-                    else:
-                        return module(*inputs)
-
-                return custom_forward
-
-            ckpt_kwargs: dict[str, Any] = (
-                {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-            )
-            encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
-                create_custom_forward(block),
-                hidden_states,
-                encoder_hidden_states,
-                temb,
-                **ckpt_kwargs,
-            )
-
-        elif hasattr(self, "use_trt_infer") and self.use_trt_infer:
-            feed_dict = {
-                "hidden_states": hidden_states,
-                "encoder_hidden_states": encoder_hidden_states,
-                "temb": temb,
-            }
-            _results = self.engines[f"transformer_blocks.{index_block}"](
-                feed_dict, self.cuda_stream
-            )
-            if index_block != 23:
-                encoder_hidden_states = _results["encoder_hidden_states_out"]
-            hidden_states = _results["hidden_states_out"]
-        else:
-            encoder_hidden_states, hidden_states = block(
-                hidden_states=hidden_states,
-                encoder_hidden_states=encoder_hidden_states,
-                temb=temb,
-            )
-
-        # controlnet residual
-        if block_controlnet_hidden_states is not None and block.context_pre_only is False:
-            interval_control = len(self.transformer_blocks) // len(block_controlnet_hidden_states)
-            hidden_states = (
-                hidden_states + block_controlnet_hidden_states[index_block // interval_control]
-            )
-
-    hidden_states = self.norm_out(hidden_states, temb)
-    hidden_states = self.proj_out(hidden_states)
-
-    # unpatchify
-    patch_size = self.config.patch_size
-    height = height // patch_size
-    width = width // patch_size
-
-    hidden_states = hidden_states.reshape(
-        shape=(hidden_states.shape[0], height, width, patch_size, patch_size, self.out_channels)
-    )
-    hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
-    output = hidden_states.reshape(
-        shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size)
-    )
-
-    if USE_PEFT_BACKEND:
-        # remove `lora_scale` from each PEFT layer
-        unscale_lora_layers(self, lora_scale)
-
-    if not return_dict:
-        return (output,)
-
-    return Transformer2DModelOutput(sample=output)
diff --git a/examples/diffusers/cache_diffusion/pipeline/models/sdxl.py b/examples/diffusers/cache_diffusion/pipeline/models/sdxl.py
deleted file mode 100755
index 6b14de862..000000000
--- a/examples/diffusers/cache_diffusion/pipeline/models/sdxl.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# Adapted from
-# https://github.com/huggingface/diffusers/blob/73acebb8cfbd1d2954cabe1af4185f9994e61917/src/diffusers/models/unets/unet_2d_condition.py#L1039-L1312
-# https://github.com/huggingface/diffusers/blob/73acebb8cfbd1d2954cabe1af4185f9994e61917/src/diffusers/models/unets/unet_2d_blocks.py#L2482-L2564
-# https://github.com/huggingface/diffusers/blob/73acebb8cfbd1d2954cabe1af4185f9994e61917/src/diffusers/models/unets/unet_2d_blocks.py#L2617-L2679
-
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Any
-
-import torch
-from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput
-
-
-def cachecrossattnupblock2d_forward(
-    self,
-    hidden_states: torch.FloatTensor,
-    res_hidden_states_0: torch.FloatTensor,
-    res_hidden_states_1: torch.FloatTensor,
-    res_hidden_states_2: torch.FloatTensor,
-    temb: torch.FloatTensor | None = None,
-    encoder_hidden_states: torch.FloatTensor | None = None,
-    cross_attention_kwargs: dict[str, Any] | None = None,
-    upsample_size: int | None = None,
-    attention_mask: torch.FloatTensor | None = None,
-    encoder_attention_mask: torch.FloatTensor | None = None,
-) -> torch.FloatTensor:
-    res_hidden_states_tuple = (res_hidden_states_0, res_hidden_states_1, res_hidden_states_2)
-    for resnet, attn in zip(self.resnets, self.attentions):
-        # pop res hidden states
-        res_hidden_states = res_hidden_states_tuple[-1]
-        res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
-        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
-
-        hidden_states = resnet(hidden_states, temb)
-        hidden_states = attn(
-            hidden_states,
-            encoder_hidden_states=encoder_hidden_states,
-            cross_attention_kwargs=cross_attention_kwargs,
-            attention_mask=attention_mask,
-            encoder_attention_mask=encoder_attention_mask,
-            return_dict=False,
-        )[0]
-
-    if self.upsamplers is not None:
-        for upsampler in self.upsamplers:
-            hidden_states = upsampler(hidden_states, upsample_size)
-
-    return hidden_states
-
-
-def cacheupblock2d_forward(
-    self,
-    hidden_states: torch.FloatTensor,
-    res_hidden_states_0: torch.FloatTensor,
-    res_hidden_states_1: torch.FloatTensor,
-    res_hidden_states_2: torch.FloatTensor,
-    temb: torch.FloatTensor | None = None,
-    upsample_size: int | None = None,
-) -> torch.FloatTensor:
-    res_hidden_states_tuple = (res_hidden_states_0, res_hidden_states_1, res_hidden_states_2)
-    for resnet in self.resnets:
-        # pop res hidden states
-        res_hidden_states = res_hidden_states_tuple[-1]
-        res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
-        hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
-
-        hidden_states = resnet(hidden_states, temb)
-
-    if self.upsamplers is not None:
-        for upsampler in self.upsamplers:
-            hidden_states = upsampler(hidden_states, upsample_size)
-
-    return hidden_states
-
-
-def cacheunet_forward(
-    self,
-    sample: torch.FloatTensor,
-    timestep: torch.Tensor | float | int,
-    encoder_hidden_states: torch.Tensor,
-    class_labels: torch.Tensor | None = None,
-    timestep_cond: torch.Tensor | None = None,
-    attention_mask: torch.Tensor | None = None,
-    cross_attention_kwargs: dict[str, Any] | None = None,
-    added_cond_kwargs: dict[str, torch.Tensor] | None = None,
-    down_block_additional_residuals: tuple[torch.Tensor] | None = None,
-    mid_block_additional_residual: torch.Tensor | None = None,
-    down_intrablock_additional_residuals: tuple[torch.Tensor] | None = None,
-    encoder_attention_mask: torch.Tensor | None = None,
-    return_dict: bool = True,
-) -> UNet2DConditionOutput | tuple:
-    # 1. time
-    t_emb = self.get_time_embed(sample=sample, timestep=timestep)
-    emb = self.time_embedding(t_emb, timestep_cond)
-    aug_emb = None
-
-    aug_emb = self.get_aug_embed(
-        emb=emb,
-        encoder_hidden_states=encoder_hidden_states,
-        added_cond_kwargs=added_cond_kwargs,
-    )
-
-    emb = emb + aug_emb if aug_emb is not None else emb
-
-    encoder_hidden_states = self.process_encoder_hidden_states(
-        encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
-    )
-
-    # 2. pre-process
-    sample = self.conv_in(sample)
-
-    if hasattr(self, "_export_precess_onnx") and self._export_precess_onnx:
-        return (
-            sample,
-            encoder_hidden_states,
-            emb,
-        )
-
-    down_block_res_samples = (sample,)
-    for i, downsample_block in enumerate(self.down_blocks):
-        if (
-            hasattr(downsample_block, "has_cross_attention")
-            and downsample_block.has_cross_attention
-        ):
-            if hasattr(self, "use_trt_infer") and self.use_trt_infer:
-                feed_dict = {
-                    "hidden_states": sample,
-                    "temb": emb,
-                    "encoder_hidden_states": encoder_hidden_states,
-                }
-                down_results = self.engines[f"down_blocks.{i}"](feed_dict, self.cuda_stream)
-                sample = down_results["sample"]
-                res_samples_0 = down_results["res_samples_0"]
-                res_samples_1 = down_results["res_samples_1"]
-                if "res_samples_2" in down_results:
-                    res_samples_2 = down_results["res_samples_2"]
-            else:
-                # For t2i-adapter CrossAttnDownBlock2D
-                additional_residuals = {}
-
-                sample, res_samples = downsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                    attention_mask=attention_mask,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    encoder_attention_mask=encoder_attention_mask,
-                    **additional_residuals,
-                )
-        elif hasattr(self, "use_trt_infer") and self.use_trt_infer:
-            feed_dict = {"hidden_states": sample, "temb": emb}
-            down_results = self.engines[f"down_blocks.{i}"](feed_dict, self.cuda_stream)
-            sample = down_results["sample"]
-            res_samples_0 = down_results["res_samples_0"]
-            res_samples_1 = down_results["res_samples_1"]
-            if "res_samples_2" in down_results:
-                res_samples_2 = down_results["res_samples_2"]
-        else:
-            sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
-
-        if hasattr(self, "use_trt_infer") and self.use_trt_infer:
-            down_block_res_samples += (
-                res_samples_0,
-                res_samples_1,
-            )
-            if "res_samples_2" in down_results:
-                down_block_res_samples += (res_samples_2,)
-        else:
-            down_block_res_samples += res_samples
-
-    if hasattr(self, "use_trt_infer") and self.use_trt_infer:
-        feed_dict = {
-            "hidden_states": sample,
-            "temb": emb,
-            "encoder_hidden_states": encoder_hidden_states,
-        }
-        mid_results = self.engines["mid_block"](feed_dict, self.cuda_stream)
-        sample = mid_results["sample"]
-    else:
-        sample = self.mid_block(
-            sample,
-            emb,
-            encoder_hidden_states=encoder_hidden_states,
-            attention_mask=attention_mask,
-            cross_attention_kwargs=cross_attention_kwargs,
-            encoder_attention_mask=encoder_attention_mask,
-        )
-
-    # 5. up
-    for i, upsample_block in enumerate(self.up_blocks):
-        res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
-        down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
-
-        if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
-            if hasattr(self, "use_trt_infer") and self.use_trt_infer:
-                feed_dict = {
-                    "hidden_states": sample,
-                    "res_hidden_states_0": res_samples[0],
-                    "res_hidden_states_1": res_samples[1],
-                    "res_hidden_states_2": res_samples[2],
-                    "temb": emb,
-                    "encoder_hidden_states": encoder_hidden_states,
-                }
-                up_results = self.engines[f"up_blocks.{i}"](feed_dict, self.cuda_stream)
-                sample = up_results["sample"]
-            else:
-                sample = upsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    res_hidden_states_0=res_samples[0],
-                    res_hidden_states_1=res_samples[1],
-                    res_hidden_states_2=res_samples[2],
-                    encoder_hidden_states=encoder_hidden_states,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,
-                )
-        elif hasattr(self, "use_trt_infer") and self.use_trt_infer:
-            feed_dict = {
-                "hidden_states": sample,
-                "res_hidden_states_0": res_samples[0],
-                "res_hidden_states_1": res_samples[1],
-                "res_hidden_states_2": res_samples[2],
-                "temb": emb,
-            }
-            up_results = self.engines[f"up_blocks.{i}"](feed_dict, self.cuda_stream)
-            sample = up_results["sample"]
-        else:
-            sample = upsample_block(
-                hidden_states=sample,
-                temb=emb,
-                res_hidden_states_0=res_samples[0],
-                res_hidden_states_1=res_samples[1],
-                res_hidden_states_2=res_samples[2],
-            )
-
-    # 6. post-process
-    if self.conv_norm_out:
-        sample = self.conv_norm_out(sample)
-        sample = self.conv_act(sample)
-    sample = self.conv_out(sample)
-
-    if not return_dict:
-        return (sample,)
-
-    return UNet2DConditionOutput(sample=sample)
diff --git a/examples/diffusers/cache_diffusion/pipeline/utils.py b/examples/diffusers/cache_diffusion/pipeline/utils.py
deleted file mode 100644
index 7157e3d2c..000000000
--- a/examples/diffusers/cache_diffusion/pipeline/utils.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from collections import OrderedDict
-
-import numpy as np
-import tensorrt as trt
-import torch
-from cuda import cudart
-from polygraphy.backend.common import bytes_from_path
-from polygraphy.backend.trt import engine_from_bytes
-
-numpy_to_torch_dtype_dict = {
-    np.uint8: torch.uint8,
-    np.int8: torch.int8,
-    np.int16: torch.int16,
-    np.int32: torch.int32,
-    np.int64: torch.int64,
-    np.float16: torch.float16,
-    np.float32: torch.float32,
-    np.float64: torch.float64,
-    np.complex64: torch.complex64,
-    np.complex128: torch.complex128,
-}
-
-
-class Engine:
-    def __init__(
-        self,
-    ):
-        self.engine = None
-        self.context = None
-        self.buffers = OrderedDict()
-        self.tensors = OrderedDict()
-        self.cuda_graph_instance = None  # cuda graph
-        self.has_cross_attention = False
-
-    def __del__(self):
-        del self.engine
-        del self.context
-        del self.buffers
-        del self.tensors
-
-    def load(self, engine_path):
-        self.engine = engine_from_bytes(bytes_from_path(engine_path))
-
-    def activate(self, reuse_device_memory=None):
-        if reuse_device_memory:
-            self.context = self.engine.create_execution_context_without_device_memory()  # type: ignore[union-attr]
-            self.context.device_memory = reuse_device_memory
-        else:
-            self.context = self.engine.create_execution_context()  # type: ignore[union-attr]
-
-    def allocate_buffers(self, shape_dict=None, device="cuda", batch_size=1):
-        for binding in range(self.engine.num_io_tensors):  # type: ignore[union-attr]
-            name = self.engine.get_tensor_name(binding)  # type: ignore[union-attr]
-            if shape_dict and name in shape_dict:
-                shape = shape_dict[name]
-            else:
-                shape = self.engine.get_tensor_shape(name)  # type: ignore[union-attr]
-                shape = (batch_size * 2, *shape[1:])
-            dtype = trt.nptype(self.engine.get_tensor_dtype(name))  # type: ignore[union-attr]
-            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:  # type: ignore[union-attr]
-                self.context.set_input_shape(name, shape)  # type: ignore[union-attr]
-            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(
-                device=device
-            )
-            self.tensors[name] = tensor
-
-    def __call__(self, feed_dict, stream, use_cuda_graph=False):
-        for name, buf in feed_dict.items():
-            self.tensors[name].copy_(buf)
-
-        for name, tensor in self.tensors.items():
-            self.context.set_tensor_address(name, tensor.data_ptr())  # type: ignore[union-attr]
-
-        if use_cuda_graph:
-            if self.cuda_graph_instance is not None:
-                cuassert(cudart.cudaGraphLaunch(self.cuda_graph_instance, stream))
-                cuassert(cudart.cudaStreamSynchronize(stream))
-            else:
-                # do inference before CUDA graph capture
-                noerror = self.context.execute_async_v3(stream)  # type: ignore[union-attr]
-                if not noerror:
-                    raise ValueError("ERROR: inference failed.")
-                # capture cuda graph
-                cuassert(
-                    cudart.cudaStreamBeginCapture(
-                        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal
-                    )
-                )
-                self.context.execute_async_v3(stream)  # type: ignore[union-attr]
-                self.graph = cuassert(cudart.cudaStreamEndCapture(stream))
-                self.cuda_graph_instance = cuassert(cudart.cudaGraphInstantiate(self.graph, 0))
-        else:
-            noerror = self.context.execute_async_v3(stream)  # type: ignore[union-attr]
-            if not noerror:
-                raise ValueError("ERROR: inference failed.")
-
-        return self.tensors
-
-
-def cuassert(cuda_ret):
-    err = cuda_ret[0]
-    if err != cudart.cudaError_t.cudaSuccess:
-        raise RuntimeError(
-            f"CUDA ERROR: {err}, error code reference: "
-            "https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html"
-        )
-    if len(cuda_ret) > 1:
-        return cuda_ret[1]
-    return None
diff --git a/examples/diffusers/requirements.txt b/examples/diffusers/requirements.txt
index 3cdac70f8..f5582868b 100644
--- a/examples/diffusers/requirements.txt
+++ b/examples/diffusers/requirements.txt
@@ -1,4 +1,3 @@
-cuda-python<13
 nvtx
 opencv-python>=4.8.1.78,<4.12.0.88
 sentencepiece
diff --git a/tests/examples/diffusers/test_cache_diffusion.py b/tests/examples/diffusers/test_cache_diffusion.py
index fff3fa4cb..18f495257 100644
--- a/tests/examples/diffusers/test_cache_diffusion.py
+++ b/tests/examples/diffusers/test_cache_diffusion.py
@@ -59,19 +59,3 @@ def test_pixart_cachify():
     # Clear cuda memory as pytest doesnt clear it between tests
     del pipe
     torch.cuda.empty_cache()
-
-
-def test_sdxl_benchmarks(tmp_path):
-    # fmt: off
-    subprocess.run(
-        [
-            "python", "benchmarks.py",
-            "--model-id", "sdxl",
-            "--batch-size", "1",
-            "--num-iter", "2",
-            "--output-dir", tmp_path,
-        ],
-        cwd=MODELOPT_ROOT / "examples/diffusers/cache_diffusion",
-        check=True,
-    )
-    # fmt: on