diff --git a/examples/diffusers/cache_diffusion/benchmarks.py b/examples/diffusers/cache_diffusion/benchmarks.py deleted file mode 100644 index 1942d3c5c..000000000 --- a/examples/diffusers/cache_diffusion/benchmarks.py +++ /dev/null @@ -1,111 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import argparse -import time -from pathlib import Path - -import torch -from cache_diffusion import cachify -from cache_diffusion.utils import SD3_DEFAULT_CONFIG, SDXL_DEFAULT_CONFIG -from diffusers import DiffusionPipeline, StableDiffusion3Pipeline -from pipeline.deploy import compile, teardown - -MODEL_IDS = { - "sdxl": "stabilityai/stable-diffusion-xl-base-1.0", - "sd3-medium": "stabilityai/stable-diffusion-3-medium-diffusers", -} - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model-id", type=str, default="sdxl", choices=["sdxl", "sd3-medium"]) - parser.add_argument("--batch-size", type=int, default=2) - parser.add_argument("--num-inference-steps", type=int, default=30) - parser.add_argument("--num-iter", type=int, default=8) - parser.add_argument("--output-dir", type=str, default=".") - args = parser.parse_args() - for key, value in vars(args).items(): - if value is not None: - print(f"Parsed args -- {key}: {value}") - return args - - -def main(args): - if args.model_id == "sdxl": - pipe = DiffusionPipeline.from_pretrained( - MODEL_IDS[args.model_id], - torch_dtype=torch.float16, - variant="fp16", - use_safetensors=True, - ) - else: - pipe = StableDiffusion3Pipeline.from_pretrained( - MODEL_IDS[args.model_id], torch_dtype=torch.float16 - ) - pipe = pipe.to("cuda") - - prompt = "A dog" - - compile( - pipe, - args.model_id, - onnx_path=Path(args.output_dir, "onnx"), - engine_path=Path(args.output_dir, "engine"), - batch_size=args.batch_size, - ) - - cachify.prepare(pipe, SDXL_DEFAULT_CONFIG if args.model_id else SD3_DEFAULT_CONFIG) - - generator = torch.Generator(device="cuda").manual_seed(2946901) - total_time = 0 - cachify.disable(pipe) - for _ in range(args.num_iter): - with torch.autocast("cuda"): - start_time = time.time() - _ = pipe( - prompt=[prompt] * args.batch_size, - num_inference_steps=args.num_inference_steps, - generator=generator, - ) - end_time = time.time() - total_time += end_time - start_time - total_time = total_time / args.num_iter - latency = total_time / args.batch_size - print(f"TRT Disabled Cache: {latency}") - - generator = torch.Generator(device="cuda").manual_seed(2946901) - total_time = 0 - cachify.enable(pipe) - for _ in range(args.num_iter): - with torch.autocast("cuda"): - start_time = time.time() - _ = pipe( - prompt=[prompt] * args.batch_size, - num_inference_steps=args.num_inference_steps, - generator=generator, - ) - end_time = time.time() - cachify.reset_status(pipe) - total_time += end_time - start_time - total_time = total_time / args.num_iter - latency = total_time / args.batch_size - print(f"TRT Enabled Cache: {latency}") - teardown(pipe) - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/diffusers/cache_diffusion/pipeline/config.py b/examples/diffusers/cache_diffusion/pipeline/config.py deleted file mode 100644 index c6ef07e17..000000000 --- a/examples/diffusers/cache_diffusion/pipeline/config.py +++ /dev/null @@ -1,157 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel -from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel - -sd3_common_transformer_block_config = { - "dummy_input": { - "hidden_states": (2, 4096, 1536), - "encoder_hidden_states": (2, 333, 1536), - "temb": (2, 1536), - }, - "output_names": ["encoder_hidden_states_out", "hidden_states_out"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "encoder_hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - }, -} - -ONNX_CONFIG = { - UNet2DConditionModel: { - "down_blocks.0": { - "dummy_input": { - "hidden_states": (2, 320, 128, 128), - "temb": (2, 1280), - }, - "output_names": ["sample", "res_samples_0", "res_samples_1", "res_samples_2"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - }, - }, - "down_blocks.1": { - "dummy_input": { - "hidden_states": (2, 320, 64, 64), - "temb": (2, 1280), - "encoder_hidden_states": (2, 77, 2048), - }, - "output_names": ["sample", "res_samples_0", "res_samples_1", "res_samples_2"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - "encoder_hidden_states": {0: "batch_size"}, - }, - }, - "down_blocks.2": { - "dummy_input": { - "hidden_states": (2, 640, 32, 32), - "temb": (2, 1280), - "encoder_hidden_states": (2, 77, 2048), - }, - "output_names": ["sample", "res_samples_0", "res_samples_1"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - "encoder_hidden_states": {0: "batch_size"}, - }, - }, - "mid_block": { - "dummy_input": { - "hidden_states": (2, 1280, 32, 32), - "temb": (2, 1280), - "encoder_hidden_states": (2, 77, 2048), - }, - "output_names": ["sample"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - "encoder_hidden_states": {0: "batch_size"}, - }, - }, - "up_blocks.0": { - "dummy_input": { - "hidden_states": (2, 1280, 32, 32), - "res_hidden_states_0": (2, 640, 32, 32), - "res_hidden_states_1": (2, 1280, 32, 32), - "res_hidden_states_2": (2, 1280, 32, 32), - "temb": (2, 1280), - "encoder_hidden_states": (2, 77, 2048), - }, - "output_names": ["sample"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - "encoder_hidden_states": {0: "batch_size"}, - "res_hidden_states_0": {0: "batch_size"}, - "res_hidden_states_1": {0: "batch_size"}, - "res_hidden_states_2": {0: "batch_size"}, - }, - }, - "up_blocks.1": { - "dummy_input": { - "hidden_states": (2, 1280, 64, 64), - "res_hidden_states_0": (2, 320, 64, 64), - "res_hidden_states_1": (2, 640, 64, 64), - "res_hidden_states_2": (2, 640, 64, 64), - "temb": (2, 1280), - "encoder_hidden_states": (2, 77, 2048), - }, - "output_names": ["sample"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - "encoder_hidden_states": {0: "batch_size"}, - "res_hidden_states_0": {0: "batch_size"}, - "res_hidden_states_1": {0: "batch_size"}, - "res_hidden_states_2": {0: "batch_size"}, - }, - }, - "up_blocks.2": { - "dummy_input": { - "hidden_states": (2, 640, 128, 128), - "res_hidden_states_0": (2, 320, 128, 128), - "res_hidden_states_1": (2, 320, 128, 128), - "res_hidden_states_2": (2, 320, 128, 128), - "temb": (2, 1280), - }, - "output_names": ["sample"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - "res_hidden_states_0": {0: "batch_size"}, - "res_hidden_states_1": {0: "batch_size"}, - "res_hidden_states_2": {0: "batch_size"}, - }, - }, - }, - SD3Transformer2DModel: { - **{f"transformer_blocks.{i}": sd3_common_transformer_block_config for i in range(23)}, - "transformer_blocks.23": { - "dummy_input": { - "hidden_states": (2, 4096, 1536), - "encoder_hidden_states": (2, 333, 1536), - "temb": (2, 1536), - }, - "output_names": ["hidden_states_out"], - "dynamic_axes": { - "hidden_states": {0: "batch_size"}, - "encoder_hidden_states": {0: "batch_size"}, - "temb": {0: "steps"}, - }, - }, - }, -} diff --git a/examples/diffusers/cache_diffusion/pipeline/deploy.py b/examples/diffusers/cache_diffusion/pipeline/deploy.py deleted file mode 100644 index bf55ff309..000000000 --- a/examples/diffusers/cache_diffusion/pipeline/deploy.py +++ /dev/null @@ -1,216 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import types -from pathlib import Path - -import tensorrt as trt -import torch -from cache_diffusion.cachify import CACHED_PIPE, get_model -from cuda import cudart -from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel -from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel -from pipeline.config import ONNX_CONFIG -from pipeline.models.sd3 import sd3_forward -from pipeline.models.sdxl import ( - cachecrossattnupblock2d_forward, - cacheunet_forward, - cacheupblock2d_forward, -) -from polygraphy.backend.trt import ( - CreateConfig, - Profile, - engine_from_network, - network_from_onnx_path, - save_engine, -) -from torch.onnx import export as onnx_export - -from .utils import Engine - - -def replace_new_forward(backbone): - if backbone.__class__ == UNet2DConditionModel: - backbone.forward = types.MethodType(cacheunet_forward, backbone) - for upsample_block in backbone.up_blocks: - if ( - hasattr(upsample_block, "has_cross_attention") - and upsample_block.has_cross_attention - ): - upsample_block.forward = types.MethodType( - cachecrossattnupblock2d_forward, upsample_block - ) - else: - upsample_block.forward = types.MethodType(cacheupblock2d_forward, upsample_block) - elif backbone.__class__ == SD3Transformer2DModel: - backbone.forward = types.MethodType(sd3_forward, backbone) - - -def get_input_info(dummy_dict, info: str | None = None, batch_size: int = 1): - return_val = [] if info in {"profile_shapes", "input_names"} else {} - - def collect_leaf_keys(d): - for key, value in d.items(): - if isinstance(value, dict): - collect_leaf_keys(value) - else: - value = (value[0] * batch_size, *value[1:]) - if info == "profile_shapes": - return_val.append((key, value)) - elif info == "profile_shapes_dict": - return_val[key] = value # type: ignore[index] - elif info == "dummy_input": - return_val[key] = torch.ones(value).half().cuda() # type: ignore[index] - elif info == "input_names": - return_val.append(key) - - collect_leaf_keys(dummy_dict) - return return_val - - -def compile2trt(cls, onnx_path: Path, engine_path: Path, batch_size: int = 1): - subdirs = [f for f in onnx_path.iterdir() if f.is_dir()] - for subdir in subdirs: - if subdir.name not in ONNX_CONFIG[cls]: - continue - model_path = subdir / "model.onnx" - plan_path = engine_path / f"{subdir.name}.plan" - if not plan_path.exists(): - print(f"Building {model_path!s}") - build_profile = Profile() - profile_shapes = get_input_info( - ONNX_CONFIG[cls][subdir.name]["dummy_input"], "profile_shapes", batch_size - ) - for input_name, input_shape in profile_shapes: - min_input_shape = (2, *input_shape[1:]) - build_profile.add(input_name, min_input_shape, input_shape, input_shape) - block_network = network_from_onnx_path( - str(model_path), flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM], strongly_typed=True - ) - build_config = CreateConfig( - builder_optimization_level=4, - profiles=[build_profile], - ) - engine = engine_from_network( - block_network, - config=build_config, - ) - save_engine(engine, path=plan_path) - else: - print(f"{model_path!s} already exists!") - - -def get_total_device_memory(backbone): - max_device_memory = 0 - for engine in backbone.engines.values(): - max_device_memory = max(max_device_memory, engine.engine.device_memory_size) - return max_device_memory - - -def load_engines(backbone, engine_path: Path, batch_size: int = 1): - backbone.engines = {} - for f in engine_path.iterdir(): - if f.is_file(): - eng = Engine() - eng.load(str(f)) - backbone.engines[f"{f.stem}"] = eng - _, shared_device_memory = cudart.cudaMalloc(get_total_device_memory(backbone)) - for engine in backbone.engines.values(): - engine.activate(shared_device_memory) - backbone.cuda_stream = cudart.cudaStreamCreate()[1] - for block_name in backbone.engines: - backbone.engines[block_name].allocate_buffers( - shape_dict=get_input_info( - ONNX_CONFIG[backbone.__class__][block_name]["dummy_input"], - "profile_shapes_dict", - batch_size, - ), - device=backbone.device, - batch_size=batch_size, - ) - - -def free_memory(model_id, backbone): - if model_id == "sd3-medium": - for block in backbone.transformer_blocks: - block.to_empty(device="cpu") - else: - backbone.mid_block.to_empty(device="cpu") - backbone.down_blocks.to_empty(device="cpu") - backbone.up_blocks.to_empty(device="cpu") - torch.cuda.empty_cache() - - -def export_onnx(backbone, onnx_path: Path): - for name, module in backbone.named_modules(): - if isinstance(module, CACHED_PIPE[backbone.__class__]): - _onnx_dir = onnx_path.joinpath(f"{name}") - _onnx_file = _onnx_dir.joinpath("model.onnx") - if not _onnx_file.exists(): - _onnx_dir.mkdir(parents=True, exist_ok=True) - dummy_input = get_input_info( - ONNX_CONFIG[backbone.__class__][f"{name}"]["dummy_input"], "dummy_input" - ) - input_names = get_input_info( - ONNX_CONFIG[backbone.__class__][f"{name}"]["dummy_input"], "input_names" - ) - output_names = ONNX_CONFIG[backbone.__class__][f"{name}"]["output_names"] - onnx_export( - module, - args=tuple(dummy_input.values()) - if isinstance(dummy_input, dict) - else dummy_input, - f=_onnx_file.as_posix(), - input_names=input_names, - output_names=output_names, - dynamic_axes=ONNX_CONFIG[backbone.__class__][f"{name}"]["dynamic_axes"], - do_constant_folding=True, - opset_version=17, - dynamo=False, - ) - else: - print(f"{_onnx_file!s} already exists!") - - -def warm_up(backbone, batch_size: int = 1): - print("Warming-up TensorRT engines...") - for name, engine in backbone.engines.items(): - dummy_input = get_input_info( - ONNX_CONFIG[backbone.__class__][name]["dummy_input"], "dummy_input", batch_size - ) - _ = engine(dummy_input, backbone.cuda_stream) - - -def teardown(pipe): - backbone = get_model(pipe) - for engine in backbone.engines.values(): - del engine - - cudart.cudaStreamDestroy(backbone.cuda_stream) - del backbone.cuda_stream - - -def compile(pipe, model_id: str, onnx_path: Path, engine_path: Path, batch_size: int = 1): - backbone = get_model(pipe) - onnx_path.mkdir(parents=True, exist_ok=True) - engine_path.mkdir(parents=True, exist_ok=True) - - replace_new_forward(backbone) - export_onnx(backbone, onnx_path) - compile2trt(backbone.__class__, onnx_path, engine_path, batch_size) - load_engines(backbone, engine_path, batch_size) - free_memory(model_id, backbone) - warm_up(backbone, batch_size) - backbone.use_trt_infer = True diff --git a/examples/diffusers/cache_diffusion/pipeline/models/sd3.py b/examples/diffusers/cache_diffusion/pipeline/models/sd3.py deleted file mode 100755 index f4adc1b46..000000000 --- a/examples/diffusers/cache_diffusion/pipeline/models/sd3.py +++ /dev/null @@ -1,152 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any - -import torch -from diffusers.models.modeling_outputs import Transformer2DModelOutput -from diffusers.utils import ( - USE_PEFT_BACKEND, - is_torch_version, - scale_lora_layers, - unscale_lora_layers, -) - - -def sd3_forward( - self, - hidden_states: torch.FloatTensor, - encoder_hidden_states: torch.FloatTensor = None, - pooled_projections: torch.FloatTensor = None, - timestep: torch.LongTensor = None, - block_controlnet_hidden_states: list | None = None, - joint_attention_kwargs: dict | None = None, - return_dict: bool = True, -) -> torch.FloatTensor | Transformer2DModelOutput: - """ - The [`SD3Transformer2DModel`] forward method. - - Args: - hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): - Input `hidden_states`. - encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`): - Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. - pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected - from the embeddings of input conditions. - timestep ( `torch.LongTensor`): - Used to indicate denoising step. - block_controlnet_hidden_states: (`list` of `torch.Tensor`): - A list of tensors that if specified are added to the residuals of transformer blocks. - joint_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain - tuple. - - Returns: - If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a - `tuple` where the first element is the sample tensor. - """ - if joint_attention_kwargs is not None: - joint_attention_kwargs = joint_attention_kwargs.copy() - lora_scale = joint_attention_kwargs.pop("scale", 1.0) - else: - lora_scale = 1.0 - - if USE_PEFT_BACKEND: - # weight the lora layers by setting `lora_scale` for each PEFT layer - scale_lora_layers(self, lora_scale) - - height, width = hidden_states.shape[-2:] - - hidden_states = self.pos_embed(hidden_states) # takes care of adding positional embeddings too. - temb = self.time_text_embed(timestep, pooled_projections) - encoder_hidden_states = self.context_embedder(encoder_hidden_states) - - for index_block, block in enumerate(self.transformer_blocks): - if self.training and self.gradient_checkpointing: - - def create_custom_forward(module, return_dict=None): - def custom_forward(*inputs): - if return_dict is not None: - return module(*inputs, return_dict=return_dict) - else: - return module(*inputs) - - return custom_forward - - ckpt_kwargs: dict[str, Any] = ( - {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} - ) - encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - encoder_hidden_states, - temb, - **ckpt_kwargs, - ) - - elif hasattr(self, "use_trt_infer") and self.use_trt_infer: - feed_dict = { - "hidden_states": hidden_states, - "encoder_hidden_states": encoder_hidden_states, - "temb": temb, - } - _results = self.engines[f"transformer_blocks.{index_block}"]( - feed_dict, self.cuda_stream - ) - if index_block != 23: - encoder_hidden_states = _results["encoder_hidden_states_out"] - hidden_states = _results["hidden_states_out"] - else: - encoder_hidden_states, hidden_states = block( - hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - temb=temb, - ) - - # controlnet residual - if block_controlnet_hidden_states is not None and block.context_pre_only is False: - interval_control = len(self.transformer_blocks) // len(block_controlnet_hidden_states) - hidden_states = ( - hidden_states + block_controlnet_hidden_states[index_block // interval_control] - ) - - hidden_states = self.norm_out(hidden_states, temb) - hidden_states = self.proj_out(hidden_states) - - # unpatchify - patch_size = self.config.patch_size - height = height // patch_size - width = width // patch_size - - hidden_states = hidden_states.reshape( - shape=(hidden_states.shape[0], height, width, patch_size, patch_size, self.out_channels) - ) - hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states) - output = hidden_states.reshape( - shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size) - ) - - if USE_PEFT_BACKEND: - # remove `lora_scale` from each PEFT layer - unscale_lora_layers(self, lora_scale) - - if not return_dict: - return (output,) - - return Transformer2DModelOutput(sample=output) diff --git a/examples/diffusers/cache_diffusion/pipeline/models/sdxl.py b/examples/diffusers/cache_diffusion/pipeline/models/sdxl.py deleted file mode 100755 index 6b14de862..000000000 --- a/examples/diffusers/cache_diffusion/pipeline/models/sdxl.py +++ /dev/null @@ -1,275 +0,0 @@ -# Adapted from -# https://github.com/huggingface/diffusers/blob/73acebb8cfbd1d2954cabe1af4185f9994e61917/src/diffusers/models/unets/unet_2d_condition.py#L1039-L1312 -# https://github.com/huggingface/diffusers/blob/73acebb8cfbd1d2954cabe1af4185f9994e61917/src/diffusers/models/unets/unet_2d_blocks.py#L2482-L2564 -# https://github.com/huggingface/diffusers/blob/73acebb8cfbd1d2954cabe1af4185f9994e61917/src/diffusers/models/unets/unet_2d_blocks.py#L2617-L2679 - -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any - -import torch -from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput - - -def cachecrossattnupblock2d_forward( - self, - hidden_states: torch.FloatTensor, - res_hidden_states_0: torch.FloatTensor, - res_hidden_states_1: torch.FloatTensor, - res_hidden_states_2: torch.FloatTensor, - temb: torch.FloatTensor | None = None, - encoder_hidden_states: torch.FloatTensor | None = None, - cross_attention_kwargs: dict[str, Any] | None = None, - upsample_size: int | None = None, - attention_mask: torch.FloatTensor | None = None, - encoder_attention_mask: torch.FloatTensor | None = None, -) -> torch.FloatTensor: - res_hidden_states_tuple = (res_hidden_states_0, res_hidden_states_1, res_hidden_states_2) - for resnet, attn in zip(self.resnets, self.attentions): - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - - hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) - - hidden_states = resnet(hidden_states, temb) - hidden_states = attn( - hidden_states, - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - attention_mask=attention_mask, - encoder_attention_mask=encoder_attention_mask, - return_dict=False, - )[0] - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, upsample_size) - - return hidden_states - - -def cacheupblock2d_forward( - self, - hidden_states: torch.FloatTensor, - res_hidden_states_0: torch.FloatTensor, - res_hidden_states_1: torch.FloatTensor, - res_hidden_states_2: torch.FloatTensor, - temb: torch.FloatTensor | None = None, - upsample_size: int | None = None, -) -> torch.FloatTensor: - res_hidden_states_tuple = (res_hidden_states_0, res_hidden_states_1, res_hidden_states_2) - for resnet in self.resnets: - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - - hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) - - hidden_states = resnet(hidden_states, temb) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, upsample_size) - - return hidden_states - - -def cacheunet_forward( - self, - sample: torch.FloatTensor, - timestep: torch.Tensor | float | int, - encoder_hidden_states: torch.Tensor, - class_labels: torch.Tensor | None = None, - timestep_cond: torch.Tensor | None = None, - attention_mask: torch.Tensor | None = None, - cross_attention_kwargs: dict[str, Any] | None = None, - added_cond_kwargs: dict[str, torch.Tensor] | None = None, - down_block_additional_residuals: tuple[torch.Tensor] | None = None, - mid_block_additional_residual: torch.Tensor | None = None, - down_intrablock_additional_residuals: tuple[torch.Tensor] | None = None, - encoder_attention_mask: torch.Tensor | None = None, - return_dict: bool = True, -) -> UNet2DConditionOutput | tuple: - # 1. time - t_emb = self.get_time_embed(sample=sample, timestep=timestep) - emb = self.time_embedding(t_emb, timestep_cond) - aug_emb = None - - aug_emb = self.get_aug_embed( - emb=emb, - encoder_hidden_states=encoder_hidden_states, - added_cond_kwargs=added_cond_kwargs, - ) - - emb = emb + aug_emb if aug_emb is not None else emb - - encoder_hidden_states = self.process_encoder_hidden_states( - encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs - ) - - # 2. pre-process - sample = self.conv_in(sample) - - if hasattr(self, "_export_precess_onnx") and self._export_precess_onnx: - return ( - sample, - encoder_hidden_states, - emb, - ) - - down_block_res_samples = (sample,) - for i, downsample_block in enumerate(self.down_blocks): - if ( - hasattr(downsample_block, "has_cross_attention") - and downsample_block.has_cross_attention - ): - if hasattr(self, "use_trt_infer") and self.use_trt_infer: - feed_dict = { - "hidden_states": sample, - "temb": emb, - "encoder_hidden_states": encoder_hidden_states, - } - down_results = self.engines[f"down_blocks.{i}"](feed_dict, self.cuda_stream) - sample = down_results["sample"] - res_samples_0 = down_results["res_samples_0"] - res_samples_1 = down_results["res_samples_1"] - if "res_samples_2" in down_results: - res_samples_2 = down_results["res_samples_2"] - else: - # For t2i-adapter CrossAttnDownBlock2D - additional_residuals = {} - - sample, res_samples = downsample_block( - hidden_states=sample, - temb=emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - encoder_attention_mask=encoder_attention_mask, - **additional_residuals, - ) - elif hasattr(self, "use_trt_infer") and self.use_trt_infer: - feed_dict = {"hidden_states": sample, "temb": emb} - down_results = self.engines[f"down_blocks.{i}"](feed_dict, self.cuda_stream) - sample = down_results["sample"] - res_samples_0 = down_results["res_samples_0"] - res_samples_1 = down_results["res_samples_1"] - if "res_samples_2" in down_results: - res_samples_2 = down_results["res_samples_2"] - else: - sample, res_samples = downsample_block(hidden_states=sample, temb=emb) - - if hasattr(self, "use_trt_infer") and self.use_trt_infer: - down_block_res_samples += ( - res_samples_0, - res_samples_1, - ) - if "res_samples_2" in down_results: - down_block_res_samples += (res_samples_2,) - else: - down_block_res_samples += res_samples - - if hasattr(self, "use_trt_infer") and self.use_trt_infer: - feed_dict = { - "hidden_states": sample, - "temb": emb, - "encoder_hidden_states": encoder_hidden_states, - } - mid_results = self.engines["mid_block"](feed_dict, self.cuda_stream) - sample = mid_results["sample"] - else: - sample = self.mid_block( - sample, - emb, - encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, - cross_attention_kwargs=cross_attention_kwargs, - encoder_attention_mask=encoder_attention_mask, - ) - - # 5. up - for i, upsample_block in enumerate(self.up_blocks): - res_samples = down_block_res_samples[-len(upsample_block.resnets) :] - down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] - - if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: - if hasattr(self, "use_trt_infer") and self.use_trt_infer: - feed_dict = { - "hidden_states": sample, - "res_hidden_states_0": res_samples[0], - "res_hidden_states_1": res_samples[1], - "res_hidden_states_2": res_samples[2], - "temb": emb, - "encoder_hidden_states": encoder_hidden_states, - } - up_results = self.engines[f"up_blocks.{i}"](feed_dict, self.cuda_stream) - sample = up_results["sample"] - else: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_0=res_samples[0], - res_hidden_states_1=res_samples[1], - res_hidden_states_2=res_samples[2], - encoder_hidden_states=encoder_hidden_states, - cross_attention_kwargs=cross_attention_kwargs, - attention_mask=attention_mask, - encoder_attention_mask=encoder_attention_mask, - ) - elif hasattr(self, "use_trt_infer") and self.use_trt_infer: - feed_dict = { - "hidden_states": sample, - "res_hidden_states_0": res_samples[0], - "res_hidden_states_1": res_samples[1], - "res_hidden_states_2": res_samples[2], - "temb": emb, - } - up_results = self.engines[f"up_blocks.{i}"](feed_dict, self.cuda_stream) - sample = up_results["sample"] - else: - sample = upsample_block( - hidden_states=sample, - temb=emb, - res_hidden_states_0=res_samples[0], - res_hidden_states_1=res_samples[1], - res_hidden_states_2=res_samples[2], - ) - - # 6. post-process - if self.conv_norm_out: - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - if not return_dict: - return (sample,) - - return UNet2DConditionOutput(sample=sample) diff --git a/examples/diffusers/cache_diffusion/pipeline/utils.py b/examples/diffusers/cache_diffusion/pipeline/utils.py deleted file mode 100644 index 7157e3d2c..000000000 --- a/examples/diffusers/cache_diffusion/pipeline/utils.py +++ /dev/null @@ -1,124 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import OrderedDict - -import numpy as np -import tensorrt as trt -import torch -from cuda import cudart -from polygraphy.backend.common import bytes_from_path -from polygraphy.backend.trt import engine_from_bytes - -numpy_to_torch_dtype_dict = { - np.uint8: torch.uint8, - np.int8: torch.int8, - np.int16: torch.int16, - np.int32: torch.int32, - np.int64: torch.int64, - np.float16: torch.float16, - np.float32: torch.float32, - np.float64: torch.float64, - np.complex64: torch.complex64, - np.complex128: torch.complex128, -} - - -class Engine: - def __init__( - self, - ): - self.engine = None - self.context = None - self.buffers = OrderedDict() - self.tensors = OrderedDict() - self.cuda_graph_instance = None # cuda graph - self.has_cross_attention = False - - def __del__(self): - del self.engine - del self.context - del self.buffers - del self.tensors - - def load(self, engine_path): - self.engine = engine_from_bytes(bytes_from_path(engine_path)) - - def activate(self, reuse_device_memory=None): - if reuse_device_memory: - self.context = self.engine.create_execution_context_without_device_memory() # type: ignore[union-attr] - self.context.device_memory = reuse_device_memory - else: - self.context = self.engine.create_execution_context() # type: ignore[union-attr] - - def allocate_buffers(self, shape_dict=None, device="cuda", batch_size=1): - for binding in range(self.engine.num_io_tensors): # type: ignore[union-attr] - name = self.engine.get_tensor_name(binding) # type: ignore[union-attr] - if shape_dict and name in shape_dict: - shape = shape_dict[name] - else: - shape = self.engine.get_tensor_shape(name) # type: ignore[union-attr] - shape = (batch_size * 2, *shape[1:]) - dtype = trt.nptype(self.engine.get_tensor_dtype(name)) # type: ignore[union-attr] - if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT: # type: ignore[union-attr] - self.context.set_input_shape(name, shape) # type: ignore[union-attr] - tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to( - device=device - ) - self.tensors[name] = tensor - - def __call__(self, feed_dict, stream, use_cuda_graph=False): - for name, buf in feed_dict.items(): - self.tensors[name].copy_(buf) - - for name, tensor in self.tensors.items(): - self.context.set_tensor_address(name, tensor.data_ptr()) # type: ignore[union-attr] - - if use_cuda_graph: - if self.cuda_graph_instance is not None: - cuassert(cudart.cudaGraphLaunch(self.cuda_graph_instance, stream)) - cuassert(cudart.cudaStreamSynchronize(stream)) - else: - # do inference before CUDA graph capture - noerror = self.context.execute_async_v3(stream) # type: ignore[union-attr] - if not noerror: - raise ValueError("ERROR: inference failed.") - # capture cuda graph - cuassert( - cudart.cudaStreamBeginCapture( - stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeGlobal - ) - ) - self.context.execute_async_v3(stream) # type: ignore[union-attr] - self.graph = cuassert(cudart.cudaStreamEndCapture(stream)) - self.cuda_graph_instance = cuassert(cudart.cudaGraphInstantiate(self.graph, 0)) - else: - noerror = self.context.execute_async_v3(stream) # type: ignore[union-attr] - if not noerror: - raise ValueError("ERROR: inference failed.") - - return self.tensors - - -def cuassert(cuda_ret): - err = cuda_ret[0] - if err != cudart.cudaError_t.cudaSuccess: - raise RuntimeError( - f"CUDA ERROR: {err}, error code reference: " - "https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__ERROR.html" - ) - if len(cuda_ret) > 1: - return cuda_ret[1] - return None diff --git a/examples/diffusers/requirements.txt b/examples/diffusers/requirements.txt index 3cdac70f8..f5582868b 100644 --- a/examples/diffusers/requirements.txt +++ b/examples/diffusers/requirements.txt @@ -1,4 +1,3 @@ -cuda-python<13 nvtx opencv-python>=4.8.1.78,<4.12.0.88 sentencepiece diff --git a/tests/examples/diffusers/test_cache_diffusion.py b/tests/examples/diffusers/test_cache_diffusion.py index fff3fa4cb..18f495257 100644 --- a/tests/examples/diffusers/test_cache_diffusion.py +++ b/tests/examples/diffusers/test_cache_diffusion.py @@ -59,19 +59,3 @@ def test_pixart_cachify(): # Clear cuda memory as pytest doesnt clear it between tests del pipe torch.cuda.empty_cache() - - -def test_sdxl_benchmarks(tmp_path): - # fmt: off - subprocess.run( - [ - "python", "benchmarks.py", - "--model-id", "sdxl", - "--batch-size", "1", - "--num-iter", "2", - "--output-dir", tmp_path, - ], - cwd=MODELOPT_ROOT / "examples/diffusers/cache_diffusion", - check=True, - ) - # fmt: on