diff --git a/diffsynth/core/vram/layers.py b/diffsynth/core/vram/layers.py
index 751792d0..b0179b91 100644
--- a/diffsynth/core/vram/layers.py
+++ b/diffsynth/core/vram/layers.py
@@ -64,7 +64,11 @@ def cast_to(self, weight, dtype, device):
 
     def check_free_vram(self):
         device = self.computation_device if self.computation_device != "npu" else "npu:0"
-        gpu_mem_state = getattr(torch, self.computation_device_type).mem_get_info(device)
+        device_module = getattr(torch, self.computation_device_type, None)
+        # Only CUDA and NPU have mem_get_info, for MPS/CPU assume enough memory
+        if device_module is None or not hasattr(device_module, "mem_get_info"):
+            return True
+        gpu_mem_state = device_module.mem_get_info(device)
         used_memory = (gpu_mem_state[1] - gpu_mem_state[0]) / (1024**3)
         return used_memory < self.vram_limit
 
diff --git a/diffsynth/diffusion/base_pipeline.py b/diffsynth/diffusion/base_pipeline.py
index fa355a16..4d90ff29 100644
--- a/diffsynth/diffusion/base_pipeline.py
+++ b/diffsynth/diffusion/base_pipeline.py
@@ -155,7 +155,10 @@ def load_models_to_device(self, model_names):
                     for module in model.modules():
                         if hasattr(module, "offload"):
                             module.offload()
-        getattr(torch, self.device_type).empty_cache()
+        # Clear cache if available (only CUDA has empty_cache)
+        device_module = getattr(torch, self.device_type, None)
+        if device_module is not None and hasattr(device_module, "empty_cache"):
+            device_module.empty_cache()
         # onload models
         for name, model in self.named_children():
             if name in model_names:
diff --git a/diffsynth/models/dinov3_image_encoder.py b/diffsynth/models/dinov3_image_encoder.py
index be2ee587..70eec5b2 100644
--- a/diffsynth/models/dinov3_image_encoder.py
+++ b/diffsynth/models/dinov3_image_encoder.py
@@ -70,7 +70,10 @@ def __init__(self):
             }
         )
 
-    def forward(self, image, torch_dtype=torch.bfloat16, device="cuda"):
+    def forward(self, image, torch_dtype=torch.bfloat16, device=None):
+        # Use model's device if not specified
+        if device is None:
+            device = next(self.parameters()).device
         inputs = self.processor(images=image, return_tensors="pt")
         pixel_values = inputs["pixel_values"].to(dtype=torch_dtype, device=device)
         bool_masked_pos = None
diff --git a/diffsynth/models/siglip2_image_encoder.py b/diffsynth/models/siglip2_image_encoder.py
index 10184f85..dbc3a0e1 100644
--- a/diffsynth/models/siglip2_image_encoder.py
+++ b/diffsynth/models/siglip2_image_encoder.py
@@ -47,7 +47,10 @@ def __init__(self):
             }
         )
 
-    def forward(self, image, torch_dtype=torch.bfloat16, device="cuda"):
+    def forward(self, image, torch_dtype=torch.bfloat16, device=None):
+        # Use model's device if not specified
+        if device is None:
+            device = next(self.parameters()).device
         pixel_values = self.processor(images=[image], return_tensors="pt")["pixel_values"]
         pixel_values = pixel_values.to(device=device, dtype=torch_dtype)
         output_attentions = False