invoke-ai · lstein · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/docs/src/content/docs/configuration/invokeai-yaml.mdx b/docs/src/content/docs/configuration/invokeai-yaml.mdx
@@ -114,6 +114,39 @@ Most common algorithms are supported, like `md5`, `sha256`, and `sha512`. These
 
 These options set the paths of various directories and files used by InvokeAI. Any user-defined paths should be absolute paths.
 
+#### Multi-GPU Generation
+
+On a machine with more than one GPU, InvokeAI can run several generation sessions at the same time — one per GPU — instead of processing the queue one job at a time. Jobs are distributed fairly across users, so a single user's large batch cannot monopolize every GPU while others wait.
+
+This is controlled by the `generation_devices` setting:
+
+```yaml
+generation_devices: auto # default value
+```
+
+| Value              | Behavior                                                                                                |
+| ------------------ | ------------------------------------------------------------------------------------------------------- |
+| `auto`             | Use every available CUDA GPU, running one generation session per GPU concurrently. This is the default. |
+| `[cuda:0, cuda:1]` | Use the specific devices listed, one session per device. Useful for reserving a GPU for other work.     |
+| `[cuda:0]`         | Use a single specific device. Generation runs serially, as it did before multi-GPU support.             |
+| `[]`               | Use the first detected device. Generation runs serially, as it did before multi-GPU support.            |
+
+Each entry in the list must be one of `cpu`, `cuda`, `mps`, or `cuda:N`, where `N` is a zero-based device number (`cuda:0` is the first GPU, `cuda:1` the second, and so on).
+
+```yaml
+# Use the first and third GPUs, leaving the second free for other tasks
+generation_devices: [cuda:0, cuda:2]
+```
+
+Notes:
+
+- On a system without a CUDA GPU, `auto` resolves to the single best available device (`mps` on Apple Silicon, otherwise `cpu`), so generation runs serially.
+- Each active GPU gets its own model cache, and model weights are duplicated in system RAM for every device. Running many GPUs in parallel therefore increases RAM usage — ensure you have ample system memory before enabling a large device list.
+- Duplicate entries are ignored; `[cuda:0, cuda:0]` is treated as `[cuda:0]`.
+- You can restrict which physical GPUs InvokeAI sees with the `CUDA_VISIBLE_DEVICES` environment variable. When set, `auto` only enumerates the visible subset, and `cuda:N` indices refer to positions within that subset.
+
+During parallel generation, the progress display shows one progress bar per active session, stacked vertically, each disappearing as its session completes.
+
 #### Image Subfolder Strategy
 
 By default, generated images are stored in a single flat directory under `outputs/images/`. The `image_subfolder_strategy` setting lets you organize newly-created images into subfolders automatically. You can edit this setting in `invokeai.yaml` or, as an admin user, in the Settings panel.

diff --git a/docs/src/generated/settings.json b/docs/src/generated/settings.json
@@ -490,6 +490,17 @@
       "type": "<class 'str'>",
       "validation": {}
     },
+    {
+      "category": "DEVICE",
+      "default": "auto",
+      "description": "Devices to use for parallel generation. `auto` (the default) uses every available GPU, running one generation session per GPU concurrently and distributing jobs fairly across users. Provide an explicit list (e.g. `[cuda:0, cuda:1]`) to use specific devices, or a single-device list (e.g. `[cuda:0]`) to run serially. On systems without a GPU, `auto` resolves to the single `cpu`/`mps` device.<br>Valid values: `auto`, or a list whose entries are each `cpu`, `cuda`, `mps`, or `cuda:N` (where N is a device number)",
+      "env_var": "INVOKEAI_GENERATION_DEVICES",
+      "literal_values": [],
+      "name": "generation_devices",
+      "required": false,
+      "type": "typing.Union[typing.Literal['auto'], list[str]]",
+      "validation": {}
+    },
     {
       "category": "DEVICE",
       "default": "auto",

@@ -1,15 +1,16 @@
 import locale
+import re
 from enum import Enum
 from importlib.metadata import distributions
 from pathlib import Path as FilePath
 from threading import Lock
-from typing import Any
+from typing import Any, Literal, Union
 
 import torch
 import yaml
 from fastapi import Body, HTTPException, Path
 from fastapi.routing import APIRouter
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, field_validator, model_validator
 
 from invokeai.app.api.auth_dependencies import AdminUserOrDefault
 from invokeai.app.api.dependencies import ApiDependencies
@@ -118,6 +119,16 @@ def _remove_nullable_default_from_schema(schema: dict[str, Any]) -> None:
             schema.update(non_null_schemas[0])
 
 
+# Matches one explicit generation device: `cpu`, `mps`, `cuda`, or `cuda:N` where N is one or more digits.
+# The special value `auto` is intentionally not matched here; the request validator accepts it
+# before consulting this pattern.
+_GENERATION_DEVICE_PATTERN = re.compile(r"^(cpu|mps|cuda(:\d+)?)$")
+
+
+class GenerationDeviceOption(BaseModel):
+    """A device that may be selected for generation."""
+
+    # Response item of the `/generation_device_options` endpoint.
+    # `device` is the identifier to use in the `generation_devices` setting; `name` is a label for display.
+    device: str = Field(description="The device identifier, e.g. 'cuda:0', 'mps', or 'cpu'")
+    name: str = Field(description="Human-readable device name")
+
+
 class UpdateAppGenerationSettingsRequest(BaseModel):
     """Writable generation-related app settings."""
 
@@ -131,14 +142,59 @@ class UpdateAppGenerationSettingsRequest(BaseModel):
         ge=0,
         description="Keep the last N completed, failed, and canceled queue items on startup. Set to 0 to prune all terminal items.",
     )
+    generation_devices: Union[Literal["auto"], list[str]] | None = Field(
+        default=None,
+        description="Devices to use for parallel generation. `auto` uses every available GPU; provide an explicit list (e.g. `[cuda:0, cuda:1]`) to use specific devices. Takes effect after restarting InvokeAI.",
+        json_schema_extra=_remove_nullable_default_from_schema,
+    )
+
+    @field_validator("generation_devices")
+    @classmethod
+    def validate_generation_devices(
+        cls, v: Union[Literal["auto"], list[str], None]
+    ) -> Union[Literal["auto"], list[str], None]:
+        if v is None or v == "auto":
+            return v
+        for device in v:
+            if not _GENERATION_DEVICE_PATTERN.match(device):
+                raise ValueError(
+                    f"Invalid generation device '{device}'. Valid values are 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
+                )
+        return v
 
     @model_validator(mode="after")
     def validate_explicit_nulls(self) -> "UpdateAppGenerationSettingsRequest":
+        # `model_fields_set` holds only the fields the caller actually sent, so an omitted field
+        # (left at its None default) is accepted while an explicit null is rejected.
         if "image_subfolder_strategy" in self.model_fields_set and self.image_subfolder_strategy is None:
             raise ValueError("image_subfolder_strategy may not be null")
+        if "generation_devices" in self.model_fields_set and self.generation_devices is None:
+            raise ValueError("generation_devices may not be null")
         return self
 
 
+@app_router.get(
+    "/generation_device_options",
+    operation_id="get_generation_device_options",
+    status_code=200,
+    response_model=list[GenerationDeviceOption],
+)
+async def get_generation_device_options() -> list[GenerationDeviceOption]:
+    """List the devices available for generation, for use with the `generation_devices` setting."""
+    options: list[GenerationDeviceOption] = []
+    if torch.cuda.is_available():
+        for index in range(torch.cuda.device_count()):
+            device = f"cuda:{index}"
+            try:
+                name = torch.cuda.get_device_name(index)
+            except Exception:
+                name = device
+            options.append(GenerationDeviceOption(device=device, name=name))
+    elif torch.backends.mps.is_available():
+        options.append(GenerationDeviceOption(device="mps", name="Apple MPS"))
+    else:
+        options.append(GenerationDeviceOption(device="cpu", name="CPU"))
+    return options
+
+
 @app_router.get(
     "/runtime_config", operation_id="get_runtime_config", status_code=200, response_model=InvokeAIAppConfigWithSetFields
 )

@@ -443,7 +443,11 @@ async def update_model_record(
         # nn.Module at load time, so toggling them on a cached model is otherwise silently a no-op until
         # the entry is evicted. Drop any unlocked cached entries for this model so the next load rebuilds.
         if _load_settings_changed(previous_config, config):
-            dropped = ApiDependencies.invoker.services.model_manager.load.ram_cache.drop_model(key)
+            # Drop the model from every per-device cache so the next load on any GPU rebuilds it.
+            dropped = sum(
+                cache.drop_model(key)
+                for cache in ApiDependencies.invoker.services.model_manager.load.ram_caches.values()
+            )
             if dropped:
                 logger.info(
                     f"Dropped {dropped} cached entr{'y' if dropped == 1 else 'ies'} for model {key} after settings change."
@@ -1304,9 +1308,10 @@ async def get_stats() -> Optional[CacheStats]:
 )
 async def empty_model_cache(current_admin: AdminUserOrDefault) -> None:
     """Drop all models from the model cache to free RAM/VRAM. 'Locked' models that are in active use will not be dropped."""
-    # Request 1000GB of room in order to force the cache to drop all models.
+    # Request 1000GB of room in order to force each per-device cache to drop all models.
     ApiDependencies.invoker.services.logger.info("Emptying model cache.")
-    ApiDependencies.invoker.services.model_manager.load.ram_cache.make_room(1000 * 2**30)
+    for cache in ApiDependencies.invoker.services.model_manager.load.ram_caches.values():
+        cache.make_room(1000 * 2**30)
 
 
 class HFTokenStatus(str, Enum):

@@ -608,7 +608,7 @@ def _run_transformer(ctx: torch.Tensor, x: torch.Tensor, t: torch.Tensor) -> tor
 
             if driver is not None:
                 user_step = 0
-                pbar = tqdm(total=total_steps, desc="Denoising (Anima)")
+                pbar = tqdm(total=total_steps, desc=f"Denoising (Anima){TorchDevice.get_session_device_label()}")
                 for it in driver.iterations():
                     timestep = torch.tensor(
                         [it.sigma_curr * ANIMA_MULTIPLIER], device=device, dtype=inference_dtype
@@ -655,7 +655,9 @@ def _run_transformer(ctx: torch.Tensor, x: torch.Tensor, t: torch.Tensor) -> tor
                 pbar.close()
             else:
                 # Built-in Euler implementation (default for Anima)
-                for step_idx in tqdm(range(total_steps), desc="Denoising (Anima)"):
+                for step_idx in tqdm(
+                    range(total_steps), desc=f"Denoising (Anima){TorchDevice.get_session_device_label()}"
+                ):
                     sigma_curr = sigmas[step_idx]
                     sigma_prev = sigmas[step_idx + 1]
 

@@ -294,7 +294,7 @@ def _run_diffusion(
             assert isinstance(transformer, CogView4Transformer2DModel)
 
             # Denoising loop
-            for step_idx in tqdm(range(total_steps)):
+            for step_idx in tqdm(range(total_steps), desc=f"Denoising{TorchDevice.get_session_device_label()}"):
                 t_curr = timesteps[step_idx]
                 sigma_curr = sigmas[step_idx]
                 sigma_prev = sigmas[step_idx + 1]

@@ -284,7 +284,10 @@ def _run_diffusion(
             assert isinstance(transformer, SD3Transformer2DModel)
 
             # 6. Denoising loop
-            for step_idx, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
+            for step_idx, (t_curr, t_prev) in tqdm(
+                list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True))),
+                desc=f"Denoising{TorchDevice.get_session_device_label()}",
+            ):
                 # Expand the latents if we are doing CFG.
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 # Expand the timestep to match the latent model input.

@@ -569,7 +569,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor:
                 # Use diffusers scheduler for stepping
                 # Use tqdm with total_steps (user-facing steps) not num_scheduler_steps (internal steps)
                 # This ensures progress bar shows 1/8, 2/8, etc. even when scheduler uses more internal steps
-                pbar = tqdm(total=total_steps, desc="Denoising")
+                pbar = tqdm(total=total_steps, desc=f"Denoising{TorchDevice.get_session_device_label()}")
                 for step_index in range(num_scheduler_steps):
                     sched_timestep = scheduler.timesteps[step_index]
                     # Convert scheduler timestep (0-1000) to normalized sigma (0-1)
@@ -686,7 +686,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor:
                 pbar.close()
             else:
                 # Original Euler implementation (default, optimized for Z-Image)
-                for step_idx in tqdm(range(total_steps)):
+                for step_idx in tqdm(range(total_steps), desc=f"Denoising{TorchDevice.get_session_device_label()}"):
                     sigma_curr = sigmas[step_idx]
                     sigma_prev = sigmas[step_idx + 1]
 

@@ -11,7 +11,7 @@
 import shutil
 from functools import lru_cache
 from pathlib import Path
-from typing import Any, Literal, Optional
+from typing import Any, Literal, Optional, Union
 
 import yaml
 from pydantic import BaseModel, Field, PrivateAttr, field_validator
@@ -205,6 +205,7 @@ class InvokeAIAppConfig(BaseSettings):
 
     # DEVICE
     device:                      str = Field(default="auto",                description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda`, `mps`, `cuda:N` (where N is a device number)", pattern=r"^(auto|cpu|mps|cuda(:\d+)?)$")
+    generation_devices: Union[Literal["auto"], list[str]] = Field(default="auto", description="Devices to use for parallel generation. `auto` (the default) uses every available GPU, running one generation session per GPU concurrently and distributing jobs fairly across users. Provide an explicit list (e.g. `[cuda:0, cuda:1]`) to use specific devices, or a single-device list (e.g. `[cuda:0]`) to run serially. On systems without a GPU, `auto` resolves to the single `cpu`/`mps` device.<br>Valid values: `auto`, or a list whose entries are each `cpu`, `cuda`, `mps`, or `cuda:N` (where N is a device number)")
     precision:                PRECISION = Field(default="auto",             description="Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.")
 
     # GENERATION
@@ -257,6 +258,19 @@ class InvokeAIAppConfig(BaseSettings):
 
     model_config = SettingsConfigDict(env_prefix="INVOKEAI_", env_ignore_empty=True)
 
+    @field_validator("generation_devices")
+    @classmethod
+    def validate_generation_devices(cls, v: Union[str, list[str]]) -> Union[str, list[str]]:
+        if v == "auto":
+            return v
+        pattern = re.compile(r"^(cpu|mps|cuda(:\d+)?)$")
+        for device in v:
+            if not pattern.match(device):
+                raise ValueError(
+                    f"Invalid generation device '{device}'. Valid values are 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
+                )
+        return v
+
     def update_config(self, config: dict[str, Any] | InvokeAIAppConfig, clobber: bool = True) -> None:
         """Updates the config, overwriting existing values.
 

@@ -138,6 +138,10 @@ class InvocationProgressEvent(InvocationEventBase):
     image: ProgressImage | None = Field(
         default=None, description="An image representing the current state of the progress"
     )
+    device: str | None = Field(
+        default=None,
+        description="The device processing this session, e.g. 'cuda:1' (set only when running on a CUDA GPU)",
+    )
 
     @classmethod
     def build(
@@ -148,6 +152,13 @@ def build(
         percentage: float | None = None,
         image: ProgressImage | None = None,
     ) -> "InvocationProgressEvent":
+        # This is emitted from the session-processor worker thread, which pins its CUDA device via
+        # TorchDevice.set_session_device(). Resolve that here so the UI can label progress by GPU.
+        from invokeai.backend.util.devices import TorchDevice
+
+        session_device = TorchDevice.get_session_device()
+        device = str(session_device) if session_device is not None and session_device.type == "cuda" else None
+
         return cls(
             queue_id=queue_item.queue_id,
             item_id=queue_item.item_id,
@@ -161,6 +172,7 @@ def build(
             percentage=percentage,
             image=image,
             message=message,
+            device=device,
         )