From 825790ea526a7a3eb19996a4bee1b9fce41bf18b Mon Sep 17 00:00:00 2001 From: oscardev256 <42308241+oscardev256@users.noreply.github.com> Date: Sat, 8 Nov 2025 23:11:33 -0500 Subject: [PATCH 1/8] Create isaac.py Signed-off-by: Oscar Gonzalez --- vllm/model_executor/models/isaac.py | 1490 +++++++++++++++++++++++++++ 1 file changed, 1490 insertions(+) create mode 100644 vllm/model_executor/models/isaac.py diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py new file mode 100644 index 000000000000..4f29d1cff347 --- /dev/null +++ b/vllm/model_executor/models/isaac.py @@ -0,0 +1,1490 @@ +from __future__ import annotations + +from collections.abc import Mapping, Sequence, Iterable +from typing import Any, Optional, Union +from typing_extensions import TypedDict, Unpack + +import itertools +from enum import Enum +from dataclasses import dataclass + +import math +import numpy as np +import PIL.Image +import torch +import torch.nn as nn +import torch.nn.functional as F + +from transformers import PretrainedConfig, Qwen3Config +from transformers.image_processing_utils import BatchFeature +from transformers.tokenization_utils import TensorType +from transformers.models.siglip2.modeling_siglip2 import ( + Siglip2MLP, +) +from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig + +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.model_executor.models.interfaces import SupportsMultiModal +from vllm.model_executor.models.utils import ( + WeightsMapper, + AutoWeightsLoader, + _merge_multimodal_embeddings, +) +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal.processing import ( + BaseMultiModalProcessor, + BaseProcessingInfo, + PromptReplacement, +) +from vllm.multimodal.parse import MultiModalDataItems, ImageSize +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.inputs import ( + MultiModalFieldConfig, + MultiModalKwargs, + MultiModalDataDict, +) +from vllm.config import VllmConfig +from vllm.model_executor.models.interfaces import ( + MultiModalEmbeddings, + SupportsLoRA, + SupportsMRoPE, + SupportsMultiModal, + SupportsPP, +) + +# ===== TensorStream Compatibility Layer for Isaac MRoPE ===== +# Minimal implementation of TensorStream classes needed for Isaac's 3D positional encoding + +class ModalityType(Enum): + """ + Base class for modality-type enumerations. + Each derived class (VisionType, TextType) holds + an integer value that identifies a specific modality. + + Example usage: + If you have an object `my_event` of class `Event`, + you might write: + if my_event.type == VisionType.image: + # process an image frame + + The methods below implement ordering and hashing + based on the integer `.value` of each enum member. + """ + + @property + def modality(self): + return self.__class__ + + def __lt__(self, other): + if isinstance(other, ModalityType): + return self.value < other.value + raise NotImplementedError() + + def __eq__(self, other): + if isinstance(other, ModalityType): + return self.value == other.value + raise NotImplementedError() + + def __hash__(self): + return hash(self.value) + + +# NOTE: modality types need to be unique +class VisionType(ModalityType): + """ + Enum for vision modalities such as key video frames. + Typically used in video processing or image sequences. + + Members: + image: A single image frame. 
+    """
+
+    image = 0
+
+
+class TextType(ModalityType):
+    """
+    Enum for text tokens and padding.
+
+    Members:
+        text: Actual textual tokens.
+        padding: Padding tokens used in sequence batching.
+    """
+
+    text = 1
+    padding = 2
+
+
+@dataclass
+class Event:
+    """
+    Represents a single modality event with spatial/temporal dimensions.
+
+    Attributes:
+        modality_type (ModalityType): The modality of this event
+            (e.g., VisionType.image or TextType.text).
+        dims_virtual (list[int] | None): Virtual/processed dimensions (e.g., after
+            pixel shuffle), laid out as (time, height, width) for vision events or
+            (length, 1) for text events.
+        dims_real (list[int] | None): Real/actual tensor dimensions before any merging.
+        idx_range (tuple[int, int] | None): Half-open token range of this event within
+            its flattened sequence, allowing partially materialized events.
+
+    Example usage:
+        evt = Event(modality_type=VisionType.image,
+                    dims_virtual=[1, 16, 16],
+                    idx_range=(0, 256))
+    """
+    # Descriptors
+    modality_type: ModalityType
+
+    # Structure
+    dims_virtual: list[int] | None = None  # virtual/processed dimensions (e.g., pixel-shuffled)
+    dims_real: list[int] | None = None  # real/actual tensor dimensions
+    idx_range: tuple[int, int] | None = None
+
+    def dims(self, virtual: bool = True) -> list[int] | None:
+        """
+        Get the dimensions of this event.
+
+        Args:
+            virtual: If True (default), return virtual/processed dimensions (e.g., pixel-shuffled).
+                If False, return real/actual tensor dimensions.
+
+        Returns:
+            Dimensions list or None if not measured.
+        """
+        if virtual:
+            return self.dims_virtual
+        else:
+            return self.dims_real
+
+    def num_tokens(self, partial=True, virtual=True) -> int:
+        if not virtual:
+            assert partial is False and self.dims_real is not None
+            return math.prod(self.dims(virtual=False))
+        return self.idx_range[1] - self.idx_range[0] if partial else math.prod(self.dims())
+
+
+@dataclass
+class Stream:
+    """
+    Represents an ordered sequence of Event objects, each with a specific
+    ModalityType and token range, belonging to a single sample.
+
+    Attributes:
+        events (list[Event]): The list of Event objects in the stream.
+
+    Example usage:
+        # One image grid followed by a text segment
+        evt1 = Event(modality_type=VisionType.image, dims_virtual=[1, 4, 4], idx_range=(0, 16))
+        evt2 = Event(modality_type=TextType.text, dims_virtual=[16, 1], idx_range=(0, 16))
+
+        s = Stream(events=[evt1, evt2])
+        print(len(s))
+    """
+
+    events: list[Event]
+
+    def __len__(self):
+        """Returns the number of Event objects in this Stream."""
+        return len(self.events)
+
+    def __getitem__(self, key: int) -> Stream | Event:
+        return self.events[key]
+
+    def __iter__(self):
+        """
+        Yields each Event in the Stream, enabling iteration like:
+            for event in my_stream:
+                ...
+ """ + yield from self.events + + +# TODO: implement all types of cool indexing which can happen since TensorStream assuems Event.data = Tensor +@dataclass +class TensorStream: + streams: list[Stream] + _device: torch.device | None = None + + @property + def device(self): + return self._device + + @property + def shape(self): + seq_lens = [sum([ev.num_tokens() for ev in stream]) for stream in self.streams] + assert all([sl == seq_lens[0] for sl in seq_lens]), ( + f"each stream must have same token count to have a shape: {seq_lens}" + ) + return (len(seq_lens), seq_lens[0]) + + +def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Tensor: + """ + Create a (batch, T, n_pos_dims) position tensor in one sweep. + The first dim is the running “time” index, the rest are spatial (or 1-fillers). + + Args: + ts : TensorStream + n_pos_dims : total coordinate dimensions (default 3) + + Returns: + torch.LongTensor - shape (batch_size, seq_len, n_pos_dims) + """ + + # Manually iterate through streams and events like map_compact does, + # but maintain cumulative time offset for each stream + all_coords = [] + for stream in ts.streams: # one Stream == one batch sample + cumulative_offset = 0 # running time index for this stream + + for event in stream: + # --- build coordinate grid for THIS event using itertools (no tensor ops) --- + dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or [])) + + # Create ranges for each dimension (similar to old _finalize implementation) + first_dim = range(cumulative_offset, cumulative_offset + dims[0]) + cumulative_offset += dims[0] # advance time for the next event + other_dims = [range(d) for d in dims[1:]] + + # Use itertools.product to create all coordinate combinations + full_coords = list(itertools.product(first_dim, *other_dims)) + + # Slice if the event is partial + s, e = event.idx_range + coords = full_coords[s:e] + + # Extend the flattened coordinate list + all_coords.extend(coords) + + # Convert to tensor and reshape to (B, T, n_pos_dims) + B, T = ts.shape + return torch.tensor(all_coords, dtype=torch.long, device=ts.device).reshape(B, T, n_pos_dims) + + +def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor: + """Create boolean mask for specific modality type in the tensor stream.""" + B, T = ts.shape + mask = torch.zeros((B, T), dtype=torch.bool, device=ts.device) + + for batch_idx, stream in enumerate(ts.streams): + seq_idx = 0 + for event in stream: + if event.modality_type == modality_type: + start, end = event.idx_range + mask[batch_idx, seq_idx:seq_idx+(end-start)] = True + seq_idx += (event.idx_range[1] - event.idx_range[0]) + + return mask + +# ===== End TensorStream Compatibility Layer ===== + +class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): + """Vision configuration for Isaac with Pixel Shuffle support. + + Extends Siglip2VisionConfig with additional fields for pixel shuffle. 
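+
+    Added fields (as defined by ``__init__`` below):
+        pixel_shuffle_scale_factor (`int`, *optional*, defaults to 1):
+            Spatial down-scaling factor applied to the encoder output via pixel shuffle.
+        num_patches (`int`, *optional*, defaults to 256):
+            Size of the learned positional-embedding table that is bilinearly resized
+            to each image's patch grid.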
+ """ + + model_type = "pixel_shuffle_siglip2" + base_config_key = "vision_config" + + def __init__( + self, + pixel_shuffle_scale_factor: int = 1, + num_patches: int = 256, + **kwargs, + ): + super().__init__(**kwargs) + + # Add our custom fields + self.pixel_shuffle_scale_factor = pixel_shuffle_scale_factor + self.num_patches = num_patches + + +def create_cumulative_seq_lengths(seq_sizes: torch.Tensor, device: torch.device) -> tuple[torch.Tensor, int]: + """Create cumulative sequence lengths for variable-length attention.""" + cu_seqlens = torch.zeros(len(seq_sizes) + 1, dtype=torch.int32, device=device) + cu_seqlens[1:] = seq_sizes.cumsum(0) + max_seqlen = int(seq_sizes.max().item()) if len(seq_sizes) > 0 else 0 + return cu_seqlens, max_seqlen + + +class Siglip2VariableSequenceEmbeddings(nn.Module): + def __init__(self, config: PixelShuffleSiglip2VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Linear( + in_features=config.num_channels * self.patch_size * self.patch_size, + out_features=self.embed_dim, + ) + + self.num_patches = config.num_patches + self.position_embedding_size = int(self.num_patches**0.5) + self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim) + + def positional_embeddings( + self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Tensor] + ) -> torch.Tensor: + # Prepare positional embeddings grid: (1, embed_dim, h, w) + positional_embeddings = ( + self.position_embedding.weight.reshape(self.position_embedding_size, self.position_embedding_size, -1) + .permute(2, 0, 1) + .unsqueeze(0) + ) + + _seq_patches, _seq_sizes, spatial_shapes = packed_seq_patches + pos_embeds_list = [] + mode = "bilinear" + align_corners = False + antialias = True + for spatial_shape in spatial_shapes: + height, width = spatial_shape + # Guard to ensure height and width are positive for torch.compile + if height > 0 and width > 0: + resized_pos_embed = F.interpolate( + positional_embeddings, + size=(height, width), + mode=mode, + align_corners=align_corners, + antialias=antialias, + ) + # Reshape from (1, embed_dim, height, width) to (height*width, embed_dim) + resized_pos_embed = resized_pos_embed.reshape(self.embed_dim, height * width).transpose(0, 1) + else: + # Fallback - should never happen in practice + resized_pos_embed = positional_embeddings.reshape( + self.embed_dim, self.position_embedding_size * self.position_embedding_size + ).transpose(0, 1)[: height * width] + pos_embeds_list.append(resized_pos_embed) + + # Concatenate all positional embeddings along the sequence dimension + pos_embeds = torch.cat(pos_embeds_list, dim=0) + return pos_embeds + + def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Tensor]): + seq_patches, _seq_sizes, _spatial_shapes = packed_seq_patches + + # Apply patch embeddings + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(seq_patches.to(dtype=target_dtype)) + pos_embeds = self.positional_embeddings(packed_seq_patches) + + # Flatten patch embeddings to match positional embeddings format + # From [batch, patches_per_image, embed_dim] to [total_patches, embed_dim] + batch_size, patches_per_image, embed_dim = patch_embeds.shape + + # For variable-length attention, we need to reshape to (total_tokens, embed_dim) + if batch_size != 1: + raise ValueError("Variable-length attention expects batch_size=1 for packed sequences") + + patch_embeds = 
patch_embeds.view(batch_size * patches_per_image, embed_dim) + + # Add positional embeddings to patch embeddings + embeddings = patch_embeds + pos_embeds + return embeddings + + +class Siglip2VariableLengthAttention(nn.Module): + """Custom attention that supports variable-length sequences with flash attention.""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward(self, hidden_states, cu_seqlens=None, max_seqlen=None): + batch_size, seq_len, _ = hidden_states.size() + + # For variable-length attention, we need to reshape to (total_tokens, embed_dim) + if batch_size != 1: + raise ValueError("Variable-length attention expects batch_size=1 for packed sequences") + hidden_states = hidden_states.squeeze(0) # Remove batch dimension: (seq_len, embed_dim) + + # Store original dtype + orig_dtype = hidden_states.dtype + + # 1. Linear projections + Q = self.q_proj(hidden_states) # (seq_len, embed_dim) + K = self.k_proj(hidden_states) # (seq_len, embed_dim) + V = self.v_proj(hidden_states) # (seq_len, embed_dim) + + # 2. Reshape for multi-head attention: (seq_len, n_heads, head_dim) + Q = Q.view(-1, self.num_heads, self.embed_dim // self.num_heads) + K = K.view(-1, self.num_heads, self.embed_dim // self.num_heads) + V = V.view(-1, self.num_heads, self.embed_dim // self.num_heads) + + # 3. Apply variable-length attention using flash attention + attn_output, _, _, _, _ = torch.ops.aten._flash_attention_forward( + query=Q, + key=K, + value=V, + cum_seq_q=cu_seqlens, + cum_seq_k=cu_seqlens, + max_q=max_seqlen, + max_k=max_seqlen, + dropout_p=self.dropout if self.training else 0.0, + is_causal=False, + return_debug_mask=False, + scale=self.scale, + window_size_left=-1, + window_size_right=-1, + alibi_slopes=None, + ) + + # 4. Reshape attention output from (seq_len, n_heads, head_dim) to (seq_len, embed_dim) + attn_output = attn_output.reshape(seq_len, self.embed_dim) + + # 5. Convert back to original dtype if needed + if attn_output.dtype != orig_dtype: + attn_output = attn_output.to(orig_dtype) + + # 6. Project output + attn_output = self.out_proj(attn_output) # (seq_len, embed_dim) + + # 7. 
Add back batch dimension for compatibility + attn_output = attn_output.unsqueeze(0) # (1, seq_len, embed_dim) + + return attn_output, None + + +class IsaacSiglip2EncoderLayer(nn.Module): + """Siglip2 encoder layer with variable-length attention.""" + + def __init__(self, config: PixelShuffleSiglip2VisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Siglip2VariableLengthAttention(config) + + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = Siglip2MLP(config) # Use HF's Siglip2MLP + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor = None, + max_seqlen: int = None, + ) -> tuple[torch.FloatTensor]: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return (hidden_states,) + + +class IsaacEncoder(nn.Module): + """Encoder using Isaac encoder layers with variable-length attention support.""" + + def __init__(self, config: PixelShuffleSiglip2VisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([IsaacSiglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + inputs_embeds, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: int | None = None, + output_hidden_states: bool = False, + ): + all_hidden_states = () if output_hidden_states else None + + hidden_states = inputs_embeds + + for encoder_layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = encoder_layer( + hidden_states, + cu_seqlens, + max_seqlen, + ) + + hidden_states = layer_outputs[0] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + return hidden_states, all_hidden_states, None + + +def create_pixel_shuffle_index_map( + seq_sizes: torch.Tensor, + token_grids: torch.Tensor, + scale_factor: int = 1, + device: torch.device | None = None, +) -> torch.Tensor: + """ + Build a gather-index map that tells us, for every *output* token after + pixel-shuffle, which `scale_factor**2` *input* tokens are being merged. + + Args + ---- + seq_sizes : (num_images,) - #patches in each image (row-major order) + token_grids : (num_images,2) - (height, width) for every image + scale_factor : spatial down-scale factor (≥2) + device : (optional) overrides `seq_sizes.device` + + Returns + ------- + gather_idx : (new_total_seq_len, scale_factor**2) int64 tensor. + gather_idx[i, j] is the *flat* index into the *original* + packed sequence for the j-th sub-patch that forms the + i-th output token. 
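+
+    Example
+    -------
+    Illustrative sketch with assumed values (a single 4x4 patch grid, scale_factor=2):
+
+    >>> seq_sizes = torch.tensor([16])
+    >>> token_grids = torch.tensor([[4, 4]])
+    >>> idx = create_pixel_shuffle_index_map(seq_sizes, token_grids, scale_factor=2)
+    >>> idx.shape
+    torch.Size([4, 4])
+    >>> idx[0].tolist()  # first output token merges patches (0,0), (0,1), (1,0), (1,1)
+    [0, 1, 4, 5]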
+ """ + if device is None: + device = seq_sizes.device + + r = int(scale_factor) + if r < 2: + raise ValueError("`scale_factor` must be ≥ 2") + + # Safety: all spatial dims must be divisible by r + # Cannot run under torch compile fullgraph mode hence + if not torch.compiler.is_compiling(): + if not ((token_grids[:, 0] % r == 0).all() and (token_grids[:, 1] % r == 0).all()): + raise AssertionError( + f"Every (H,W) in `token_grids` must be divisible by scale_factor={r}, got {token_grids.tolist()}" + ) + + gather_chunks: list[torch.Tensor] = [] + tok_offset = 0 + + for seq_len, (h, w) in zip(seq_sizes.tolist(), token_grids.tolist(), strict=False): + # Build the (H, W) grid of flat indices for this image + grid = torch.arange(seq_len, device=device, dtype=torch.int64) + tok_offset + grid = grid.view(h, w) # (H, W) + + # -------- identical ordering to your fixed-res routine -------- + # Step 1: split width into blocks of r + grid = grid.view(h, w // r, r) # (H, W/r, r) + # Step 2: now split height into blocks of r + grid = grid.view(h // r, r, w // r, r) # (H/r, r, W/r, r) + # Step 3: final permutation to (H/r, W/r, r, r) + grid = grid.permute(0, 2, 1, 3).contiguous() # (H/r, W/r, r, r) + # Step 4: each (r, r) block forms one output token + gather_chunks.append(grid.reshape(-1, r * r)) # (H*W / r², r²) + + tok_offset += seq_len + + # Concatenate over all images in the packed batch + gather_idx = torch.cat(gather_chunks, dim=0) # (Σ_i HᵢWᵢ/r², r²) + return gather_idx + + +def pixel_shuffle_varlen( + x: torch.Tensor, + token_grids: torch.Tensor, + scale_factor: int = 1, +) -> torch.Tensor: + r"""Apply pixel shuffle to a packed vision sequence without unpacking per image. + + Args: + x (`torch.Tensor`): + Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or `(1, seq_len, hidden_size)` shapes + produced by stacking image patches. + token_grids (`torch.Tensor`): + Integer tensor of shape `(num_images, 2)` whose rows give the `(height, width)` patch grid sizes + corresponding to each image segment inside `x`. + scale_factor (`int`, *optional*, defaults to 1): + Spatial down-sampling factor specific to pixel shuffle. Values greater than one merge `scale_factor**2` neighboring patches into a + single embedding channel-group. + + Returns: + `torch.Tensor`: Pixel-shuffled embeddings with shape matching the input convention: + `(seq_len, hidden_size * scale_factor**2)` when the input was 2D, or `(1, seq_len, hidden_size * scale_factor**2)` + if the singleton batch dimension was present. + + Raises: + ValueError: If more than one batch item is provided. 
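+
+    Example:
+        Shape-only sketch with assumed inputs (one 4x4 grid of 64-dim patches):
+
+        >>> x = torch.randn(16, 64)
+        >>> token_grids = torch.tensor([[4, 4]])
+        >>> pixel_shuffle_varlen(x, token_grids, scale_factor=2).shape
+        torch.Size([4, 256])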
+ """ + keep_batch_dim = x.dim() == 3 + if keep_batch_dim: + if x.size(0) != 1: + raise AssertionError("Packed sequence is expected to have batch_size == 1") + x_ = x.squeeze(0) # (seq, embed) + else: + x_ = x # (seq, embed) + + embed_dim = x_.size(-1) + r = int(scale_factor) + + # Calculate seq_sizes from token_grids + seq_sizes = torch.prod(token_grids, dim=-1) + + # Build index map and gather in one go + gather_idx = create_pixel_shuffle_index_map( + seq_sizes=seq_sizes, + token_grids=token_grids, + scale_factor=r, + device=x_.device, + ) # (new_seq, r²) + + # Gather → (new_seq, r², embed_dim) + gathered = x_[gather_idx] # fancy indexing keeps gradient + + # Merge the r² group dimension into channels to finish the shuffle + out = gathered.reshape(gathered.size(0), embed_dim * r * r) + + # Restore batch dimension if needed + if keep_batch_dim: + out = out.unsqueeze(0) + return out + + +class Siglip2SequenceVisionTransformer(nn.Module): + def __init__(self, config: PixelShuffleSiglip2VisionConfig): + super().__init__() + self.config = config + self.embeddings = Siglip2VariableSequenceEmbeddings(config) + self.encoder = IsaacEncoder(config) + self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pixel_shuffle_scale_factor = config.pixel_shuffle_scale_factor + + def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor]): + seq_patches, token_grids = packed_seq_patches + seq_sizes = torch.prod(token_grids, dim=-1) + + # Get embeddings from packed sequence + hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids)) + + # Add a pseudo batch dimension for the encoder + hidden_states = hidden_states.unsqueeze(0) + + # Generate cumulative sequence lengths for variable-length attention + cu_seqlens, max_seqlen = create_cumulative_seq_lengths(seq_sizes, hidden_states.device) + + # Pass through encoder with variable-length attention parameters + hidden_states, _, _ = self.encoder( + inputs_embeds=hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + + # Apply final layer normalization + hidden_states = self.post_layernorm(hidden_states) + + if self.pixel_shuffle_scale_factor > 1: + hidden_states = pixel_shuffle_varlen( + x=hidden_states, + token_grids=token_grids, + scale_factor=self.pixel_shuffle_scale_factor, + ) + # Remove the pseudo batch dimension we added earlier + hidden_states = hidden_states.squeeze(0) + + # Return the full sequence of embeddings + return hidden_states + + +# ============================================================================ +# Configuration +# ============================================================================ + +MAX_PIXELS = 60_000_000 # 60-megapixel ceiling ≈ 8200 × 7300 px + +# Vision preprocessing constants +VISION_MEAN = (0.5, 0.5, 0.5) +VISION_STD = (0.5, 0.5, 0.5) +VISION_SCALE = 1 / 255 + + +def _make_writeable(arr: np.ndarray) -> np.ndarray: + """Return *arr* itself if it is already writeable, otherwise try to flip the + write flag in-place and finally fall back to `arr.copy()`. + This guarantees the buffer handed to `torch.from_numpy()` is always + writeable, silencing the PyTorch warning about undefined behaviour. + """ + if arr.flags.writeable: + return arr + + # First, try the cheap path — in-place flag toggle (works for mmap'd arrays + # and some shared memory buffers): + try: + arr.setflags(write=True) + return arr # success: no data copy + except ValueError: + # Buffer is inherently read-only (e.g. 
backed by PyAV / PIL): make copy + return arr.copy() + + +def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None: + if image.width * image.height > MAX_PIXELS: + raise ValueError(f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`") + img = image if image.mode == "RGB" else image.convert("RGB") + arr = np.asarray(img) + arr = _make_writeable(arr) + return torch.from_numpy(arr) + + +def get_image_size_for_max_num_patches( + image_height: int, + image_width: int, + patch_size: int, + max_num_patches: int, + min_num_patches: int | None = None, + eps: float = 1e-5, + pixel_shuffle_scale: int = 1, +) -> tuple[int, int]: + r"""Compute a target resolution whose patch grid satisfies patching parametrization. + + Args: + image_height (`int`): + Height in pixels of the source image prior to any resizing. + image_width (`int`): + Width in pixels of the source image prior to any resizing. + patch_size (`int`): + Size of the square patch used by the vision encoder. + max_num_patches (`int`): + Upper bound on `(height / patch_size) * (width / patch_size)` after resizing. + min_num_patches (`int`, *optional*): + Lower bound on the number of patches. When provided the image will be scaled up if necessary. + eps (`float`, *optional*, defaults to 1e-5): + Convergence tolerance for the internal binary search to determing the target dimensions. + pixel_shuffle_scale (`int`, *optional*, defaults to 1): + Additional stride multiplier applied when pixel shuffle later reduces spatial resolution. + + Returns: + `tuple[int, int]`: Height and width (in pixels) that are multiples of `patch_size * pixel_shuffle_scale` + and respect both the maximum and optional minimum patch-count constraints. + """ + + def get_scaled_image_size(scale, original_size, patch_size, pixel_shuffle_scale): + scaled_size = scale * original_size + divisor = patch_size * pixel_shuffle_scale + scaled_size = math.ceil(scaled_size / divisor) * divisor + scaled_size = max(divisor, scaled_size) + return int(scaled_size) + + # Ensure divisibility + divisor = patch_size * pixel_shuffle_scale + adjusted_height = math.ceil(image_height / divisor) * divisor + adjusted_height = max(divisor, adjusted_height) + adjusted_width = math.ceil(image_width / divisor) * divisor + adjusted_width = max(divisor, adjusted_width) + + num_patches = (adjusted_height / patch_size) * (adjusted_width / patch_size) + + if min_num_patches is not None and num_patches < min_num_patches: + # Scale up + scale_min, scale_max = 1.0, 100.0 + while (scale_max - scale_min) >= eps: + scale = (scale_min + scale_max) / 2 + target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) + target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + num_patches = (target_height / patch_size) * (target_width / patch_size) + if num_patches >= min_num_patches: + scale_max = scale + else: + scale_min = scale + scale = scale_max + target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) + target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + return target_height, target_width + elif num_patches <= max_num_patches: + return adjusted_height, adjusted_width + else: + # Scale down + scale_min, scale_max = eps / 10, 1.0 + while (scale_max - scale_min) >= eps: + scale = (scale_min + scale_max) / 2 + target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) + target_width = get_scaled_image_size(scale, 
image_width, patch_size, pixel_shuffle_scale) + num_patches = (target_height / patch_size) * (target_width / patch_size) + if num_patches <= max_num_patches: + scale_min = scale + else: + scale_max = scale + scale = scale_min + target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) + target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + return target_height, target_width + + +_MEAN_TENSOR = torch.tensor(VISION_MEAN, dtype=torch.float32).view(1, 1, 1, -1) +_STD_TENSOR = torch.tensor(VISION_STD, dtype=torch.float32).view(1, 1, 1, -1) + + +def prepare_image_tensor( + image: torch.Tensor, + scale: float = VISION_SCALE, +) -> torch.Tensor: + r"""Standardize RGB images prior to patch extraction via rescaling and whitening. + + Args: + image (`torch.Tensor`): + Tensor with shape `(..., height, width, 3)` containing RGB values. The tensor is converted to floating + point if needed. + scale (`float`, *optional*, defaults to `VISION_SCALE`): + Scalar multiplier applied before normalization. + Returns: + `torch.Tensor`: Normalized tensor with the same shape as the input and dtype `torch.float32`. + """ + if not torch.is_floating_point(image): + image = image.float() + rescaled = image * scale + + # Use precomputed tensors and move to the correct device if needed + mean_tensor = _MEAN_TENSOR.to(image.device) + std_tensor = _STD_TENSOR.to(image.device) + + normalized = (rescaled - mean_tensor) / std_tensor + return normalized + + +def patchify_vision(image: torch.Tensor, patch_size: int) -> torch.Tensor: + r"""Convert normalized images into flattened ViT-style patches. + + Args: + image (`torch.Tensor`): + Tensor of shape `(num_images, height, width, channels)`. + patch_size (`int`): + Edge length of the square patches + + Returns: + `torch.Tensor`: + Patch tensor where each position stores the flattened pixels belonging to that patch. + + Raises: + ValueError: If `height` or `width` is not divisible by `patch_size`. + """ + num_images, height, width, channels = image.shape + if height % patch_size or width % patch_size: + raise ValueError(f"Dimensions of images {image.shape} are not divisible by patch_size={patch_size}.") + patches = image.reshape(num_images, height // patch_size, patch_size, width // patch_size, patch_size, channels) + patches = patches.permute(0, 1, 3, 2, 4, 5) + patches = patches.reshape(num_images, height // patch_size, width // patch_size, channels * patch_size * patch_size) + return patches + + +def process_vision_for_patches( + images: torch.Tensor, + patch_size: int, + max_num_patches: int, + min_num_patches: int | None = None, + pixel_shuffle_scale: int = 1, +) -> tuple[torch.Tensor, list[int]]: + r"""Resize, normalize, and patchify RGB images for the vision encoder. + + Args: + images (`torch.Tensor`): + Either `(height, width, channels)` for a single image or `(num_images, height, width, channels)` for a + batch. Channels are expected to be RGB. + patch_size (`int`): + Edge length of square patches; implictly controls resize grid granularity. + max_num_patches (`int`): + Maximum number of patches allowed after resizing. + min_num_patches (`int`, *optional*): + Minimum number of patches. If provided, the routine upsamples images as needed to satisfy the lower bound. + pixel_shuffle_scale (`int`, *optional*, defaults to 1): + pixel shuffle scale factor; influences the target grid that the function produces. 
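+
+    Example:
+        Shape-only sketch with assumed inputs (a 224x224 RGB image):
+
+        >>> img = torch.rand(224, 224, 3)
+        >>> patches, dims = process_vision_for_patches(img, patch_size=16, max_num_patches=256)
+        >>> patches.shape, dims
+        (torch.Size([1, 14, 14, 768]), [1, 14, 14])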
+ + Returns: + `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)` where `patches` has shape + `(num_images, target_h / patch_size, target_w / patch_size, channels * patch_size**2)` and `dims_virtual` + encodes effective `(images, height, width)` dimensions after optional pixel shuffling. + """ + # Add batch dim if single image + if images.dim() == 3: + images = images.unsqueeze(0) + + # Permute to channel first for resize + images = images.permute(0, 3, 1, 2) + + # Get target dimensions + _, _, orig_height, orig_width = images.shape + target_height, target_width = get_image_size_for_max_num_patches( + orig_height, + orig_width, + patch_size, + max_num_patches, + min_num_patches=min_num_patches, + pixel_shuffle_scale=pixel_shuffle_scale, + ) + + # Resize + images = F.interpolate( + images, + size=(target_height, target_width), + mode="bilinear", + align_corners=False, + ) + + # Back to channel last + images = images.permute(0, 2, 3, 1) + + # Normalize + images = prepare_image_tensor(images) + + # Patchify + patches = patchify_vision(images, patch_size=patch_size) + + # Calculate dimensions for the patches + n_images, h_patches, w_patches, _ = patches.shape + dims_virtual = ( + [1, h_patches, w_patches] + if pixel_shuffle_scale == 1 + else [1, h_patches // pixel_shuffle_scale, w_patches // pixel_shuffle_scale] + ) + + return patches, dims_virtual + + +class IsaacConfig(Qwen3Config): + """Configuration class for Isaac multimodal model.""" + + model_type = "isaac" + sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig} + + def __init__( + self, + vision_config=None, + vision_patch_size: int = 16, + vision_max_num_patches: int = 256, + vision_min_num_patches: int | None = None, + pixel_shuffle_scale: int = 1, + max_sequence_length: int = 16384, + vision_token: str = "<|image_pad|>", + **kwargs, + ): + super().__init__(**kwargs) + + # EventStreamProcessor parameters (for backward compatibility) + self.video_patch_size = vision_patch_size + self.vision_max_num_patches = vision_max_num_patches + self.vision_min_num_patches = vision_min_num_patches + self.pixel_shuffle_scale = pixel_shuffle_scale + + # Processing parameters + self.max_sequence_length = max_sequence_length + self.vision_token = vision_token + + # Handle vision config - PixelShuffleSiglip2VisionConfig instance + self.vision_config = PixelShuffleSiglip2VisionConfig( + pixel_shuffle_scale_factor=pixel_shuffle_scale, + num_patches=vision_max_num_patches, + ) + + +class IsaacImageProcessorKwargs(TypedDict, total=False): + patch_size: int + max_num_patches: int + min_num_patches: int + pixel_shuffle_scale: int + #merge_size: int # kept for parity with other processors that expose it + + +class IsaacImageProcessor: + + patch_size = 16 + max_num_patches = 6144 + min_num_patches = 256 + pixel_shuffle_scale = 2 + + valid_kwargs = IsaacImageProcessorKwargs + model_input_names = ["pixel_values", "image_grid_thw"] + + def __init__(self, kwargs): + self.patch_size = kwargs.pop("patch_size", self.patch_size) + self.vision_max_num_patches = kwargs.pop("vision_max_num_patches", self.max_num_patches) + self.vision_min_num_patches = kwargs.pop("vision_min_num_patches", self.min_num_patches) + self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2) + + def preprocess( + self, + images: list[torch.Tensor], + return_tensors: Optional[Union[str, TensorType]], + **kwargs: Unpack[IsaacImageProcessorKwargs], + ) -> BatchFeature: + """Isaac's resize → normalize → patchify → pack.""" + + all_pixel_values: list[torch.Tensor] = 
[] + all_image_grids: list[torch.Tensor] = [] + + for image in images: + image_tensor = extract_image_pil(image) + + patches, dims_virtual = process_vision_for_patches( + image_tensor, + patch_size=self.patch_size, + max_num_patches=self.vision_max_num_patches, + min_num_patches=self.vision_min_num_patches, + pixel_shuffle_scale=self.pixel_shuffle_scale, + ) + + # Isaac packs a dummy temporal dim for images + patches = patches.unsqueeze(1) # [N, T=1, Hp, Wp, D] + + hp, wp, dim = patches.shape[-3], patches.shape[-2], patches.shape[-1] + current_num_patches = hp * wp + pixel_values = patches.reshape(current_num_patches, dim) # [N_tokens, D] + + # Use real patch dimensions for image_grid_thw, not virtual dimensions + # This ensures the vision model receives correct grid info for pixel shuffle + dims_real = [1, hp, wp] # Real patch dimensions + image_grid_thw = torch.tensor(dims_real).unsqueeze(0) # [1, [T, H, W]] + + all_pixel_values.append(pixel_values) + all_image_grids.append(image_grid_thw) + + if all_pixel_values: + final_pixel_values = torch.cat(all_pixel_values, dim=0) + final_image_grids = torch.cat(all_image_grids, dim=0) + else: + final_pixel_values = torch.empty(0, 0) + final_image_grids = torch.empty(0, 3) + + return BatchFeature( + data={"pixel_values": final_pixel_values, "image_grid_thw": final_image_grids}, + tensor_type=return_tensors, + ) + + +class IsaacProcessor: + """Processor wrapper (tokenizer + IsaacImageProcessor).""" + + attributes = ["tokenizer"] + tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + self.image_processor = image_processor or IsaacImageProcessor(kwargs) + self.tokenizer = tokenizer + self.image_token = "<|image_pad|>" + + def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: + result = {} + + if text is not None: + result.update(self.tokenizer(text, **kwargs)) + if images is not None: + image_result = self.image_processor.preprocess(images, **kwargs) + result.update(image_result) + return BatchFeature(result) + + def apply_chat_template( + self, + messages: list[dict[str, Any]], + tokenize: bool = False, + add_generation_prompt: bool = False, + **kwargs, + ) -> Any: + # Convert mixed content messages to simple text format + processed_messages = [] + + for message in messages: + if "content" in message and isinstance(message["content"], list): + # Handle mixed content (text + image) + text_parts = [] + for content_item in message["content"]: + if content_item.get("type") == "text": + text_parts.append(content_item.get("text", "")) + elif content_item.get("type") == "image": + # Replace image with vision token + text_parts.append(self.image_token) + + processed_message = { + "role": message.get("role", "user"), + "content": "".join(text_parts) + } + processed_messages.append(processed_message) + else: + # Regular text message + processed_messages.append(message) + + return self.tokenizer.apply_chat_template( + processed_messages, tokenize=tokenize, add_generation_prompt=add_generation_prompt, **kwargs + ) + + +class IsaacProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self) -> IsaacConfig: + if hasattr(self.ctx, "get_hf_config"): + original_config = self.ctx.get_hf_config() + # Map HF config parameters to our vLLM config parameters + return IsaacConfig( + # Vision parameters - map from HF names + vision_config=getattr(original_config, "vision_config", None), + vision_patch_size=getattr(original_config, "video_patch_size", 16), + 
vision_max_num_patches=getattr(original_config, "vision_max_num_patches", 256), + vision_min_num_patches=getattr(original_config, "vision_min_num_patches", None), + pixel_shuffle_scale=getattr(original_config, "pixel_shuffle_scale", 1), + max_sequence_length=getattr(original_config, "max_sequence_length", 16384), + vision_token="<|image_pad|>", + ) + return IsaacConfig() + + def get_hf_processor(self, **kwargs) -> IsaacProcessor: + return self.ctx.get_hf_processor(IsaacProcessor, **kwargs) + + def get_tokenizer(self): + return self.ctx.tokenizer + + def get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + # Get target dimensions + target_height, target_width = get_image_size_for_max_num_patches( + 9999999, + 9999999, + hf_config.video_patch_size, + hf_config.vision_max_num_patches, + min_num_patches=hf_config.vision_min_num_patches, + pixel_shuffle_scale=hf_config.pixel_shuffle_scale, + ) + return ImageSize(width=target_width, height=target_height) + + def get_image_processor(self, **kwargs) -> IsaacImageProcessor: + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( + self, seq_len: int, mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + hf_config = self.get_hf_config() + num_vision_tokens = hf_config.vision_max_num_patches // (hf_config.pixel_shuffle_scale**2) + return {"image": num_vision_tokens} + + +class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + hf_processor = self.info.get_hf_processor() + image_token: str = hf_processor.image_token + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str] | None = None, + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = self.info.get_image_size_with_most_features() + image_overrides = mm_options.get("image") if mm_options else None + + return { + "image": self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images, + overrides=image_overrides, + ), + } + + +class IsaacMultiModalProcessor(BaseMultiModalProcessor): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + # Configure multimodal fields for Isaac model + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + image_grid_sizes = image_grid_thw.prod(-1) + + return { + "pixel_values": MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), + "image_grid_thw": MultiModalFieldConfig.batched("image"), + } + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + + #hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + + vocab = tokenizer.get_vocab() + placeholder_id = vocab.get("<|image_pad|>", 151655) + + pixel_shuffle_scale = getattr(image_processor, 'pixel_shuffle_scale', 2) + merge_length = pixel_shuffle_scale ** 2 + + def get_replacement_isaac(item_idx: int): + out_item = out_mm_kwargs["image"][item_idx] + grid_thw = 
out_item["image_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + + num_tokens = int(grid_thw.prod()) // merge_length + return [placeholder_id] * num_tokens + + return [ + PromptReplacement( + modality="image", + target=[placeholder_id], + replacement=get_replacement_isaac, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor( + IsaacMultiModalProcessor, + info=IsaacProcessingInfo, + dummy_inputs=IsaacDummyInputsBuilder, +) +class IsaacForConditionalGeneration( + Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE +): + + supports_encoder_tp_data = True + + # To ensure correct weight loading and mapping. + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "model.vision_embedding.": "vision_embedding.", + } + ) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + if modality.startswith("image"): + return "<|image_pad|>" + + raise ValueError("Only image modality is supported") + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): + + config: IsaacConfig = vllm_config.model_config.hf_config + head_dim = config.head_dim + + calculated_mrope_section = [ + head_dim // 4, # 2x more for temporal dim + head_dim // 8, + head_dim // 8, + ] + + config.rope_scaling["mrope_section"] = calculated_mrope_section + self.config = config + + # Initialize the parent class with updated config + super().__init__(vllm_config=vllm_config, prefix=prefix) + + # Create the language model module to match checkpoint structure + self.language_model = nn.ModuleDict({ + "embed_tokens": self.model.embed_tokens, + "layers": self.model.layers, + "norm": self.model.norm + }) + + vision_cfg = config.vision_config + if vision_cfg is None: + raise ValueError("IsaacConfig should always have vision_config") + + hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) + self.vision_embedding = nn.Sequential( + Siglip2SequenceVisionTransformer(vision_cfg), + nn.Linear( + hidden_dim, + 4 * hidden_dim, + bias=False, + ), + nn.SiLU(), + nn.Linear(4 * hidden_dim, config.hidden_size, bias=False), + ) + + def get_mrope_input_positions( + self, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: list[list[int]] | torch.Tensor, + video_grid_thw: list[list[int]] | torch.Tensor, + context_len: int = 0, + seq_len: int | None = None, + second_per_grid_ts: list[float] | None = None, + audio_feature_lengths: torch.Tensor | None = None, + use_audio_in_video: bool = False, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + + vision_token_id = getattr(self.config, 'image_token_id', 151655) + spatial_merge_size = hf_config.vision_config.pixel_shuffle_scale_factor + input_tokens_tensor = torch.tensor(input_tokens) + + # Find image token positions + image_positions = torch.where(input_tokens_tensor == vision_token_id)[0].tolist() + + # For text-only inputs, use Isaac's original logic from compute_position_ids_input_ids() + if len(image_positions) == 0: + seq_len = len(input_tokens) + # Create 3D positions where all dimensions get the same 1D temporal progression + position_ids = torch.arange(seq_len, dtype=torch.long) + position_ids = position_ids.view(1, -1).expand(1, -1) # [1, seq_len] + position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3) # [1, seq_len, 3] + + # vLLM expects shape [3, seq_len], so transpose + position_ids = position_ids.squeeze(0).transpose(0, 1) # [3, seq_len] + + return position_ids, 0 + + events = [] + image_idx = 0 + current_pos = 0 + 
last_processed_pos = -1 + + for image_pos in image_positions: + if image_pos <= last_processed_pos: + continue # Skip already processed positions + + # Add any text before this image + if image_pos > current_pos: + text_tokens = image_pos - current_pos + text_event = Event( + modality_type=TextType.text, + dims_virtual=[text_tokens, 1], + idx_range=(0, text_tokens), + ) + events.append(text_event) + + # Add image + t, h, w = image_grid_thw[image_idx] + llm_grid_h, llm_grid_w = h // spatial_merge_size, w // spatial_merge_size + image_tokens = t * llm_grid_h * llm_grid_w + + image_event = Event( + modality_type=VisionType.image, + dims_virtual=[t, llm_grid_h, llm_grid_w], + idx_range=(0, image_tokens), + ) + events.append(image_event) + + current_pos = image_pos + image_tokens + last_processed_pos = current_pos - 1 # Mark up to this position as processed + image_idx += 1 + + # Add final text segment if any + if current_pos < len(input_tokens): + text_tokens = len(input_tokens) - current_pos + text_event = Event( + modality_type=TextType.text, + dims_virtual=[text_tokens, 1], + idx_range=(0, text_tokens), + ) + events.append(text_event) + + stream = Stream(events) + tensor_stream = TensorStream([stream]) + + # Use Isaac's native MRoPE calculation + position_ids = compute_mrope_pos_tensor(tensor_stream, n_pos_dims=3) + + # Max position per batch across the 3 planes and sequence dimension: (B,) + m_per_batch = position_ids.amax(dim=(1, 2)) + + mrope_position_delta = (m_per_batch + 1 - len(input_tokens)).item() + + # vLLM expects shape [3, seq_len] but Isaac returns [batch, seq_len, 3] + # Transpose to match vLLM's expected format + position_ids = position_ids.squeeze(0).transpose(0, 1) + + return position_ids, mrope_position_delta + + def get_multimodal_embeddings( + self, **kwargs: object + ) -> MultiModalEmbeddings | None: + + pixel_values = kwargs.get("pixel_values") + image_grid_thw = kwargs.get("image_grid_thw") + + if pixel_values is None: + return [] + + # Convert image_grid_thw from [batch, 1, [T, H, W]] to [batch, [H, W]] + spatial_grids = image_grid_thw[:, 0, 1:3] # Extract H, W from [T, H, W] for each image + + # Process packed sequence patches through vision_embedding module + vision_embeddings = self.vision_embedding((pixel_values, spatial_grids)) + + # Split concatenated embeddings for each image item (following Qwen2-VL pattern) + merge_size = self.config.vision_config.pixel_shuffle_scale_factor # Isaac uses pixel shuffle + sizes = spatial_grids.prod(-1) // (merge_size * merge_size) # H * W / (merge_size^2) + + return vision_embeddings.split(sizes.tolist()) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + handle_oov_mm_token: bool = False, + ) -> torch.Tensor: + + # Get text embeddings from the base language model + inputs_embeds = super().get_input_embeddings(input_ids) + + # If we have multimodal embeddings, merge them with text embeddings + if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: + + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) + + return inputs_embeds + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + skip_prefixes = [] + if self.vision_embedding is None: + skip_prefixes.extend(["vision_embedding."]) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + 
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="vision_embedding.3", # The final linear layer + tower_model="vision_embedding", + ) From 0150444c7ca0664e794f29d02eab8533a8198475 Mon Sep 17 00:00:00 2001 From: oscardev256 <42308241+oscardev256@users.noreply.github.com> Date: Sat, 8 Nov 2025 23:15:17 -0500 Subject: [PATCH 2/8] Update registry.py Added Isaac model architecture. Signed-off-by: Oscar Gonzalez --- vllm/model_executor/models/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 494398760620..5a0d9e4f6bfa 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -301,6 +301,7 @@ "idefics3", "Idefics3ForConditionalGeneration", ), + "IsaacForConditionalGeneration": ("isaac", "IsaacForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"), # noqa: E501 "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"), "KeyeVL1_5ForConditionalGeneration": ( From 61121a49fda1d68fc8a4937cd9f0d9383cc8d459 Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Sat, 15 Nov 2025 01:00:01 -0500 Subject: [PATCH 3/8] Updated to use Siglip2Encoder defined in siglip2navit.py. Signed-off-by: Oscar Gonzalez --- vllm/model_executor/models/isaac.py | 434 ++++++++++++++-------------- 1 file changed, 222 insertions(+), 212 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 4f29d1cff347..f3a589faa163 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -29,6 +29,7 @@ WeightsMapper, AutoWeightsLoader, _merge_multimodal_embeddings, + maybe_prefix, ) from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -308,14 +309,6 @@ def __init__( self.num_patches = num_patches -def create_cumulative_seq_lengths(seq_sizes: torch.Tensor, device: torch.device) -> tuple[torch.Tensor, int]: - """Create cumulative sequence lengths for variable-length attention.""" - cu_seqlens = torch.zeros(len(seq_sizes) + 1, dtype=torch.int32, device=device) - cu_seqlens[1:] = seq_sizes.cumsum(0) - max_seqlen = int(seq_sizes.max().item()) if len(seq_sizes) > 0 else 0 - return cu_seqlens, max_seqlen - - class Siglip2VariableSequenceEmbeddings(nn.Module): def __init__(self, config: PixelShuffleSiglip2VisionConfig): super().__init__() @@ -380,7 +373,6 @@ def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Te pos_embeds = self.positional_embeddings(packed_seq_patches) # Flatten patch embeddings to match positional embeddings format - # From [batch, patches_per_image, embed_dim] to [total_patches, embed_dim] batch_size, patches_per_image, embed_dim = patch_embeds.shape # For variable-length attention, we need to reshape to (total_tokens, embed_dim) @@ -394,158 +386,6 @@ def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Te return embeddings -class Siglip2VariableLengthAttention(nn.Module): - """Custom attention that supports variable-length sequences with flash attention.""" - - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = 
config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) - self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - def forward(self, hidden_states, cu_seqlens=None, max_seqlen=None): - batch_size, seq_len, _ = hidden_states.size() - - # For variable-length attention, we need to reshape to (total_tokens, embed_dim) - if batch_size != 1: - raise ValueError("Variable-length attention expects batch_size=1 for packed sequences") - hidden_states = hidden_states.squeeze(0) # Remove batch dimension: (seq_len, embed_dim) - - # Store original dtype - orig_dtype = hidden_states.dtype - - # 1. Linear projections - Q = self.q_proj(hidden_states) # (seq_len, embed_dim) - K = self.k_proj(hidden_states) # (seq_len, embed_dim) - V = self.v_proj(hidden_states) # (seq_len, embed_dim) - - # 2. Reshape for multi-head attention: (seq_len, n_heads, head_dim) - Q = Q.view(-1, self.num_heads, self.embed_dim // self.num_heads) - K = K.view(-1, self.num_heads, self.embed_dim // self.num_heads) - V = V.view(-1, self.num_heads, self.embed_dim // self.num_heads) - - # 3. Apply variable-length attention using flash attention - attn_output, _, _, _, _ = torch.ops.aten._flash_attention_forward( - query=Q, - key=K, - value=V, - cum_seq_q=cu_seqlens, - cum_seq_k=cu_seqlens, - max_q=max_seqlen, - max_k=max_seqlen, - dropout_p=self.dropout if self.training else 0.0, - is_causal=False, - return_debug_mask=False, - scale=self.scale, - window_size_left=-1, - window_size_right=-1, - alibi_slopes=None, - ) - - # 4. Reshape attention output from (seq_len, n_heads, head_dim) to (seq_len, embed_dim) - attn_output = attn_output.reshape(seq_len, self.embed_dim) - - # 5. Convert back to original dtype if needed - if attn_output.dtype != orig_dtype: - attn_output = attn_output.to(orig_dtype) - - # 6. Project output - attn_output = self.out_proj(attn_output) # (seq_len, embed_dim) - - # 7. 
Add back batch dimension for compatibility - attn_output = attn_output.unsqueeze(0) # (1, seq_len, embed_dim) - - return attn_output, None - - -class IsaacSiglip2EncoderLayer(nn.Module): - """Siglip2 encoder layer with variable-length attention.""" - - def __init__(self, config: PixelShuffleSiglip2VisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = Siglip2VariableLengthAttention(config) - - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = Siglip2MLP(config) # Use HF's Siglip2MLP - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - cu_seqlens: torch.Tensor = None, - max_seqlen: int = None, - ) -> tuple[torch.FloatTensor]: - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - ) - - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - return (hidden_states,) - - -class IsaacEncoder(nn.Module): - """Encoder using Isaac encoder layers with variable-length attention support.""" - - def __init__(self, config: PixelShuffleSiglip2VisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([IsaacSiglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) - - def forward( - self, - inputs_embeds, - cu_seqlens: torch.Tensor | None = None, - max_seqlen: int | None = None, - output_hidden_states: bool = False, - ): - all_hidden_states = () if output_hidden_states else None - - hidden_states = inputs_embeds - - for encoder_layer in self.layers: - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = encoder_layer( - hidden_states, - cu_seqlens, - max_seqlen, - ) - - hidden_states = layer_outputs[0] - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - return hidden_states, all_hidden_states, None - - def create_pixel_shuffle_index_map( seq_sizes: torch.Tensor, token_grids: torch.Tensor, @@ -669,52 +509,6 @@ def pixel_shuffle_varlen( out = out.unsqueeze(0) return out - -class Siglip2SequenceVisionTransformer(nn.Module): - def __init__(self, config: PixelShuffleSiglip2VisionConfig): - super().__init__() - self.config = config - self.embeddings = Siglip2VariableSequenceEmbeddings(config) - self.encoder = IsaacEncoder(config) - self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.pixel_shuffle_scale_factor = config.pixel_shuffle_scale_factor - - def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor]): - seq_patches, token_grids = packed_seq_patches - seq_sizes = torch.prod(token_grids, dim=-1) - - # Get embeddings from packed sequence - hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids)) - - # Add a pseudo batch dimension for the encoder - hidden_states = hidden_states.unsqueeze(0) - - # Generate cumulative sequence lengths for variable-length attention - cu_seqlens, max_seqlen = create_cumulative_seq_lengths(seq_sizes, hidden_states.device) - - # Pass through encoder with variable-length attention parameters - hidden_states, _, _ = self.encoder( - inputs_embeds=hidden_states, - cu_seqlens=cu_seqlens, - max_seqlen=max_seqlen, - ) - - # Apply 
final layer normalization - hidden_states = self.post_layernorm(hidden_states) - - if self.pixel_shuffle_scale_factor > 1: - hidden_states = pixel_shuffle_varlen( - x=hidden_states, - token_grids=token_grids, - scale_factor=self.pixel_shuffle_scale_factor, - ) - # Remove the pseudo batch dimension we added earlier - hidden_states = hidden_states.squeeze(0) - - # Return the full sequence of embeddings - return hidden_states - - # ============================================================================ # Configuration # ============================================================================ @@ -1009,7 +803,6 @@ class IsaacImageProcessorKwargs(TypedDict, total=False): max_num_patches: int min_num_patches: int pixel_shuffle_scale: int - #merge_size: int # kept for parity with other processors that expose it class IsaacImageProcessor: @@ -1265,6 +1058,156 @@ def get_replacement_isaac(item_idx: int): ) ] +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.utils import is_pp_missing_parameter +from vllm.model_executor.models.siglip2navit import Siglip2VisionEmbeddings, Siglip2Encoder +from vllm.attention.backends.registry import _Backend +from vllm.model_executor.layers.quantization import QuantizationConfig + +class Siglip2VisionTransformer(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE +): + + is_pooling_model = True + + merge_by_field_config = True + + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__( + self, + config, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + use_data_parallel: bool = False, + attn_backend_override: _Backend | None = None, + ): + super().__init__() + self.config = config + self.quant_config = quant_config + embed_dim = config.hidden_size + + self.embeddings = Siglip2VariableSequenceEmbeddings(config) + self.pixel_shuffle_scale_factor = config.pixel_shuffle_scale_factor + self.encoder = Siglip2Encoder( + config, + quant_config=quant_config, + prefix=f"{prefix}.encoder", + use_data_parallel=use_data_parallel, + attn_backend_override=attn_backend_override, + ) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + packed_seq_patches: tuple[torch.Tensor, torch.Tensor], + ) -> torch.Tensor: + r""" + spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`): + Tensor containing the spatial dimensions (height, width) + of the input images. 
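        Note: in this wrapper the single `packed_seq_patches` argument is a
        `(seq_patches, token_grids)` tuple, where `seq_patches` packs the
        flattened patches of every image along dim 0 and each row of
        `token_grids` is that image's `(height, width)` patch grid (the
        `spatial_shapes` described above). For example, one image with a
        12x16 patch grid gives `seq_patches` of shape
        `(192, num_channels * patch_size**2)` and `token_grids = [[12, 16]]`.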
+ """ + + seq_patches, token_grids = packed_seq_patches + seq_sizes = torch.prod(token_grids, dim=-1) + + # Get embeddings from packed sequence + hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids)) + + grid_thws = torch.tensor([[1, token_grids[0][0].item(), token_grids[0][1].item()]]) + last_hidden_state = self.encoder(hidden_states, grid_thws) + hidden_states = self.post_layernorm(last_hidden_state) + + # Add a pseudo batch dimension for the encoder + hidden_states = hidden_states.unsqueeze(0) + + if self.pixel_shuffle_scale_factor > 1: + hidden_states = pixel_shuffle_varlen( + x=hidden_states, + token_grids=token_grids, + scale_factor=self.pixel_shuffle_scale_factor, + ) + # Remove the pseudo batch dimension we added earlier + hidden_states = hidden_states.squeeze(0) + + #return last_hidden_state + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + if weight_loader == default_weight_loader: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if is_pp_missing_parameter(name, self): + continue + print(f"qwen2: name={name}") + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params @MULTIMODAL_REGISTRY.register_processor( IsaacMultiModalProcessor, @@ -1274,13 +1217,24 @@ def get_replacement_isaac(item_idx: int): class IsaacForConditionalGeneration( Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } supports_encoder_tp_data = True # To ensure correct weight loading and mapping. 
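    # Roughly: checkpoint names are first prefix-remapped by the mapper below
    # (e.g. an illustrative "model.vision_embedding.<...>.q_proj.weight"
    # becomes "vision_embedding.<...>.q_proj.weight"), and the per-module
    # load_weights()/packed_modules_mapping then route the separate
    # q_proj/k_proj/v_proj (and gate_proj/up_proj) tensors into shards of the
    # fused qkv_proj/gate_up_proj parameters.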
hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ - "model.vision_embedding.": "vision_embedding.", + "model.vision_embedding.": "vision_embedding.", } ) @@ -1315,13 +1269,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): "norm": self.model.norm }) + config.vision_config.preserve_original_pe = True + config.vision_config.use_rope = False + config.vision_config.hidden_stride = config.vision_config.pixel_shuffle_scale_factor + config.vision_config.window_size = 32*2 + config.vision_config.fullatt_block_indexes = None vision_cfg = config.vision_config if vision_cfg is None: raise ValueError("IsaacConfig should always have vision_config") hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) self.vision_embedding = nn.Sequential( - Siglip2SequenceVisionTransformer(vision_cfg), + Siglip2VisionTransformer(vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding")), nn.Linear( hidden_dim, 4 * hidden_dim, @@ -1472,10 +1431,61 @@ def get_input_embeddings( return inputs_embeds + def merge_qkv_weights( + weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[tuple[str, torch.Tensor]]: + """Merge separate Q, K, V projection weights into QKV format.""" + + # Buffer to collect q, k, v weights for each layer + qkv_buffer = {} + + for name, tensor in weights: + # Check if this is a q/k/v projection weight + if '.q_proj.' in name or '.k_proj.' in name or '.v_proj.' in name: + # Extract the base name (everything before q/k/v_proj) + if '.q_proj.' in name: + base_name = name.replace('.q_proj.', '.qkv_proj.') + proj_type = 'q' + elif '.k_proj.' in name: + base_name = name.replace('.k_proj.', '.qkv_proj.') + proj_type = 'k' + else: # v_proj + base_name = name.replace('.v_proj.', '.qkv_proj.') + proj_type = 'v' + + # Store in buffer + if base_name not in qkv_buffer: + qkv_buffer[base_name] = {} + qkv_buffer[base_name][proj_type] = tensor + + # If we have all three (q, k, v), merge and yield + if len(qkv_buffer[base_name]) == 3: + q = qkv_buffer[base_name]['q'] + k = qkv_buffer[base_name]['k'] + v = qkv_buffer[base_name]['v'] + + # Concatenate along dim 0 for weight, dim agnostic for bias + merged = torch.cat([q, k, v], dim=0) + yield base_name, merged + + # Clear buffer + del qkv_buffer[base_name] + else: + # Pass through non-qkv weights unchanged + yield name, tensor + + # Check if any incomplete qkv sets remain + if qkv_buffer: + raise ValueError(f"Incomplete QKV weights found: {list(qkv_buffer.keys())}") + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes = [] - if self.vision_embedding is None: - skip_prefixes.extend(["vision_embedding."]) + #if self.vision_embedding is None: + # skip_prefixes.extend(["vision_embedding."]) + + # Usage: + #weights = self.merge_qkv_weights(weights) loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From e693c6cd6182ca69382657b07260b09a914340f7 Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Tue, 18 Nov 2025 02:01:35 -0500 Subject: [PATCH 4/8] Updated load_weight for Siglip2VisionTransformer Signed-off-by: Oscar Gonzalez --- vllm/model_executor/models/isaac.py | 160 +++++----------------------- 1 file changed, 27 insertions(+), 133 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index f3a589faa163..786b1fe4e6f1 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -18,9 +18,6 @@ from 
transformers import PretrainedConfig, Qwen3Config from transformers.image_processing_utils import BatchFeature from transformers.tokenization_utils import TensorType -from transformers.models.siglip2.modeling_siglip2 import ( - Siglip2MLP, -) from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -30,6 +27,7 @@ AutoWeightsLoader, _merge_multimodal_embeddings, maybe_prefix, + init_vllm_registered_model, ) from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -54,6 +52,15 @@ SupportsPP, ) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, +) +from vllm.model_executor.models.siglip2navit import Siglip2Encoder +from vllm.attention.backends.registry import _Backend +from vllm.model_executor.layers.quantization import QuantizationConfig + +from vllm.model_executor.layers.linear import ReplicatedLinear + # ===== TensorStream Compatibility Layer for Isaac MRoPE ===== # Minimal implementation of TensorStream classes needed for Isaac's 3D positional encoding @@ -316,9 +323,10 @@ def __init__(self, config: PixelShuffleSiglip2VisionConfig): self.embed_dim = config.hidden_size self.patch_size = config.patch_size - self.patch_embedding = nn.Linear( - in_features=config.num_channels * self.patch_size * self.patch_size, - out_features=self.embed_dim, + self.patch_embedding = ReplicatedLinear( + input_size=config.num_channels * self.patch_size * self.patch_size, + output_size=self.embed_dim, + return_bias=False, ) self.num_patches = config.num_patches @@ -1058,37 +1066,10 @@ def get_replacement_isaac(item_idx: int): ) ] -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) -from vllm.model_executor.models.utils import is_pp_missing_parameter -from vllm.model_executor.models.siglip2navit import Siglip2VisionEmbeddings, Siglip2Encoder -from vllm.attention.backends.registry import _Backend -from vllm.model_executor.layers.quantization import QuantizationConfig - -class Siglip2VisionTransformer(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE -): - - is_pooling_model = True - - merge_by_field_config = True - - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - +class Siglip2VisionTransformer(nn.Module): def __init__( self, - config, + config: PixelShuffleSiglip2VisionConfig, quant_config: QuantizationConfig | None = None, prefix: str = "", use_data_parallel: bool = False, @@ -1151,64 +1132,28 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) + params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() + for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - 
loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - if name.endswith("scale"): - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue + param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - if weight_loader == default_weight_loader: - weight_loader(param, loaded_weight) - else: - weight_loader(param, loaded_weight, shard_id) + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) break else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - if is_pp_missing_parameter(name, self): - continue - print(f"qwen2: name={name}") param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + @MULTIMODAL_REGISTRY.register_processor( IsaacMultiModalProcessor, info=IsaacProcessingInfo, @@ -1217,6 +1162,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: class IsaacForConditionalGeneration( Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): + packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1230,7 +1176,7 @@ class IsaacForConditionalGeneration( } supports_encoder_tp_data = True - + # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -1261,14 +1207,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): # Initialize the parent class with updated config super().__init__(vllm_config=vllm_config, prefix=prefix) - + # Create the language model module to match checkpoint structure self.language_model = nn.ModuleDict({ "embed_tokens": self.model.embed_tokens, "layers": self.model.layers, "norm": self.model.norm }) - + config.vision_config.preserve_original_pe = True config.vision_config.use_rope = False config.vision_config.hidden_stride = config.vision_config.pixel_shuffle_scale_factor @@ -1431,61 +1377,9 @@ def get_input_embeddings( return inputs_embeds - def merge_qkv_weights( - weights: Iterable[tuple[str, torch.Tensor]] - ) -> Iterable[tuple[str, torch.Tensor]]: - """Merge separate Q, K, V projection weights into QKV format.""" - - # Buffer to collect q, k, v weights for each layer - qkv_buffer = {} - - for name, tensor in weights: - # Check if this is a q/k/v projection weight - if '.q_proj.' in name or '.k_proj.' in name or '.v_proj.' in name: - # Extract the base name (everything before q/k/v_proj) - if '.q_proj.' in name: - base_name = name.replace('.q_proj.', '.qkv_proj.') - proj_type = 'q' - elif '.k_proj.' 
in name: - base_name = name.replace('.k_proj.', '.qkv_proj.') - proj_type = 'k' - else: # v_proj - base_name = name.replace('.v_proj.', '.qkv_proj.') - proj_type = 'v' - - # Store in buffer - if base_name not in qkv_buffer: - qkv_buffer[base_name] = {} - qkv_buffer[base_name][proj_type] = tensor - - # If we have all three (q, k, v), merge and yield - if len(qkv_buffer[base_name]) == 3: - q = qkv_buffer[base_name]['q'] - k = qkv_buffer[base_name]['k'] - v = qkv_buffer[base_name]['v'] - - # Concatenate along dim 0 for weight, dim agnostic for bias - merged = torch.cat([q, k, v], dim=0) - yield base_name, merged - - # Clear buffer - del qkv_buffer[base_name] - else: - # Pass through non-qkv weights unchanged - yield name, tensor - - # Check if any incomplete qkv sets remain - if qkv_buffer: - raise ValueError(f"Incomplete QKV weights found: {list(qkv_buffer.keys())}") - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes = [] - #if self.vision_embedding is None: - # skip_prefixes.extend(["vision_embedding."]) - - # Usage: - #weights = self.merge_qkv_weights(weights) + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From 9478531113777a1e4088ab4c8efd59ff880e93c8 Mon Sep 17 00:00:00 2001 From: Yang Date: Thu, 20 Nov 2025 15:39:18 -0800 Subject: [PATCH 5/8] org and add imports and fix lint error Signed-off-by: Yang --- vllm/model_executor/models/isaac.py | 477 +++++++++++++++++----------- 1 file changed, 295 insertions(+), 182 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 786b1fe4e6f1..5c61e5bf48a7 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -1,68 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations -from collections.abc import Mapping, Sequence, Iterable -from typing import Any, Optional, Union -from typing_extensions import TypedDict, Unpack - import itertools -from enum import Enum +import math +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass +from enum import Enum +from typing import Any -import math import numpy as np import PIL.Image import torch import torch.nn as nn import torch.nn.functional as F - from transformers import PretrainedConfig, Qwen3Config from transformers.image_processing_utils import BatchFeature -from transformers.tokenization_utils import TensorType from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig +from transformers.tokenization_utils import TensorType +from typing_extensions import TypedDict, Unpack -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.model_executor.models.interfaces import SupportsMultiModal +from vllm.attention.backends.registry import _Backend +from vllm.config import VllmConfig +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, +) +from vllm.model_executor.models.interfaces import ( + MultiModalEmbeddings, + SupportsLoRA, + SupportsMRoPE, + SupportsMultiModal, + SupportsPP, +) +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM +from vllm.model_executor.models.siglip2navit import 
Siglip2Encoder from vllm.model_executor.models.utils import ( - WeightsMapper, AutoWeightsLoader, + WeightsMapper, _merge_multimodal_embeddings, maybe_prefix, - init_vllm_registered_model, ) -from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM -from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFieldConfig, + MultiModalKwargs, +) +from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import ( BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, + PromptUpdate, ) -from vllm.multimodal.parse import MultiModalDataItems, ImageSize from vllm.multimodal.profiling import BaseDummyInputsBuilder -from vllm.multimodal.inputs import ( - MultiModalFieldConfig, - MultiModalKwargs, - MultiModalDataDict, -) -from vllm.config import VllmConfig -from vllm.model_executor.models.interfaces import ( - MultiModalEmbeddings, - SupportsLoRA, - SupportsMRoPE, - SupportsMultiModal, - SupportsPP, -) - -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, -) -from vllm.model_executor.models.siglip2navit import Siglip2Encoder -from vllm.attention.backends.registry import _Backend -from vllm.model_executor.layers.quantization import QuantizationConfig - -from vllm.model_executor.layers.linear import ReplicatedLinear # ===== TensorStream Compatibility Layer for Isaac MRoPE ===== -# Minimal implementation of TensorStream classes needed for Isaac's 3D positional encoding +# Minimal implementation of TensorStream classes needed for Isaac's 3D positional +# encoding + class ModalityType(Enum): """ @@ -127,37 +126,46 @@ class TextType(ModalityType): @dataclass class Event: """Represents a single modality event with spatial/temporal dimensions.""" + """ - Represents a single data occurrence (with a specific type, time interval, and data payload). + Represents a single data occurrence (with a specific type, time interval, and + data payload). Attributes: - data (Any): The actual data payload (e.g. a torch.Tensor, a string, etc.). - type (ModalityType): The modality type of the data (e.g., VisionType.image). - time (Tuple[float, float]): (start_time, end_time) indicating when this Event occurs. - role (Optional[str]): The role associated with this event (e.g., "user", "agent", "system"). - If None, the event is always included in loss calculation. + data (Any): The actual data payload (e.g. a torch.Tensor, a string, + etc.). + type (ModalityType): The modality type of the data (e.g., + VisionType.image). + time (Tuple[float, float]): (start_time, end_time) indicating when this + Event occurs. + role (Optional[str]): The role associated with this event (e.g., "user", + "agent", "system"). If None, the event is always included in loss + calculation. Example usage: evt = Event(data=torch.zeros((1, 224, 224, 3)), # e.g. a single image frame type=VisionType.image, time=(0.0, 0.04), role="user") - """ + """ # Descriptors modality_type: ModalityType - + # Structure - dims_virtual: list[int] | None = None # virtual/processed dimensions (e.g., pixel-shuffled) + dims_virtual: list[int] | None = ( + None # virtual/processed dimensions (e.g., pixel-shuffled) + ) dims_real: list[int] | None = None # real/actual tensor dimensions idx_range: tuple[int, int] | None = None - + def dims(self, virtual: bool = True) -> list[int] | None: """ Get the dimensions of this event. 
Args: - virtual: If True (default), return virtual/processed dimensions (e.g., pixel-shuffled). - If False, return real/actual tensor dimensions. + virtual: If True (default), return virtual/processed dimensions + (e.g., pixel-shuffled). If False, return real/actual tensor + dimensions. Returns: Dimensions list or None if not measured. @@ -171,7 +179,9 @@ def num_tokens(self, partial=True, virtual=True) -> int: if not virtual: assert partial is False and isinstance(self.data, torch.Tensor) return math.prod(self.dims(virtual=False)) - return self.idx_range[1] - self.idx_range[0] if partial else math.prod(self.dims()) + return ( + self.idx_range[1] - self.idx_range[0] if partial else math.prod(self.dims()) + ) @dataclass @@ -215,7 +225,8 @@ def __iter__(self): yield from self.events -# TODO: implement all types of cool indexing which can happen since TensorStream assuems Event.data = Tensor +# TODO: implement all types of cool indexing which can happen since TensorStream +# assumes Event.data = Tensor @dataclass class TensorStream: streams: list[Stream] @@ -254,7 +265,8 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten cumulative_offset = 0 # running time index for this stream for event in stream: - # --- build coordinate grid for THIS event using itertools (no tensor ops) --- + # --- build coordinate grid for THIS event using itertools + # (no tensor ops) --- dims = (event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or [])) # Create ranges for each dimension (similar to old _finalize implementation) @@ -274,26 +286,30 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten # Convert to tensor and reshape to (B, T, n_pos_dims) B, T = ts.shape - return torch.tensor(all_coords, dtype=torch.long, device=ts.device).reshape(B, T, n_pos_dims) + return torch.tensor(all_coords, dtype=torch.long, device=ts.device).reshape( + B, T, n_pos_dims + ) def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor: """Create boolean mask for specific modality type in the tensor stream.""" B, T = ts.shape mask = torch.zeros((B, T), dtype=torch.bool, device=ts.device) - + for batch_idx, stream in enumerate(ts.streams): seq_idx = 0 for event in stream: if event.modality_type == modality_type: start, end = event.idx_range - mask[batch_idx, seq_idx:seq_idx+(end-start)] = True - seq_idx += (event.idx_range[1] - event.idx_range[0]) - + mask[batch_idx, seq_idx : seq_idx + (end - start)] = True + seq_idx += event.idx_range[1] - event.idx_range[0] + return mask + # ===== End TensorStream Compatibility Layer ===== + class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig): """Vision configuration for Isaac with Pixel Shuffle support. 
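A minimal, self-contained sketch of the 3-D position scheme the helpers above
encode: the text-only path in get_mrope_input_positions gives every axis the
same 1-D progression, while each image event contributes a full (t, h, w)
coordinate grid. The helper name toy_mrope_positions and the offset
arithmetic are illustrative only; the real bookkeeping lives in
compute_mrope_pos_tensor.

import itertools

import torch


def toy_mrope_positions(
    text_len: int, image_dims: tuple[int, int, int]
) -> torch.Tensor:
    """Illustrative 3-D positions: `text_len` text tokens, then one image."""
    coords: list[tuple[int, int, int]] = []
    # Text segment: all three axes carry the same 1-D progression.
    coords += [(i, i, i) for i in range(text_len)]
    # Image event: enumerate its (t, h, w) grid, offset past the text span.
    t, h, w = image_dims
    coords += [
        (text_len + ti, text_len + hi, text_len + wi)
        for ti, hi, wi in itertools.product(range(t), range(h), range(w))
    ]
    return torch.tensor(coords, dtype=torch.long)  # shape (seq_len, 3)


# e.g. 4 text tokens followed by a 1x2x3 patch grid -> 10 position triples
print(toy_mrope_positions(4, (1, 2, 3)))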
@@ -338,7 +354,9 @@ def positional_embeddings( ) -> torch.Tensor: # Prepare positional embeddings grid: (1, embed_dim, h, w) positional_embeddings = ( - self.position_embedding.weight.reshape(self.position_embedding_size, self.position_embedding_size, -1) + self.position_embedding.weight.reshape( + self.position_embedding_size, self.position_embedding_size, -1 + ) .permute(2, 0, 1) .unsqueeze(0) ) @@ -359,12 +377,16 @@ def positional_embeddings( align_corners=align_corners, antialias=antialias, ) - # Reshape from (1, embed_dim, height, width) to (height*width, embed_dim) - resized_pos_embed = resized_pos_embed.reshape(self.embed_dim, height * width).transpose(0, 1) + # Reshape from (1, embed_dim, height, width) to + # (height*width, embed_dim) + resized_pos_embed = resized_pos_embed.reshape( + self.embed_dim, height * width + ).transpose(0, 1) else: # Fallback - should never happen in practice resized_pos_embed = positional_embeddings.reshape( - self.embed_dim, self.position_embedding_size * self.position_embedding_size + self.embed_dim, + self.position_embedding_size * self.position_embedding_size, ).transpose(0, 1)[: height * width] pos_embeds_list.append(resized_pos_embed) @@ -372,7 +394,9 @@ def positional_embeddings( pos_embeds = torch.cat(pos_embeds_list, dim=0) return pos_embeds - def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Tensor]): + def forward( + self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Tensor] + ): seq_patches, _seq_sizes, _spatial_shapes = packed_seq_patches # Apply patch embeddings @@ -385,7 +409,9 @@ def forward(self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor, torch.Te # For variable-length attention, we need to reshape to (total_tokens, embed_dim) if batch_size != 1: - raise ValueError("Variable-length attention expects batch_size=1 for packed sequences") + raise ValueError( + "Variable-length attention expects batch_size=1 for packed sequences" + ) patch_embeds = patch_embeds.view(batch_size * patches_per_image, embed_dim) @@ -427,11 +453,13 @@ def create_pixel_shuffle_index_map( # Safety: all spatial dims must be divisible by r # Cannot run under torch compile fullgraph mode hence - if not torch.compiler.is_compiling(): - if not ((token_grids[:, 0] % r == 0).all() and (token_grids[:, 1] % r == 0).all()): - raise AssertionError( - f"Every (H,W) in `token_grids` must be divisible by scale_factor={r}, got {token_grids.tolist()}" - ) + if not torch.compiler.is_compiling() and not ( + (token_grids[:, 0] % r == 0).all() and (token_grids[:, 1] % r == 0).all() + ): + raise AssertionError( + "Every (H,W) in `token_grids` must be divisible by " + f"scale_factor={r}, got {token_grids.tolist()}" + ) gather_chunks: list[torch.Tensor] = [] tok_offset = 0 @@ -467,19 +495,23 @@ def pixel_shuffle_varlen( Args: x (`torch.Tensor`): - Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or `(1, seq_len, hidden_size)` shapes - produced by stacking image patches. + Concatenated vision embeddings. Accepts `(seq_len, hidden_size)` or + `(1, seq_len, hidden_size)` shapes produced by stacking image + patches. token_grids (`torch.Tensor`): - Integer tensor of shape `(num_images, 2)` whose rows give the `(height, width)` patch grid sizes - corresponding to each image segment inside `x`. + Integer tensor of shape `(num_images, 2)` whose rows give the + `(height, width)` patch grid sizes corresponding to each image + segment inside `x`. 
scale_factor (`int`, *optional*, defaults to 1): - Spatial down-sampling factor specific to pixel shuffle. Values greater than one merge `scale_factor**2` neighboring patches into a + Spatial down-sampling factor specific to pixel shuffle. Values + greater than one merge `scale_factor**2` neighboring patches into a single embedding channel-group. Returns: - `torch.Tensor`: Pixel-shuffled embeddings with shape matching the input convention: - `(seq_len, hidden_size * scale_factor**2)` when the input was 2D, or `(1, seq_len, hidden_size * scale_factor**2)` - if the singleton batch dimension was present. + `torch.Tensor`: Pixel-shuffled embeddings with shape matching the input + convention: `(seq_len, hidden_size * scale_factor**2)` when the input + was 2D, or `(1, seq_len, hidden_size * scale_factor**2)` if the + singleton batch dimension was present. Raises: ValueError: If more than one batch item is provided. @@ -517,6 +549,7 @@ def pixel_shuffle_varlen( out = out.unsqueeze(0) return out + # ============================================================================ # Configuration # ============================================================================ @@ -550,7 +583,9 @@ def _make_writeable(arr: np.ndarray) -> np.ndarray: def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None: if image.width * image.height > MAX_PIXELS: - raise ValueError(f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`") + raise ValueError( + f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`" + ) img = image if image.mode == "RGB" else image.convert("RGB") arr = np.asarray(img) arr = _make_writeable(arr) @@ -576,17 +611,22 @@ def get_image_size_for_max_num_patches( patch_size (`int`): Size of the square patch used by the vision encoder. max_num_patches (`int`): - Upper bound on `(height / patch_size) * (width / patch_size)` after resizing. + Upper bound on `(height / patch_size) * (width / patch_size)` after + resizing. min_num_patches (`int`, *optional*): - Lower bound on the number of patches. When provided the image will be scaled up if necessary. + Lower bound on the number of patches. When provided the image will + be scaled up if necessary. eps (`float`, *optional*, defaults to 1e-5): - Convergence tolerance for the internal binary search to determing the target dimensions. + Convergence tolerance for the internal binary search to determine + the target dimensions. pixel_shuffle_scale (`int`, *optional*, defaults to 1): - Additional stride multiplier applied when pixel shuffle later reduces spatial resolution. + Additional stride multiplier applied when pixel shuffle later + reduces spatial resolution. Returns: - `tuple[int, int]`: Height and width (in pixels) that are multiples of `patch_size * pixel_shuffle_scale` - and respect both the maximum and optional minimum patch-count constraints. + `tuple[int, int]`: Height and width (in pixels) that are multiples of + `patch_size * pixel_shuffle_scale` and respect both the maximum and + optional minimum patch-count constraints. 
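    Example (illustrative; the keyword names follow the Args above, and the
    two leading height/width arguments are assumed to be positional):

        height, width = get_image_size_for_max_num_patches(
            900, 1200,
            patch_size=16,
            max_num_patches=6144,
            min_num_patches=256,
            pixel_shuffle_scale=2,
        )
        # Both results are multiples of 16 * 2 = 32, and
        # (height // 16) * (width // 16) falls within [256, 6144].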
""" def get_scaled_image_size(scale, original_size, patch_size, pixel_shuffle_scale): @@ -610,16 +650,24 @@ def get_scaled_image_size(scale, original_size, patch_size, pixel_shuffle_scale) scale_min, scale_max = 1.0, 100.0 while (scale_max - scale_min) >= eps: scale = (scale_min + scale_max) / 2 - target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) - target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + target_height = get_scaled_image_size( + scale, image_height, patch_size, pixel_shuffle_scale + ) + target_width = get_scaled_image_size( + scale, image_width, patch_size, pixel_shuffle_scale + ) num_patches = (target_height / patch_size) * (target_width / patch_size) if num_patches >= min_num_patches: scale_max = scale else: scale_min = scale scale = scale_max - target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) - target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + target_height = get_scaled_image_size( + scale, image_height, patch_size, pixel_shuffle_scale + ) + target_width = get_scaled_image_size( + scale, image_width, patch_size, pixel_shuffle_scale + ) return target_height, target_width elif num_patches <= max_num_patches: return adjusted_height, adjusted_width @@ -628,16 +676,24 @@ def get_scaled_image_size(scale, original_size, patch_size, pixel_shuffle_scale) scale_min, scale_max = eps / 10, 1.0 while (scale_max - scale_min) >= eps: scale = (scale_min + scale_max) / 2 - target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) - target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + target_height = get_scaled_image_size( + scale, image_height, patch_size, pixel_shuffle_scale + ) + target_width = get_scaled_image_size( + scale, image_width, patch_size, pixel_shuffle_scale + ) num_patches = (target_height / patch_size) * (target_width / patch_size) if num_patches <= max_num_patches: scale_min = scale else: scale_max = scale scale = scale_min - target_height = get_scaled_image_size(scale, image_height, patch_size, pixel_shuffle_scale) - target_width = get_scaled_image_size(scale, image_width, patch_size, pixel_shuffle_scale) + target_height = get_scaled_image_size( + scale, image_height, patch_size, pixel_shuffle_scale + ) + target_width = get_scaled_image_size( + scale, image_width, patch_size, pixel_shuffle_scale + ) return target_height, target_width @@ -653,12 +709,13 @@ def prepare_image_tensor( Args: image (`torch.Tensor`): - Tensor with shape `(..., height, width, 3)` containing RGB values. The tensor is converted to floating - point if needed. + Tensor with shape `(..., height, width, 3)` containing RGB values. + The tensor is converted to floating point if needed. scale (`float`, *optional*, defaults to `VISION_SCALE`): Scalar multiplier applied before normalization. Returns: - `torch.Tensor`: Normalized tensor with the same shape as the input and dtype `torch.float32`. + `torch.Tensor`: Normalized tensor with the same shape as the input and + dtype `torch.float32`. """ if not torch.is_floating_point(image): image = image.float() @@ -683,17 +740,33 @@ def patchify_vision(image: torch.Tensor, patch_size: int) -> torch.Tensor: Returns: `torch.Tensor`: - Patch tensor where each position stores the flattened pixels belonging to that patch. + Patch tensor where each position stores the flattened pixels + belonging to that patch. 
Raises: ValueError: If `height` or `width` is not divisible by `patch_size`. """ num_images, height, width, channels = image.shape if height % patch_size or width % patch_size: - raise ValueError(f"Dimensions of images {image.shape} are not divisible by patch_size={patch_size}.") - patches = image.reshape(num_images, height // patch_size, patch_size, width // patch_size, patch_size, channels) + raise ValueError( + "Dimensions of images " + f"{image.shape} are not divisible by patch_size={patch_size}." + ) + patches = image.reshape( + num_images, + height // patch_size, + patch_size, + width // patch_size, + patch_size, + channels, + ) patches = patches.permute(0, 1, 3, 2, 4, 5) - patches = patches.reshape(num_images, height // patch_size, width // patch_size, channels * patch_size * patch_size) + patches = patches.reshape( + num_images, + height // patch_size, + width // patch_size, + channels * patch_size * patch_size, + ) return patches @@ -708,21 +781,26 @@ def process_vision_for_patches( Args: images (`torch.Tensor`): - Either `(height, width, channels)` for a single image or `(num_images, height, width, channels)` for a - batch. Channels are expected to be RGB. + Either `(height, width, channels)` for a single image or + `(num_images, height, width, channels)` for a batch. Channels are + expected to be RGB. patch_size (`int`): Edge length of square patches; implictly controls resize grid granularity. max_num_patches (`int`): Maximum number of patches allowed after resizing. min_num_patches (`int`, *optional*): - Minimum number of patches. If provided, the routine upsamples images as needed to satisfy the lower bound. + Minimum number of patches. If provided, the routine upsamples images + as needed to satisfy the lower bound. pixel_shuffle_scale (`int`, *optional*, defaults to 1): - pixel shuffle scale factor; influences the target grid that the function produces. + Pixel shuffle scale factor; influences the target grid that the + function produces. Returns: - `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)` where `patches` has shape - `(num_images, target_h / patch_size, target_w / patch_size, channels * patch_size**2)` and `dims_virtual` - encodes effective `(images, height, width)` dimensions after optional pixel shuffling. + `tuple[torch.Tensor, list[int]]`: A pair `(patches, dims_virtual)` + where `patches` has shape `(num_images, target_h / patch_size, target_w + / patch_size, channels * patch_size**2)` and `dims_virtual` encodes + effective `(images, height, width)` dimensions after optional pixel + shuffling. 
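        For example, a single 512x512 RGB image with `patch_size=16`,
        `max_num_patches=6144` and `pixel_shuffle_scale=2` should keep its
        size (a 32x32 patch grid, already a multiple of 32 per side), giving
        `patches` of shape `(1, 32, 32, 768)` and `dims_virtual` equal to the
        pixel-shuffled grid `[1, 16, 16]`.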
""" # Add batch dim if single image if images.dim() == 3: @@ -788,7 +866,7 @@ def __init__( **kwargs, ): super().__init__(**kwargs) - + # EventStreamProcessor parameters (for backward compatibility) self.video_patch_size = vision_patch_size self.vision_max_num_patches = vision_max_num_patches @@ -814,7 +892,6 @@ class IsaacImageProcessorKwargs(TypedDict, total=False): class IsaacImageProcessor: - patch_size = 16 max_num_patches = 6144 min_num_patches = 256 @@ -825,14 +902,18 @@ class IsaacImageProcessor: def __init__(self, kwargs): self.patch_size = kwargs.pop("patch_size", self.patch_size) - self.vision_max_num_patches = kwargs.pop("vision_max_num_patches", self.max_num_patches) - self.vision_min_num_patches = kwargs.pop("vision_min_num_patches", self.min_num_patches) + self.vision_max_num_patches = kwargs.pop( + "vision_max_num_patches", self.max_num_patches + ) + self.vision_min_num_patches = kwargs.pop( + "vision_min_num_patches", self.min_num_patches + ) self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2) def preprocess( self, images: list[torch.Tensor], - return_tensors: Optional[Union[str, TensorType]], + return_tensors: str | TensorType | None, **kwargs: Unpack[IsaacImageProcessorKwargs], ) -> BatchFeature: """Isaac's resize → normalize → patchify → pack.""" @@ -840,9 +921,9 @@ def preprocess( all_pixel_values: list[torch.Tensor] = [] all_image_grids: list[torch.Tensor] = [] - for image in images: + for image in images: image_tensor = extract_image_pil(image) - + patches, dims_virtual = process_vision_for_patches( image_tensor, patch_size=self.patch_size, @@ -874,7 +955,10 @@ def preprocess( final_image_grids = torch.empty(0, 3) return BatchFeature( - data={"pixel_values": final_pixel_values, "image_grid_thw": final_image_grids}, + data={ + "pixel_values": final_pixel_values, + "image_grid_thw": final_image_grids, + }, tensor_type=return_tensors, ) @@ -899,7 +983,7 @@ def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: image_result = self.image_processor.preprocess(images, **kwargs) result.update(image_result) return BatchFeature(result) - + def apply_chat_template( self, messages: list[dict[str, Any]], @@ -909,7 +993,7 @@ def apply_chat_template( ) -> Any: # Convert mixed content messages to simple text format processed_messages = [] - + for message in messages: if "content" in message and isinstance(message["content"], list): # Handle mixed content (text + image) @@ -920,23 +1004,25 @@ def apply_chat_template( elif content_item.get("type") == "image": # Replace image with vision token text_parts.append(self.image_token) - + processed_message = { "role": message.get("role", "user"), - "content": "".join(text_parts) + "content": "".join(text_parts), } processed_messages.append(processed_message) else: # Regular text message processed_messages.append(message) - + return self.tokenizer.apply_chat_template( - processed_messages, tokenize=tokenize, add_generation_prompt=add_generation_prompt, **kwargs + processed_messages, + tokenize=tokenize, + add_generation_prompt=add_generation_prompt, + **kwargs, ) class IsaacProcessingInfo(BaseProcessingInfo): - def get_hf_config(self) -> IsaacConfig: if hasattr(self.ctx, "get_hf_config"): original_config = self.ctx.get_hf_config() @@ -945,10 +1031,16 @@ def get_hf_config(self) -> IsaacConfig: # Vision parameters - map from HF names vision_config=getattr(original_config, "vision_config", None), vision_patch_size=getattr(original_config, "video_patch_size", 16), - vision_max_num_patches=getattr(original_config, 
"vision_max_num_patches", 256), - vision_min_num_patches=getattr(original_config, "vision_min_num_patches", None), + vision_max_num_patches=getattr( + original_config, "vision_max_num_patches", 256 + ), + vision_min_num_patches=getattr( + original_config, "vision_min_num_patches", None + ), pixel_shuffle_scale=getattr(original_config, "pixel_shuffle_scale", 1), - max_sequence_length=getattr(original_config, "max_sequence_length", 16384), + max_sequence_length=getattr( + original_config, "max_sequence_length", 16384 + ), vision_token="<|image_pad|>", ) return IsaacConfig() @@ -975,18 +1067,22 @@ def get_image_size_with_most_features(self) -> ImageSize: def get_image_processor(self, **kwargs) -> IsaacImageProcessor: return self.get_hf_processor(**kwargs).image_processor - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} def get_mm_max_tokens_per_item( - self, seq_len: int, mm_counts: Mapping[str, int], + self, + seq_len: int, + mm_counts: Mapping[str, int], ) -> Mapping[str, int]: hf_config = self.get_hf_config() - num_vision_tokens = hf_config.vision_max_num_patches // (hf_config.pixel_shuffle_scale**2) + num_vision_tokens = hf_config.vision_max_num_patches // ( + hf_config.pixel_shuffle_scale**2 + ) return {"image": num_vision_tokens} -class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): +class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) @@ -1017,19 +1113,19 @@ def get_dummy_mm_data( class IsaacMultiModalProcessor(BaseMultiModalProcessor): - def _get_mm_fields_config( self, hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - # Configure multimodal fields for Isaac model image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) image_grid_sizes = image_grid_thw.prod(-1) return { - "pixel_values": MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), + "pixel_values": MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes + ), "image_grid_thw": MultiModalFieldConfig.batched("image"), } @@ -1039,24 +1135,23 @@ def _get_prompt_updates( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: - - #hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + # hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() vocab = tokenizer.get_vocab() placeholder_id = vocab.get("<|image_pad|>", 151655) - - pixel_shuffle_scale = getattr(image_processor, 'pixel_shuffle_scale', 2) - merge_length = pixel_shuffle_scale ** 2 - + + pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2) + merge_length = pixel_shuffle_scale**2 + def get_replacement_isaac(item_idx: int): out_item = out_mm_kwargs["image"][item_idx] grid_thw = out_item["image_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) num_tokens = int(grid_thw.prod()) // merge_length - return [placeholder_id] * num_tokens + return [placeholder_id] * num_tokens return [ PromptReplacement( @@ -1066,6 +1161,7 @@ def get_replacement_isaac(item_idx: int): ) ] + class Siglip2VisionTransformer(nn.Module): def __init__( self, @@ -1107,7 +1203,9 @@ def forward( # Get embeddings from packed sequence hidden_states = 
self.embeddings((seq_patches, seq_sizes, token_grids)) - grid_thws = torch.tensor([[1, token_grids[0][0].item(), token_grids[0][1].item()]]) + grid_thws = torch.tensor( + [[1, token_grids[0][0].item(), token_grids[0][1].item()]] + ) last_hidden_state = self.encoder(hidden_states, grid_thws) hidden_states = self.post_layernorm(last_hidden_state) @@ -1123,7 +1221,7 @@ def forward( # Remove the pseudo batch dimension we added earlier hidden_states = hidden_states.squeeze(0) - #return last_hidden_state + # return last_hidden_state return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -1160,9 +1258,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: dummy_inputs=IsaacDummyInputsBuilder, ) class IsaacForConditionalGeneration( - Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE + Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): - packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1176,11 +1273,11 @@ class IsaacForConditionalGeneration( } supports_encoder_tp_data = True - + # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ - "model.vision_embedding.": "vision_embedding.", + "model.vision_embedding.": "vision_embedding.", } ) @@ -1188,11 +1285,10 @@ class IsaacForConditionalGeneration( def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): return "<|image_pad|>" - + raise ValueError("Only image modality is supported") def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): - config: IsaacConfig = vllm_config.model_config.hf_config head_dim = config.head_dim @@ -1207,18 +1303,22 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): # Initialize the parent class with updated config super().__init__(vllm_config=vllm_config, prefix=prefix) - + # Create the language model module to match checkpoint structure - self.language_model = nn.ModuleDict({ - "embed_tokens": self.model.embed_tokens, - "layers": self.model.layers, - "norm": self.model.norm - }) - + self.language_model = nn.ModuleDict( + { + "embed_tokens": self.model.embed_tokens, + "layers": self.model.layers, + "norm": self.model.norm, + } + ) + config.vision_config.preserve_original_pe = True config.vision_config.use_rope = False - config.vision_config.hidden_stride = config.vision_config.pixel_shuffle_scale_factor - config.vision_config.window_size = 32*2 + config.vision_config.hidden_stride = ( + config.vision_config.pixel_shuffle_scale_factor + ) + config.vision_config.window_size = 32 * 2 config.vision_config.fullatt_block_indexes = None vision_cfg = config.vision_config if vision_cfg is None: @@ -1226,7 +1326,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) self.vision_embedding = nn.Sequential( - Siglip2VisionTransformer(vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding")), + Siglip2VisionTransformer( + vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding") + ), nn.Linear( hidden_dim, 4 * hidden_dim, @@ -1250,26 +1352,32 @@ def get_mrope_input_positions( ) -> tuple[torch.Tensor, int]: """Get mrope input positions and delta value.""" - vision_token_id = getattr(self.config, 'image_token_id', 151655) + vision_token_id = getattr(self.config, "image_token_id", 151655) spatial_merge_size = hf_config.vision_config.pixel_shuffle_scale_factor 
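        # The loop below rebuilds the prompt as Events: each run of text
        # tokens becomes a 1-D TextType event, and each image placeholder run
        # becomes a VisionType event with dims [t, h // merge, w // merge],
        # where merge is the pixel-shuffle factor. The resulting TensorStream
        # is then converted (via the helpers defined above) into the
        # [3, seq_len] position tensor that vLLM expects.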
input_tokens_tensor = torch.tensor(input_tokens) - + # Find image token positions - image_positions = torch.where(input_tokens_tensor == vision_token_id)[0].tolist() - - # For text-only inputs, use Isaac's original logic from compute_position_ids_input_ids() + image_positions = torch.where(input_tokens_tensor == vision_token_id)[ + 0 + ].tolist() + + # For text-only inputs, use Isaac's original logic from + # compute_position_ids_input_ids() if len(image_positions) == 0: seq_len = len(input_tokens) - # Create 3D positions where all dimensions get the same 1D temporal progression + # Create 3D positions where all dimensions get the same 1D temporal + # progression position_ids = torch.arange(seq_len, dtype=torch.long) position_ids = position_ids.view(1, -1).expand(1, -1) # [1, seq_len] - position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3) # [1, seq_len, 3] + position_ids = position_ids.unsqueeze(2).expand( + -1, -1, 3 + ) # [1, seq_len, 3] # vLLM expects shape [3, seq_len], so transpose position_ids = position_ids.squeeze(0).transpose(0, 1) # [3, seq_len] - + return position_ids, 0 - + events = [] image_idx = 0 current_pos = 0 @@ -1278,7 +1386,7 @@ def get_mrope_input_positions( for image_pos in image_positions: if image_pos <= last_processed_pos: continue # Skip already processed positions - + # Add any text before this image if image_pos > current_pos: text_tokens = image_pos - current_pos @@ -1288,21 +1396,23 @@ def get_mrope_input_positions( idx_range=(0, text_tokens), ) events.append(text_event) - + # Add image t, h, w = image_grid_thw[image_idx] llm_grid_h, llm_grid_w = h // spatial_merge_size, w // spatial_merge_size image_tokens = t * llm_grid_h * llm_grid_w - + image_event = Event( modality_type=VisionType.image, dims_virtual=[t, llm_grid_h, llm_grid_w], idx_range=(0, image_tokens), ) events.append(image_event) - + current_pos = image_pos + image_tokens - last_processed_pos = current_pos - 1 # Mark up to this position as processed + last_processed_pos = ( + current_pos - 1 + ) # Mark up to this position as processed image_idx += 1 # Add final text segment if any @@ -1314,7 +1424,7 @@ def get_mrope_input_positions( idx_range=(0, text_tokens), ) events.append(text_event) - + stream = Stream(events) tensor_stream = TensorStream([stream]) @@ -1334,8 +1444,7 @@ def get_mrope_input_positions( def get_multimodal_embeddings( self, **kwargs: object - ) -> MultiModalEmbeddings | None: - + ) -> MultiModalEmbeddings | None: pixel_values = kwargs.get("pixel_values") image_grid_thw = kwargs.get("image_grid_thw") @@ -1343,15 +1452,21 @@ def get_multimodal_embeddings( return [] # Convert image_grid_thw from [batch, 1, [T, H, W]] to [batch, [H, W]] - spatial_grids = image_grid_thw[:, 0, 1:3] # Extract H, W from [T, H, W] for each image - + spatial_grids = image_grid_thw[ + :, 0, 1:3 + ] # Extract H, W from [T, H, W] for each image + # Process packed sequence patches through vision_embedding module vision_embeddings = self.vision_embedding((pixel_values, spatial_grids)) # Split concatenated embeddings for each image item (following Qwen2-VL pattern) - merge_size = self.config.vision_config.pixel_shuffle_scale_factor # Isaac uses pixel shuffle - sizes = spatial_grids.prod(-1) // (merge_size * merge_size) # H * W / (merge_size^2) - + merge_size = ( + self.config.vision_config.pixel_shuffle_scale_factor + ) # Isaac uses pixel shuffle + sizes = spatial_grids.prod(-1) // ( + merge_size * merge_size + ) # H * W / (merge_size^2) + return vision_embeddings.split(sizes.tolist()) def 
get_input_embeddings( @@ -1362,13 +1477,11 @@ def get_input_embeddings( is_multimodal: torch.Tensor | None = None, handle_oov_mm_token: bool = False, ) -> torch.Tensor: - # Get text embeddings from the base language model inputs_embeds = super().get_input_embeddings(input_ids) - + # If we have multimodal embeddings, merge them with text embeddings if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: - inputs_embeds = _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, @@ -1379,7 +1492,7 @@ def get_input_embeddings( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes = [] - + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From ab38672419d0892ea7c2dedd0735109ffadc067e Mon Sep 17 00:00:00 2001 From: Yang Date: Fri, 21 Nov 2025 19:52:24 -0800 Subject: [PATCH 6/8] [Feature] Enhance Isaac model with vision embedding and attention mechanisms Signed-off-by: Yang --- vllm/model_executor/models/isaac.py | 452 +++++++++++++++++++++++++--- 1 file changed, 417 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 5c61e5bf48a7..d2d980a9aadf 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -14,15 +14,30 @@ import torch import torch.nn as nn import torch.nn.functional as F +from einops import rearrange from transformers import PretrainedConfig, Qwen3Config from transformers.image_processing_utils import BatchFeature from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig from transformers.tokenization_utils import TensorType from typing_extensions import TypedDict, Unpack -from vllm.attention.backends.registry import _Backend +from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.attention.layer import ( + check_upstream_fa_availability, + maybe_get_vit_flash_attn_backend, +) +from vllm.attention.ops.vit_attn_wrappers import ( + vit_xformers_attn_wrapper, +) from vllm.config import VllmConfig -from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, @@ -36,13 +51,14 @@ ) from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM -from vllm.model_executor.models.siglip2navit import Siglip2Encoder +from vllm.model_executor.models.siglip import SiglipMLP from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, _merge_multimodal_embeddings, maybe_prefix, ) +from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, @@ -332,6 +348,16 @@ def __init__( self.num_patches = num_patches +def create_cumulative_seq_lengths( + seq_sizes: torch.Tensor, device: torch.device +) -> tuple[torch.Tensor, int]: + """Create cumulative sequence lengths for variable-length attention.""" + cu_seqlens = torch.zeros(len(seq_sizes) + 1, dtype=torch.int32, device=device) + cu_seqlens[1:] = 
seq_sizes.cumsum(0) + max_seqlen = int(seq_sizes.max().item()) if len(seq_sizes) > 0 else 0 + return cu_seqlens, max_seqlen + + class Siglip2VariableSequenceEmbeddings(nn.Module): def __init__(self, config: PixelShuffleSiglip2VisionConfig): super().__init__() @@ -367,7 +393,7 @@ def positional_embeddings( align_corners = False antialias = True for spatial_shape in spatial_shapes: - height, width = spatial_shape + height, width = int(spatial_shape[0]), int(spatial_shape[1]) # Guard to ensure height and width are positive for torch.compile if height > 0 and width > 0: resized_pos_embed = F.interpolate( @@ -399,21 +425,16 @@ def forward( ): seq_patches, _seq_sizes, _spatial_shapes = packed_seq_patches - # Apply patch embeddings - target_dtype = self.patch_embedding.weight.dtype - patch_embeds = self.patch_embedding(seq_patches.to(dtype=target_dtype)) + target_weight = self.patch_embedding.weight + seq_patches = seq_patches.to( + device=target_weight.device, dtype=target_weight.dtype + ) + patch_embeds = self.patch_embedding(seq_patches) pos_embeds = self.positional_embeddings(packed_seq_patches) # Flatten patch embeddings to match positional embeddings format - batch_size, patches_per_image, embed_dim = patch_embeds.shape - - # For variable-length attention, we need to reshape to (total_tokens, embed_dim) - if batch_size != 1: - raise ValueError( - "Variable-length attention expects batch_size=1 for packed sequences" - ) - - patch_embeds = patch_embeds.view(batch_size * patches_per_image, embed_dim) + if patch_embeds.dim() == 3: + patch_embeds = patch_embeds.view(-1, patch_embeds.size(-1)) # Add positional embeddings to patch embeddings embeddings = patch_embeds + pos_embeds @@ -1162,6 +1183,313 @@ def get_replacement_isaac(item_idx: int): ] +def all_gather_interleave(local_tensor: torch.Tensor, hidden_size: int, tp_size: int): + """All-gather the input tensor interleavely across model parallel group.""" + import torch.distributed as dist + + gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] + dist.all_gather( + gathered_tensors, local_tensor, group=parallel_state.get_tp_group().device_group + ) + + gathered_tensors_split = [ + torch.split(tensor, hidden_size // tp_size, -1) for tensor in gathered_tensors + ] + ordered_tensors = [ + tensor for pair in zip(*gathered_tensors_split) for tensor in pair + ] + return torch.cat(ordered_tensors, dim=-1) + + +class Siglip2VisionAttention(nn.Module): + def __init__( + self, + config: PixelShuffleSiglip2VisionConfig, + quant_config: QuantizationConfig | None = None, + *, + prefix: str = "", + use_data_parallel: bool = False, + use_upstream_fa: bool = False, + attn_backend: AttentionBackendEnum | None = None, + attn_backend_override: AttentionBackendEnum | None = None, + ) -> None: + super().__init__() + + self.tp_size = ( + 1 + if use_data_parallel + else parallel_state.get_tensor_model_parallel_world_size() + ) + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.hidden_size_per_attention_head = dist_utils.divide( + config.hidden_size, config.num_attention_heads + ) + self.num_attention_heads_per_partition = dist_utils.divide( + config.num_attention_heads, self.tp_size + ) + + self.qkv_proj = QKVParallelLinear( + hidden_size=config.hidden_size, + head_size=self.hidden_size_per_attention_head, + total_num_heads=config.num_attention_heads, + total_num_kv_heads=config.num_attention_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + disable_tp=use_data_parallel, + ) + self.out_proj 
= RowParallelLinear( + input_size=config.hidden_size, + output_size=config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + disable_tp=use_data_parallel, + ) + + self.use_upstream_fa = use_upstream_fa + self.attn_backend = attn_backend + + if self.attn_backend not in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.ROCM_AITER_FA, + } and check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = AttentionBackendEnum.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.TORCH_SDPA, + AttentionBackendEnum.XFORMERS, + AttentionBackendEnum.ROCM_AITER_FA, + }: + raise RuntimeError( + f"Isaac vision embedding does not support {self.attn_backend} backend." + ) + self.attn_backend, self.flash_attn_varlen_func = ( + maybe_get_vit_flash_attn_backend( + self.attn_backend, + self.use_upstream_fa, + attn_backend_override=attn_backend_override, + ) + ) + self.is_flash_attn_backend = self.attn_backend in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.ROCM_AITER_FA, + } + + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = all_gather_interleave(qkv, self.qkv_proj.hidden_size, self.tp_size) + + q, k, v = qkv.chunk(3, dim=2) + + if self.tp_size > 1: + q = dist_utils.split_tensor_along_last_dim(q, self.tp_size)[self.tp_rank] + k = dist_utils.split_tensor_along_last_dim(k, self.tp_size)[self.tp_rank] + v = dist_utils.split_tensor_along_last_dim(v, self.tp_size)[self.tp_rank] + + new_shape = ( + seq_len, + bs, + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + hidden_states: torch.Tensor, + *, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor | None, + seqlens: torch.Tensor | None, + ) -> torch.Tensor: + batch_size, _, _ = hidden_states.shape + if batch_size != 1: + raise ValueError("packed variable-length attention expects batch_size=1") + x = rearrange(hidden_states, "b s d -> s b d") + x, _ = self.qkv_proj(x) + q, k, v = self.split_qkv(x) + q, k, v = (rearrange(t, "s b h d -> b s h d") for t in (q, k, v)) + + if self.is_flash_attn_backend: + q, k, v = (rearrange(t, "b s ... 
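split_qkv above undoes the fused qkv_proj output into per-head query/key/value tensors (with an extra interleaved all-gather when tensor parallelism is active). A simplified single-rank sketch of the chunk-and-reshape step, with made-up sizes and no tensor parallelism:

import torch

seq_len, bs, num_heads, head_dim = 360, 1, 12, 64
qkv = torch.randn(seq_len, bs, 3 * num_heads * head_dim)   # fused projection output

q, k, v = qkv.chunk(3, dim=2)                               # thirds of the last dim
q, k, v = (x.view(seq_len, bs, num_heads, head_dim) for x in (q, k, v))
print(q.shape)                                              # torch.Size([360, 1, 12, 64])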
-> (b s) ...") for t in (q, k, v)) + output = self.flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0.0, + causal=False, + ) + context_layer = rearrange( + output, "(b s) h d -> s b (h d)", b=batch_size + ).contiguous() + elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA: + outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = ( + rearrange(tensor, "b s h d -> b h s d") + for tensor in (q_i, k_i, v_i) + ) + output_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0) + output_i = rearrange(output_i, "b h s d -> b s h d") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + context_layer = rearrange( + context_layer, "b s h d -> s b (h d)" + ).contiguous() + elif self.attn_backend == AttentionBackendEnum.XFORMERS: + if seqlens is None: + raise ValueError("xFormers attention backend requires seqlens tensor.") + context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) + else: + raise RuntimeError( + f"Isaac vision embedding does not support {self.attn_backend} backend." + ) + + output, _ = self.out_proj(context_layer) + output = rearrange(output, "s b d -> b s d") + return output + + +class Siglip2EncoderLayer(nn.Module): + def __init__( + self, + config: PixelShuffleSiglip2VisionConfig, + quant_config: QuantizationConfig | None = None, + *, + prefix: str = "", + attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, + attn_backend_override: AttentionBackendEnum | None = None, + use_upstream_fa: bool = False, + use_data_parallel: bool = False, + ) -> None: + super().__init__() + self.embed_dim = config.hidden_size + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.self_attn = Siglip2VisionAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + use_data_parallel=use_data_parallel, + use_upstream_fa=use_upstream_fa, + attn_backend=attn_backend, + attn_backend_override=attn_backend_override, + ) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP( + config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + + def forward( + self, + hidden_states: torch.Tensor, + *, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor | None, + seqlens: torch.Tensor | None, + ) -> torch.Tensor: + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Siglip2Encoder(nn.Module): + def __init__( + self, + config: PixelShuffleSiglip2VisionConfig, + quant_config: QuantizationConfig | None = None, + *, + prefix: str = "", + use_data_parallel: bool = False, + attn_backend_override: AttentionBackendEnum | None = None, + ) -> None: + super().__init__() + self.config = config + embed_dim = config.hidden_size + num_heads = config.num_attention_heads + head_dim = embed_dim // num_heads + self.attn_backend = get_vit_attn_backend( + head_size=head_dim, + 
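The TORCH_SDPA branch above runs attention segment by segment so patches from different packed images never attend to each other. A self-contained sketch of that per-image loop (random tensors, illustrative shapes only):

import torch
import torch.nn.functional as F

batch, heads, head_dim = 1, 12, 64
cu_seqlens = torch.tensor([0, 64, 260, 360])              # three packed images
q = torch.randn(batch, int(cu_seqlens[-1]), heads, head_dim)
k, v = torch.randn_like(q), torch.randn_like(q)

outputs = []
for i in range(1, len(cu_seqlens)):
    start, end = int(cu_seqlens[i - 1]), int(cu_seqlens[i])
    # Slice one image's patches and move heads in front of the sequence dim.
    q_i, k_i, v_i = (t[:, start:end].transpose(1, 2) for t in (q, k, v))
    out_i = F.scaled_dot_product_attention(q_i, k_i, v_i, dropout_p=0.0)
    outputs.append(out_i.transpose(1, 2))                  # back to (b, s, h, d)

context = torch.cat(outputs, dim=1)
print(context.shape)                                       # torch.Size([1, 360, 12, 64])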
dtype=torch.get_default_dtype(), + attn_backend_override=attn_backend_override, + ) + self.use_upstream_fa = False + if self.attn_backend not in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.ROCM_AITER_FA, + } and check_upstream_fa_availability(torch.get_default_dtype()): + self.attn_backend = AttentionBackendEnum.FLASH_ATTN + self.use_upstream_fa = True + if self.attn_backend not in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.TORCH_SDPA, + AttentionBackendEnum.XFORMERS, + AttentionBackendEnum.ROCM_AITER_FA, + }: + raise RuntimeError( + f"Isaac vision embedding does not support {self.attn_backend} backend." + ) + self.layers = nn.ModuleList( + [ + Siglip2EncoderLayer( + config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + attn_backend=self.attn_backend, + attn_backend_override=attn_backend_override, + use_upstream_fa=self.use_upstream_fa, + use_data_parallel=use_data_parallel, + ) + for layer_idx in range(config.num_hidden_layers) + ] + ) + + def forward( + self, + inputs_embeds: torch.Tensor, + *, + cu_seqlens: torch.Tensor | None = None, + max_seqlen: torch.Tensor | None = None, + seqlens: torch.Tensor | None = None, + ) -> torch.Tensor: + hidden_states = inputs_embeds + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + return hidden_states + + class Siglip2VisionTransformer(nn.Module): def __init__( self, @@ -1169,7 +1497,7 @@ def __init__( quant_config: QuantizationConfig | None = None, prefix: str = "", use_data_parallel: bool = False, - attn_backend_override: _Backend | None = None, + attn_backend_override: AttentionBackendEnum | None = None, ): super().__init__() self.config = config @@ -1187,6 +1515,19 @@ def __init__( ) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + def compute_attn_mask_seqlen( + self, cu_seqlens: torch.Tensor + ) -> tuple[torch.Tensor | None, torch.Tensor | None]: + max_seqlen, seqlens = None, None + if self.encoder.attn_backend in { + AttentionBackendEnum.FLASH_ATTN, + AttentionBackendEnum.ROCM_AITER_FA, + }: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + elif self.encoder.attn_backend == AttentionBackendEnum.XFORMERS: + seqlens = cu_seqlens[1:] - cu_seqlens[:-1] + return max_seqlen, seqlens + def forward( self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor], @@ -1203,15 +1544,20 @@ def forward( # Get embeddings from packed sequence hidden_states = self.embeddings((seq_patches, seq_sizes, token_grids)) - grid_thws = torch.tensor( - [[1, token_grids[0][0].item(), token_grids[0][1].item()]] - ) - last_hidden_state = self.encoder(hidden_states, grid_thws) - hidden_states = self.post_layernorm(last_hidden_state) - # Add a pseudo batch dimension for the encoder hidden_states = hidden_states.unsqueeze(0) + cu_seqlens, _ = create_cumulative_seq_lengths(seq_sizes, hidden_states.device) + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + + hidden_states = self.encoder( + inputs_embeds=hidden_states, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = self.post_layernorm(hidden_states) + if self.pixel_shuffle_scale_factor > 1: hidden_states = pixel_shuffle_varlen( x=hidden_states, @@ -1252,6 +1598,44 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params +class IsaacVisionEmbedding(nn.Module): + def __init__( + self, + vision_cfg: 
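pixel_shuffle_varlen, applied after the post-layernorm above, folds each s x s block of patch tokens into a single token whose hidden size grows by s**2, so the language model sees h*w // s**2 tokens per image. A hedged sketch of the idea for one dense grid (it illustrates the reduction, not the exact variable-length ordering used by the model):

import torch

def pixel_shuffle_2d(x: torch.Tensor, scale: int) -> torch.Tensor:
    # x: (H, W, C) patch embeddings -> (H//scale, W//scale, C * scale**2)
    h, w, c = x.shape
    x = x.view(h // scale, scale, w // scale, scale, c)
    x = x.permute(0, 2, 1, 3, 4).reshape(h // scale, w // scale, c * scale * scale)
    return x

tokens = torch.randn(14, 14, 768)            # 196 patch tokens
shuffled = pixel_shuffle_2d(tokens, scale=2)
print(shuffled.shape)                        # torch.Size([7, 7, 3072]) -> 49 tokens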
PixelShuffleSiglip2VisionConfig, + hidden_dim: int, + output_dim: int, + prefix: str, + ): + super().__init__() + self.transformer = Siglip2VisionTransformer( + vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding") + ) + self.linear_fc1 = ColumnParallelLinear( + hidden_dim, + 4 * hidden_dim, + bias=False, + prefix=maybe_prefix(prefix, "vision_embedding.1"), + return_bias=False, + ) + self.act = nn.SiLU() + self.linear_fc2 = RowParallelLinear( + 4 * hidden_dim, + output_dim, + bias=False, + prefix=maybe_prefix(prefix, "vision_embedding.3"), + return_bias=False, + ) + + def forward( + self, packed_seq_patches: tuple[torch.Tensor, torch.Tensor] + ) -> torch.Tensor: + hidden_states = self.transformer(packed_seq_patches) + hidden_states = self.linear_fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_fc2(hidden_states) + return hidden_states + + @MULTIMODAL_REGISTRY.register_processor( IsaacMultiModalProcessor, info=IsaacProcessingInfo, @@ -1278,6 +1662,10 @@ class IsaacForConditionalGeneration( hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.vision_embedding.": "vision_embedding.", + "vision_embedding.0": "vision_embedding.transformer", + "vision_embedding.1": "vision_embedding.linear_fc1", + "vision_embedding.2": "vision_embedding.act", + "vision_embedding.3": "vision_embedding.linear_fc2", } ) @@ -1325,17 +1713,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): raise ValueError("IsaacConfig should always have vision_config") hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) - self.vision_embedding = nn.Sequential( - Siglip2VisionTransformer( - vision_cfg, prefix=maybe_prefix(prefix, "vision_embedding") - ), - nn.Linear( - hidden_dim, - 4 * hidden_dim, - bias=False, - ), - nn.SiLU(), - nn.Linear(4 * hidden_dim, config.hidden_size, bias=False), + self.vision_embedding = IsaacVisionEmbedding( + vision_cfg=vision_cfg, + hidden_dim=hidden_dim, + output_dim=config.hidden_size, + prefix=prefix, ) def get_mrope_input_positions( @@ -1502,6 +1884,6 @@ def get_mm_mapping(self) -> MultiModelKeys: """ return MultiModelKeys.from_string_field( language_model="language_model", - connector="vision_embedding.3", # The final linear layer + connector="vision_embedding.linear_fc2", # The final linear layer tower_model="vision_embedding", ) From f6695fe13457732a41e8383d9a4e4e9d305f94da Mon Sep 17 00:00:00 2001 From: Yang Date: Wed, 26 Nov 2025 15:09:47 -0800 Subject: [PATCH 7/8] 1. Add support for Isaac model in the registry and documentation 2. optimize Isaac model implementation. Signed-off-by: Yang --- docs/models/supported_models.md | 1 + tests/models/registry.py | 4 + vllm/model_executor/models/isaac.py | 396 +++++++++++++++------------- 3 files changed, 213 insertions(+), 188 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 626904a97415..b6af900632ed 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -681,6 +681,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. 
| ✅︎ | | +| `IsaacForConditionalGeneration` | Isaac | T + I+ | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ | | `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index b088e16756d7..5574f9bc6c41 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -630,6 +630,10 @@ def check_available_online( "HuggingFaceM4/Idefics3-8B-Llama3", extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}, ), + "IsaacForConditionalGeneration": _HfExamplesInfo( + "PerceptronAI/Isaac-0.1", + trust_remote_code=True, + ), "InternS1ForConditionalGeneration": _HfExamplesInfo( "internlm/Intern-S1", trust_remote_code=True ), diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index d2d980a9aadf..82dae62cb56e 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -4,7 +4,7 @@ import itertools import math -from collections.abc import Iterable, Mapping, Sequence +from collections.abc import Iterable, Iterator, Mapping, Sequence from dataclasses import dataclass from enum import Enum from typing import Any @@ -15,7 +15,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from transformers import PretrainedConfig, Qwen3Config +from transformers import Qwen3Config from transformers.image_processing_utils import BatchFeature from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig from transformers.tokenization_utils import TensorType @@ -30,8 +30,10 @@ vit_xformers_attn_wrapper, ) from vllm.config import VllmConfig +from vllm.config.model import ModelConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils +from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( ColumnParallelLinear, QKVParallelLinear, @@ -50,18 +52,18 @@ SupportsPP, ) from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.models.siglip import SiglipMLP from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, - _merge_multimodal_embeddings, + init_vllm_registered_model, maybe_prefix, ) from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalDataDict, + MultiModalFeatureSpec, MultiModalFieldConfig, MultiModalKwargs, ) @@ -73,6 +75,13 @@ PromptUpdate, ) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.tokenizer import ( + get_cached_tokenizer, + get_tokenizer, +) + +logger = init_logger(__name__) # ===== TensorStream Compatibility Layer for Isaac MRoPE ===== # Minimal implementation of TensorStream classes needed for Isaac's 3D positional @@ -286,12 +295,14 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten dims = 
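The compute_mrope_pos_tensor change that starts here gives text tokens identical coordinates on all three MRoPE planes, while image tokens enumerate a (temporal, row, column) grid. A toy sketch, assuming four text tokens followed by a single 2 x 3 image grid placed right after them:

import itertools

text_len, grid_h, grid_w = 4, 2, 3
positions = []

# Text: temporal, height and width planes advance together -> (t, t, t)
positions += [(t, t, t) for t in range(text_len)]

# Image: one shared temporal slot, spatial planes sweep the grid
t0 = text_len
positions += list(itertools.product([t0], range(grid_h), range(grid_w)))

print(positions[:4])   # [(0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3)]
print(positions[4:])   # [(4, 0, 0), (4, 0, 1), (4, 0, 2), (4, 1, 0), (4, 1, 1), (4, 1, 2)]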
(event.dims() or [1]) + [1] * (n_pos_dims - len(event.dims() or [])) # Create ranges for each dimension (similar to old _finalize implementation) - first_dim = range(cumulative_offset, cumulative_offset + dims[0]) + first_dim = list(range(cumulative_offset, cumulative_offset + dims[0])) cumulative_offset += dims[0] # advance time for the next event - other_dims = [range(d) for d in dims[1:]] - # Use itertools.product to create all coordinate combinations - full_coords = list(itertools.product(first_dim, *other_dims)) + if event.modality_type != VisionType.image: + full_coords = [(t, t, t) for t in first_dim] + else: + other_dims = [range(d) for d in dims[1:]] + full_coords = list(itertools.product(first_dim, *other_dims)) # Slice if the event is partial s, e = event.idx_range @@ -307,6 +318,19 @@ def compute_mrope_pos_tensor(ts: TensorStream, n_pos_dims: int = 3) -> torch.Ten ) +def _resolve_vision_token_id(model_config: ModelConfig, vision_token: str) -> int: + tokenizer_name = model_config.tokenizer or model_config.model + tokenizer = get_cached_tokenizer( + get_tokenizer( + tokenizer_name, + tokenizer_mode=model_config.tokenizer_mode, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.tokenizer_revision or model_config.revision, + ) + ) + return tokenizer.encode(vision_token, add_special_tokens=False)[0] + + def modality_mask(ts: TensorStream, modality_type: ModalityType) -> torch.Tensor: """Create boolean mask for specific modality type in the tensor stream.""" B, T = ts.shape @@ -883,7 +907,8 @@ def __init__( vision_min_num_patches: int | None = None, pixel_shuffle_scale: int = 1, max_sequence_length: int = 16384, - vision_token: str = "<|image_pad|>", + vision_token: str = "", + vision_attn_implementation: str | None = None, **kwargs, ): super().__init__(**kwargs) @@ -899,10 +924,25 @@ def __init__( self.vision_token = vision_token # Handle vision config - PixelShuffleSiglip2VisionConfig instance - self.vision_config = PixelShuffleSiglip2VisionConfig( - pixel_shuffle_scale_factor=pixel_shuffle_scale, - num_patches=vision_max_num_patches, + if isinstance(vision_config, dict): + self.vision_config = PixelShuffleSiglip2VisionConfig(**vision_config) + elif vision_config is None: + self.vision_config = PixelShuffleSiglip2VisionConfig() + else: + self.vision_config = vision_config + + # Ensure compatibility with pretrained checkpoints + self.vision_config.pixel_shuffle_scale_factor = getattr( + self.vision_config, + "pixel_shuffle_scale_factor", + pixel_shuffle_scale, + ) + self.vision_config.num_patches = getattr( + self.vision_config, + "num_patches", + vision_max_num_patches, ) + self.vision_attn_implementation = vision_attn_implementation class IsaacImageProcessorKwargs(TypedDict, total=False): @@ -991,9 +1031,9 @@ class IsaacProcessor: tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") def __init__(self, image_processor=None, tokenizer=None, **kwargs): + self.image_token = kwargs.pop("image_token", "") self.image_processor = image_processor or IsaacImageProcessor(kwargs) self.tokenizer = tokenizer - self.image_token = "<|image_pad|>" def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: result = {} @@ -1062,12 +1102,20 @@ def get_hf_config(self) -> IsaacConfig: max_sequence_length=getattr( original_config, "max_sequence_length", 16384 ), - vision_token="<|image_pad|>", + vision_token=getattr(original_config, "vision_token", ""), + vision_attn_implementation=getattr( + original_config, "vision_attn_implementation", None + ), ) return 
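_resolve_vision_token_id above asks the tokenizer for the placeholder's id instead of hard-coding one; add_special_tokens=False keeps the encoding to the placeholder itself. A small sketch of the same lookup written against a generic Hugging Face tokenizer (the helper name is illustrative):

def resolve_vision_token_id(tokenizer, vision_token: str) -> int:
    # The placeholder is expected to map to exactly one vocabulary id.
    ids = tokenizer.encode(vision_token, add_special_tokens=False)
    if len(ids) != 1:
        raise ValueError(f"{vision_token!r} did not encode to a single id: {ids}")
    return ids[0]

# Usage (illustrative):
#   tokenizer = AutoTokenizer.from_pretrained(<checkpoint>)
#   vision_id = resolve_vision_token_id(tokenizer, config.vision_token)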
IsaacConfig() def get_hf_processor(self, **kwargs) -> IsaacProcessor: - return self.ctx.get_hf_processor(IsaacProcessor, **kwargs) + hf_config = self.get_hf_config() + processor_kwargs = { + "image_token": hf_config.vision_token, + } + processor_kwargs.update(kwargs) + return self.ctx.get_hf_processor(IsaacProcessor, **processor_kwargs) def get_tokenizer(self): return self.ctx.tokenizer @@ -1157,11 +1205,13 @@ def _get_prompt_updates( out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: # hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + hf_config = self.info.get_hf_config() image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) tokenizer = self.info.get_tokenizer() - - vocab = tokenizer.get_vocab() - placeholder_id = vocab.get("<|image_pad|>", 151655) + placeholder_id = tokenizer.encode( + hf_config.vision_token, + add_special_tokens=False, + ) pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2) merge_length = pixel_shuffle_scale**2 @@ -1172,12 +1222,12 @@ def get_replacement_isaac(item_idx: int): assert isinstance(grid_thw, torch.Tensor) num_tokens = int(grid_thw.prod()) // merge_length - return [placeholder_id] * num_tokens + return placeholder_id * num_tokens return [ PromptReplacement( modality="image", - target=[placeholder_id], + target=placeholder_id, replacement=get_replacement_isaac, ) ] @@ -1278,16 +1328,7 @@ def __init__( def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: seq_len, bs, _ = qkv.shape - if self.tp_size > 1: - qkv = all_gather_interleave(qkv, self.qkv_proj.hidden_size, self.tp_size) - q, k, v = qkv.chunk(3, dim=2) - - if self.tp_size > 1: - q = dist_utils.split_tensor_along_last_dim(q, self.tp_size)[self.tp_rank] - k = dist_utils.split_tensor_along_last_dim(k, self.tp_size)[self.tp_rank] - v = dist_utils.split_tensor_along_last_dim(v, self.tp_size)[self.tp_rank] - new_shape = ( seq_len, bs, @@ -1604,7 +1645,8 @@ def __init__( vision_cfg: PixelShuffleSiglip2VisionConfig, hidden_dim: int, output_dim: int, - prefix: str, + quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.transformer = Siglip2VisionTransformer( @@ -1614,6 +1656,7 @@ def __init__( hidden_dim, 4 * hidden_dim, bias=False, + quant_config=quant_config, prefix=maybe_prefix(prefix, "vision_embedding.1"), return_bias=False, ) @@ -1622,6 +1665,7 @@ def __init__( 4 * hidden_dim, output_dim, bias=False, + quant_config=quant_config, prefix=maybe_prefix(prefix, "vision_embedding.3"), return_bias=False, ) @@ -1642,8 +1686,9 @@ def forward( dummy_inputs=IsaacDummyInputsBuilder, ) class IsaacForConditionalGeneration( - Qwen3ForCausalLM, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE + nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE ): + merge_by_field_config = True packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1661,221 +1706,196 @@ class IsaacForConditionalGeneration( # To ensure correct weight loading and mapping. 
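_get_prompt_updates above expands a single image placeholder into the number of tokens the vision tower will emit: the patch grid size divided by the pixel-shuffle merge length. The arithmetic in isolation (grid values are examples):

import torch

def image_feature_size(grid_thw: torch.Tensor, pixel_shuffle_scale: int) -> int:
    # grid_thw = (t, h, w) patch grid from the image processor; pixel shuffle
    # merges scale*scale patches into one LLM token.
    merge_length = pixel_shuffle_scale ** 2
    return int(grid_thw.prod()) // merge_length

grid = torch.tensor([1, 28, 28])                          # one frame, 28 x 28 patches
print(image_feature_size(grid, pixel_shuffle_scale=2))    # 196 placeholder tokens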
hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.vision_embedding.0": "vision_embedding.transformer", + "model.vision_embedding.1": "vision_embedding.linear_fc1", + "model.vision_embedding.2": "vision_embedding.act", + "model.vision_embedding.3": "vision_embedding.linear_fc2", "model.vision_embedding.": "vision_embedding.", - "vision_embedding.0": "vision_embedding.transformer", - "vision_embedding.1": "vision_embedding.linear_fc1", - "vision_embedding.2": "vision_embedding.act", - "vision_embedding.3": "vision_embedding.linear_fc2", + "model.": "language_model.model.", } ) @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): - return "<|image_pad|>" + return "" raise ValueError("Only image modality is supported") def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): + super().__init__() config: IsaacConfig = vllm_config.model_config.hf_config - head_dim = config.head_dim + quant_config = vllm_config.quant_config + self.config = config + self.multimodal_config = vllm_config.model_config.multimodal_config + head_dim = config.head_dim calculated_mrope_section = [ head_dim // 4, # 2x more for temporal dim head_dim // 8, head_dim // 8, ] - config.rope_scaling["mrope_section"] = calculated_mrope_section - self.config = config - - # Initialize the parent class with updated config - super().__init__(vllm_config=vllm_config, prefix=prefix) - - # Create the language model module to match checkpoint structure - self.language_model = nn.ModuleDict( - { - "embed_tokens": self.model.embed_tokens, - "layers": self.model.layers, - "norm": self.model.norm, - } + self.vision_token_id = _resolve_vision_token_id( + vllm_config.model_config, config.vision_token ) + config.image_token_id = self.vision_token_id - config.vision_config.preserve_original_pe = True - config.vision_config.use_rope = False - config.vision_config.hidden_stride = ( - config.vision_config.pixel_shuffle_scale_factor + logger.info("vllm config: %s", repr(vllm_config)) + config.rope_scaling["mrope_section"] = calculated_mrope_section + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + architectures=["Qwen3ForCausalLM"], + prefix=maybe_prefix(prefix, "language_model"), ) - config.vision_config.window_size = 32 * 2 - config.vision_config.fullatt_block_indexes = None + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + vision_cfg = config.vision_config if vision_cfg is None: raise ValueError("IsaacConfig should always have vision_config") + vision_cfg.preserve_original_pe = True + vision_cfg.use_rope = False + vision_cfg.hidden_stride = vision_cfg.pixel_shuffle_scale_factor + vision_cfg.window_size = 32 * 2 + vision_cfg.fullatt_block_indexes = None + attn_impl = ( + config.vision_attn_implementation + if config.vision_attn_implementation is not None + else getattr(config, "_attn_implementation", None) + ) + if attn_impl is not None: + vision_cfg._attn_implementation = attn_impl hidden_dim = vision_cfg.hidden_size * (vision_cfg.pixel_shuffle_scale_factor**2) self.vision_embedding = IsaacVisionEmbedding( vision_cfg=vision_cfg, hidden_dim=hidden_dim, output_dim=config.hidden_size, - prefix=prefix, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "vision_embedding"), ) + def iter_mm_grid_hw( + self, input_tokens: list[int], mm_features: list[MultiModalFeatureSpec] + ) -> Iterator[tuple[int, int, int]]: + spatial_merge_size = 
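The constructor above derives mrope_section from the attention head size: the temporal plane gets twice the rotary dimensions of each spatial plane, and the three sections together cover half of head_dim, which is the usual rotary budget. A quick numeric check, assuming head_dim = 128:

head_dim = 128
mrope_section = [head_dim // 4, head_dim // 8, head_dim // 8]   # temporal, height, width
print(mrope_section)        # [32, 16, 16]
print(sum(mrope_section))   # 64 == head_dim // 2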
self.config.vision_config.pixel_shuffle_scale_factor + for mm_feature in sorted(mm_features, key=lambda f: f.mm_position.offset): + offset = mm_feature.mm_position.offset + if mm_feature.modality == "image": + t, h, w = mm_feature.data["image_grid_thw"].data.tolist() + assert t == 1, f"Image must have 1 frame, got {t}" + yield offset, h // spatial_merge_size, w // spatial_merge_size + else: + raise ValueError(f"Unsupported modality: {mm_feature.modality}") + def get_mrope_input_positions( self, input_tokens: list[int], - hf_config: PretrainedConfig, - image_grid_thw: list[list[int]] | torch.Tensor, - video_grid_thw: list[list[int]] | torch.Tensor, - context_len: int = 0, - seq_len: int | None = None, - second_per_grid_ts: list[float] | None = None, - audio_feature_lengths: torch.Tensor | None = None, - use_audio_in_video: bool = False, + mm_features: list[MultiModalFeatureSpec], ) -> tuple[torch.Tensor, int]: - """Get mrope input positions and delta value.""" - - vision_token_id = getattr(self.config, "image_token_id", 151655) - spatial_merge_size = hf_config.vision_config.pixel_shuffle_scale_factor - input_tokens_tensor = torch.tensor(input_tokens) - - # Find image token positions - image_positions = torch.where(input_tokens_tensor == vision_token_id)[ - 0 - ].tolist() - - # For text-only inputs, use Isaac's original logic from - # compute_position_ids_input_ids() - if len(image_positions) == 0: - seq_len = len(input_tokens) - # Create 3D positions where all dimensions get the same 1D temporal - # progression - position_ids = torch.arange(seq_len, dtype=torch.long) - position_ids = position_ids.view(1, -1).expand(1, -1) # [1, seq_len] - position_ids = position_ids.unsqueeze(2).expand( - -1, -1, 3 - ) # [1, seq_len, 3] - - # vLLM expects shape [3, seq_len], so transpose - position_ids = position_ids.squeeze(0).transpose(0, 1) # [3, seq_len] - - return position_ids, 0 - - events = [] - image_idx = 0 - current_pos = 0 - last_processed_pos = -1 - - for image_pos in image_positions: - if image_pos <= last_processed_pos: - continue # Skip already processed positions - - # Add any text before this image - if image_pos > current_pos: - text_tokens = image_pos - current_pos - text_event = Event( - modality_type=TextType.text, - dims_virtual=[text_tokens, 1], - idx_range=(0, text_tokens), - ) - events.append(text_event) + llm_pos_ids_list = [] + st = 0 + for offset, llm_grid_h, llm_grid_w in self.iter_mm_grid_hw( + input_tokens, mm_features + ): + text_len = offset - st + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append( + np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx + ) - # Add image - t, h, w = image_grid_thw[image_idx] - llm_grid_h, llm_grid_w = h // spatial_merge_size, w // spatial_merge_size - image_tokens = t * llm_grid_h * llm_grid_w + grid_indices = np.indices((1, llm_grid_h, llm_grid_w)).reshape(3, -1) + grid_indices[0, :] = grid_indices[0, :] + text_len + st_idx + llm_pos_ids_list.append(grid_indices) + st = offset + llm_grid_h * llm_grid_w - image_event = Event( - modality_type=VisionType.image, - dims_virtual=[t, llm_grid_h, llm_grid_w], - idx_range=(0, image_tokens), + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1][0, -1] + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx ) - events.append(image_event) - - current_pos = image_pos + image_tokens - last_processed_pos = ( - current_pos - 1 - ) # 
Mark up to this position as processed - image_idx += 1 - - # Add final text segment if any - if current_pos < len(input_tokens): - text_tokens = len(input_tokens) - current_pos - text_event = Event( - modality_type=TextType.text, - dims_virtual=[text_tokens, 1], - idx_range=(0, text_tokens), - ) - events.append(text_event) - stream = Stream(events) - tensor_stream = TensorStream([stream]) + llm_positions = np.concatenate(llm_pos_ids_list, axis=1).reshape(3, -1) + mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() - # Use Isaac's native MRoPE calculation - position_ids = compute_mrope_pos_tensor(tensor_stream, n_pos_dims=3) + return torch.from_numpy(llm_positions), mrope_position_delta - # Max position per batch across the 3 planes and sequence dimension: (B,) - m_per_batch = position_ids.amax(dim=(1, 2)) + def _parse_and_validate_image_input( + self, **kwargs: object + ) -> dict[str, torch.Tensor] | None: + pixel_values = kwargs.get("pixel_values") + image_grid_thw = kwargs.get("image_grid_thw") + if pixel_values is None or image_grid_thw is None: + return None + return { + "pixel_values": pixel_values, + "image_grid_thw": image_grid_thw, + } - mrope_position_delta = (m_per_batch + 1 - len(input_tokens)).item() + def _process_image_input( + self, + image_input: dict[str, torch.Tensor], + ) -> tuple[torch.Tensor, ...]: + pixel_values = image_input["pixel_values"] + image_grid_thw = image_input["image_grid_thw"] + if pixel_values.numel() == 0: + return () + + device = next(self.language_model.parameters()).device + dtype = self.vision_embedding.linear_fc1.weight.dtype + pixel_values = pixel_values.to(device=device, dtype=dtype) + if image_grid_thw.dim() == 3: + image_grid_thw = image_grid_thw[0] + spatial_grids = image_grid_thw[:, 1:3].to(device, dtype=torch.int32) - # vLLM expects shape [3, seq_len] but Isaac returns [batch, seq_len, 3] - # Transpose to match vLLM's expected format - position_ids = position_ids.squeeze(0).transpose(0, 1) + vision_embeddings = self.vision_embedding((pixel_values, spatial_grids)) + merge_size = self.config.vision_config.pixel_shuffle_scale_factor + sizes = spatial_grids.prod(-1) // (merge_size * merge_size) + return tuple(vision_embeddings.split(sizes.tolist())) - return position_ids, mrope_position_delta + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return () + return self._process_image_input(image_input) def get_multimodal_embeddings( self, **kwargs: object ) -> MultiModalEmbeddings | None: - pixel_values = kwargs.get("pixel_values") - image_grid_thw = kwargs.get("image_grid_thw") - - if pixel_values is None: + # Backward compatibility for older runners. 
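_process_image_input above runs every image through the vision tower as one packed call and then splits the concatenated output back into per-image chunks, sized by each patch grid divided by the pixel-shuffle merge area. A sketch of the split with made-up grids and hidden size:

import torch

merge_size = 2
spatial_grids = torch.tensor([[28, 28], [14, 28]])            # (h, w) patch grid per image
sizes = spatial_grids.prod(-1) // (merge_size * merge_size)   # tokens per image: [196, 98]

hidden_size = 2048
packed = torch.randn(int(sizes.sum()), hidden_size)           # vision tower output

per_image = packed.split(sizes.tolist())
print([tuple(t.shape) for t in per_image])                    # [(196, 2048), (98, 2048)]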
+ embeddings = self.embed_multimodal(**kwargs) + if not embeddings: return [] + return embeddings - # Convert image_grid_thw from [batch, 1, [T, H, W]] to [batch, [H, W]] - spatial_grids = image_grid_thw[ - :, 0, 1:3 - ] # Extract H, W from [T, H, W] for each image - - # Process packed sequence patches through vision_embedding module - vision_embeddings = self.vision_embedding((pixel_values, spatial_grids)) - - # Split concatenated embeddings for each image item (following Qwen2-VL pattern) - merge_size = ( - self.config.vision_config.pixel_shuffle_scale_factor - ) # Isaac uses pixel shuffle - sizes = spatial_grids.prod(-1) // ( - merge_size * merge_size - ) # H * W / (merge_size^2) - - return vision_embeddings.split(sizes.tolist()) + def get_language_model(self) -> torch.nn.Module: + return self.language_model - def get_input_embeddings( + def forward( self, input_ids: torch.Tensor, - multimodal_embeddings: MultiModalEmbeddings | None = None, - *, - is_multimodal: torch.Tensor | None = None, - handle_oov_mm_token: bool = False, - ) -> torch.Tensor: - # Get text embeddings from the base language model - inputs_embeds = super().get_input_embeddings(input_ids) - - # If we have multimodal embeddings, merge them with text embeddings - if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: - inputs_embeds = _merge_multimodal_embeddings( - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, - ) + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ) -> torch.Tensor | IntermediateTensors: + return self.language_model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) - return inputs_embeds + def compute_logits(self, hidden_states: torch.Tensor) -> torch.Tensor | None: + return self.language_model.compute_logits(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - skip_prefixes = [] - - loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_mm_mapping(self) -> MultiModelKeys: From a69bdafe094a8b801595719b995d070743e5a139 Mon Sep 17 00:00:00 2001 From: Oscar Gonzalez Date: Tue, 2 Dec 2025 01:09:19 -0500 Subject: [PATCH 8/8] 1. Remove upstream fa checks (#29471) 2. Remove deprecated xformers (#29262) 3. 
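With the decoder nested under language_model and the vision MLP promoted to named submodules, hf_to_vllm_mapper (consumed by load_weights above) renames checkpoint keys by prefix before AutoWeightsLoader loads them. A simplified, dict-based sketch of that rewrite; the prefixes are copied from the patch, while the helper itself is only an illustration, not vLLM's WeightsMapper:

PREFIX_MAP = {
    "lm_head.": "language_model.lm_head.",
    "model.vision_embedding.0": "vision_embedding.transformer",
    "model.vision_embedding.1": "vision_embedding.linear_fc1",
    "model.vision_embedding.2": "vision_embedding.act",
    "model.vision_embedding.3": "vision_embedding.linear_fc2",
    "model.vision_embedding.": "vision_embedding.",
    "model.": "language_model.model.",
}

def remap_key(name: str) -> str:
    # In this sketch the first matching prefix wins, so the specific vision
    # rules are listed before the catch-all "model." rule.
    for old, new in PREFIX_MAP.items():
        if name.startswith(old):
            return new + name[len(old):]
    return name

print(remap_key("model.vision_embedding.1.weight"))
# vision_embedding.linear_fc1.weight
print(remap_key("model.layers.0.self_attn.qkv_proj.weight"))
# language_model.model.layers.0.self_attn.qkv_proj.weight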
Updated _get_prompt_updates() Signed-off-by: Oscar Gonzalez --- vllm/model_executor/models/isaac.py | 46 ++++------------------------- 1 file changed, 5 insertions(+), 41 deletions(-) diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 82dae62cb56e..e5a2d5440724 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -23,12 +23,8 @@ from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.layer import ( - check_upstream_fa_availability, maybe_get_vit_flash_attn_backend, ) -from vllm.attention.ops.vit_attn_wrappers import ( - vit_xformers_attn_wrapper, -) from vllm.config import VllmConfig from vllm.config.model import ModelConfig from vllm.distributed import parallel_state @@ -73,6 +69,7 @@ BaseProcessingInfo, PromptReplacement, PromptUpdate, + PromptUpdateDetails, ) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors @@ -1204,14 +1201,7 @@ def _get_prompt_updates( hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> Sequence[PromptUpdate]: - # hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - hf_config = self.info.get_hf_config() image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) - tokenizer = self.info.get_tokenizer() - placeholder_id = tokenizer.encode( - hf_config.vision_token, - add_special_tokens=False, - ) pixel_shuffle_scale = getattr(image_processor, "pixel_shuffle_scale", 2) merge_length = pixel_shuffle_scale**2 @@ -1221,13 +1211,14 @@ def get_replacement_isaac(item_idx: int): grid_thw = out_item["image_grid_thw"].data assert isinstance(grid_thw, torch.Tensor) - num_tokens = int(grid_thw.prod()) // merge_length - return placeholder_id * num_tokens + feature_size = int(grid_thw.prod()) // merge_length + repl_full = "<|image_pad|>" * feature_size + return PromptUpdateDetails.select_text(repl_full, "<|image_pad|>") return [ PromptReplacement( modality="image", - target=placeholder_id, + target="", replacement=get_replacement_isaac, ) ] @@ -1259,7 +1250,6 @@ def __init__( *, prefix: str = "", use_data_parallel: bool = False, - use_upstream_fa: bool = False, attn_backend: AttentionBackendEnum | None = None, attn_backend_override: AttentionBackendEnum | None = None, ) -> None: @@ -1296,19 +1286,11 @@ def __init__( disable_tp=use_data_parallel, ) - self.use_upstream_fa = use_upstream_fa self.attn_backend = attn_backend - if self.attn_backend not in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.ROCM_AITER_FA, - } and check_upstream_fa_availability(torch.get_default_dtype()): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN - self.use_upstream_fa = True if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.TORCH_SDPA, - AttentionBackendEnum.XFORMERS, AttentionBackendEnum.ROCM_AITER_FA, }: raise RuntimeError( @@ -1317,7 +1299,6 @@ def __init__( self.attn_backend, self.flash_attn_varlen_func = ( maybe_get_vit_flash_attn_backend( self.attn_backend, - self.use_upstream_fa, attn_backend_override=attn_backend_override, ) ) @@ -1389,10 +1370,6 @@ def forward( context_layer = rearrange( context_layer, "b s h d -> s b (h d)" ).contiguous() - elif self.attn_backend == AttentionBackendEnum.XFORMERS: - if seqlens is None: - raise ValueError("xFormers attention backend requires seqlens tensor.") - context_layer = vit_xformers_attn_wrapper(q, k, v, seqlens) else: raise RuntimeError( f"Isaac vision embedding does not support 
{self.attn_backend} backend." @@ -1412,7 +1389,6 @@ def __init__( prefix: str = "", attn_backend: AttentionBackendEnum = AttentionBackendEnum.TORCH_SDPA, attn_backend_override: AttentionBackendEnum | None = None, - use_upstream_fa: bool = False, use_data_parallel: bool = False, ) -> None: super().__init__() @@ -1423,7 +1399,6 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.self_attn", use_data_parallel=use_data_parallel, - use_upstream_fa=use_upstream_fa, attn_backend=attn_backend, attn_backend_override=attn_backend_override, ) @@ -1481,17 +1456,9 @@ def __init__( dtype=torch.get_default_dtype(), attn_backend_override=attn_backend_override, ) - self.use_upstream_fa = False - if self.attn_backend not in { - AttentionBackendEnum.FLASH_ATTN, - AttentionBackendEnum.ROCM_AITER_FA, - } and check_upstream_fa_availability(torch.get_default_dtype()): - self.attn_backend = AttentionBackendEnum.FLASH_ATTN - self.use_upstream_fa = True if self.attn_backend not in { AttentionBackendEnum.FLASH_ATTN, AttentionBackendEnum.TORCH_SDPA, - AttentionBackendEnum.XFORMERS, AttentionBackendEnum.ROCM_AITER_FA, }: raise RuntimeError( @@ -1505,7 +1472,6 @@ def __init__( prefix=f"{prefix}.layers.{layer_idx}", attn_backend=self.attn_backend, attn_backend_override=attn_backend_override, - use_upstream_fa=self.use_upstream_fa, use_data_parallel=use_data_parallel, ) for layer_idx in range(config.num_hidden_layers) @@ -1565,8 +1531,6 @@ def compute_attn_mask_seqlen( AttentionBackendEnum.ROCM_AITER_FA, }: max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() - elif self.encoder.attn_backend == AttentionBackendEnum.XFORMERS: - seqlens = cu_seqlens[1:] - cu_seqlens[:-1] return max_seqlen, seqlens def forward(
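Patch 8 drops the upstream flash-attention probing and the xFormers path, so both the attention layer and the encoder now validate the resolved backend against a short allow-list and fail fast otherwise. A minimal sketch of that guard pattern with a stand-in enum (the real code uses vLLM's AttentionBackendEnum):

from enum import Enum, auto

class Backend(Enum):        # stand-in for AttentionBackendEnum
    FLASH_ATTN = auto()
    TORCH_SDPA = auto()
    ROCM_AITER_FA = auto()
    XFORMERS = auto()

SUPPORTED = {Backend.FLASH_ATTN, Backend.TORCH_SDPA, Backend.ROCM_AITER_FA}

def check_backend(backend: Backend) -> Backend:
    if backend not in SUPPORTED:
        raise RuntimeError(
            f"Isaac vision embedding does not support {backend} backend."
        )
    return backend

check_backend(Backend.TORCH_SDPA)     # accepted
# check_backend(Backend.XFORMERS)     # would raise RuntimeError after this patch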