Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions src/anomalib/models/image/dinomaly/torch_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,9 @@ def get_encoder_decoder_outputs(self, x: torch.Tensor) -> tuple[list[torch.Tenso
- en: List of fused encoder features reshaped to spatial dimensions
- de: List of fused decoder features reshaped to spatial dimensions
"""
h_patches = x.shape[2] // self.encoder.patch_size
w_patches = x.shape[3] // self.encoder.patch_size
x = self.encoder.prepare_tokens(x)

encoder_features = []
decoder_features = []

Expand All @@ -216,8 +217,7 @@ def get_encoder_decoder_outputs(self, x: torch.Tensor) -> tuple[list[torch.Tenso
continue
if i in self.target_layers:
encoder_features.append(x)
side = int(math.sqrt(encoder_features[0].shape[1] - 1 - self.encoder.num_register_tokens))


if self.remove_class_token:
encoder_features = [e[:, 1 + self.encoder.num_register_tokens :, :] for e in encoder_features]

Expand All @@ -237,8 +237,8 @@ def get_encoder_decoder_outputs(self, x: torch.Tensor) -> tuple[list[torch.Tenso
de = [self._fuse_feature([decoder_features[idx] for idx in idxs]) for idxs in self.fuse_layer_decoder]

# Process features for spatial output
en = self._process_features_for_spatial_output(en, side)
de = self._process_features_for_spatial_output(de, side)
en = self._process_features_for_spatial_output(en, h_patches, w_patches)
de = self._process_features_for_spatial_output(de, h_patches, w_patches)
return en, de

def forward(self, batch: torch.Tensor, global_step: int | None = None) -> torch.Tensor | InferenceBatch:
Expand All @@ -262,7 +262,7 @@ def forward(self, batch: torch.Tensor, global_step: int | None = None) -> torch.

"""
en, de = self.get_encoder_decoder_outputs(batch)
image_size = batch.shape[2]
image_size = (batch.shape[2], batch.shape[3])

if self.training:
if global_step is None:
Expand Down Expand Up @@ -376,13 +376,14 @@ def _get_architecture_config(encoder_name: str, target_layers: list[int] | None)
def _process_features_for_spatial_output(
self,
features: list[torch.Tensor],
side: int,
h_patches: int, w_patches: int
) -> list[torch.Tensor]:
"""Process features for spatial output by removing tokens and reshaping.

Args:
features: List of feature tensors
side: Side length for spatial reshaping
h_patches: Number of patches in height dimension
w_patches: Number of patches in width dimension

Returns:
List of processed feature tensors with spatial dimensions
Expand All @@ -393,7 +394,7 @@ def _process_features_for_spatial_output(

# Reshape to spatial dimensions
batch_size = features[0].shape[0]
return [f.permute(0, 2, 1).reshape([batch_size, -1, side, side]).contiguous() for f in features]
return [f.permute(0, 2, 1).reshape([batch_size, -1, h_patches, w_patches]).contiguous() for f in features]


class DecoderViTBlock(nn.Module):
Expand Down
Loading
Loading