Spaces:

timm
/

timmAttentionViz

Running

App Files Community

rwightman HF staff committed on Aug 28

Commit

7163838

•

1 Parent(s): 8af18ab

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -5

app.py CHANGED Viewed

@@ -24,6 +24,7 @@ class AttentionExtractor:
                 self.attention_maps[module.full_name] = output
         for name, module in self.model.named_modules():
             if name.lower().endswith('.attn_drop'):
                 module.full_name = name
                 print('hooking', name)
@@ -34,8 +35,9 @@ class AttentionExtractor:
 def get_attention_models() -> List[str]:
     """Get a list of timm models that have attention blocks."""
-    all_models = timm.list_models()
-    attention_models = [model for model in all_models if 'vit' in model.lower()]  # Focusing on ViT models for simplicity
     return attention_models
 def load_model(model_name: str) -> Tuple[torch.nn.Module, AttentionExtractor]:
@@ -88,7 +90,8 @@ def visualize_attention(image: Image.Image, model_name: str) -> List[Image.Image
     """Visualize attention maps for the given image and model."""
     model, extractor = load_model(model_name)
     attention_maps = process_image(image, model, extractor)
     # Convert PIL Image to numpy array
     image_np = np.array(image)
@@ -97,8 +100,8 @@ def visualize_attention(image: Image.Image, model_name: str) -> List[Image.Image
     for layer_name, attn_map in attention_maps.items():
         print(f"Attention map shape for {layer_name}: {attn_map.shape}")
-        # Remove the CLS token attention and average over heads
-        attn_map = attn_map[0, :, 0, 1:].mean(0)  # Shape: (seq_len-1,)
         # Reshape the attention map to 2D
         num_patches = int(np.sqrt(attn_map.shape[0]))

                 self.attention_maps[module.full_name] = output
         for name, module in self.model.named_modules():
+            # FIXME need to make more generic outside of vit
             if name.lower().endswith('.attn_drop'):
                 module.full_name = name
                 print('hooking', name)
 def get_attention_models() -> List[str]:
     """Get a list of timm models that have attention blocks."""
+    all_models = timm.list_pretrained()
+    # FIXME Focusing on ViT models for initial impl
+    attention_models = [model for model in all_models if any([model.lower().startswith(p) for p in ('vit', 'deit', 'beit', 'eva')])]
     return attention_models
 def load_model(model_name: str) -> Tuple[torch.nn.Module, AttentionExtractor]:
     """Visualize attention maps for the given image and model."""
     model, extractor = load_model(model_name)
     attention_maps = process_image(image, model, extractor)
+    num_prefix_tokens = getattr(model, 'num_prefix_tokens', 0)
     # Convert PIL Image to numpy array
     image_np = np.array(image)
     for layer_name, attn_map in attention_maps.items():
         print(f"Attention map shape for {layer_name}: {attn_map.shape}")
+        # Remove the CLS token attention and average over heads
+        attn_map = attn_map[0, :, 0, num_prefix_tokens:].mean(0)  # Shape: (seq_len-1,)
         # Reshape the attention map to 2D
         num_patches = int(np.sqrt(attn_map.shape[0]))