 
 from __future__ import annotations
 
+from typing import Optional, Tuple
+
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 from monai.utils import optional_import
 
@@ -23,6 +26,7 @@ class SABlock(nn.Module):
     """
     A self-attention block, based on: "Dosovitskiy et al.,
     An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>"
+    One can set up relative positional embedding as described in <https://arxiv.org/abs/2112.01526>.
     """
 
     def __init__(
@@ -32,13 +36,18 @@ def __init__(
         dropout_rate: float = 0.0,
         qkv_bias: bool = False,
         save_attn: bool = False,
+        use_rel_pos: bool = False,
+        input_size: Optional[Tuple[int, int]] = None,
     ) -> None:
         """
         Args:
             hidden_size (int): dimension of hidden layer.
             num_heads (int): number of attention heads.
             dropout_rate (float, optional): fraction of the input units to drop. Defaults to 0.0.
             qkv_bias (bool, optional): bias term for the qkv linear layer. Defaults to False.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map. Only supports 2D inputs.
+            input_size (tuple(int, int) or None): Input resolution for calculating the relative
+                positional parameter size.
             save_attn (bool, optional): to make accessible the attention matrix. Defaults to False.
 
         """
@@ -62,11 +71,43 @@ def __init__(
         self.scale = self.head_dim**-0.5
         self.save_attn = save_attn
         self.att_mat = torch.Tensor()
+        self.use_rel_pos = use_rel_pos
+        self.input_size = input_size
+
+        if self.use_rel_pos:
+            assert input_size is not None, "Input size must be provided if using relative positional encoding."
+            assert len(input_size) == 2, "Relative positional embedding is only supported for 2D"
+            # initialize relative positional embeddings
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, self.head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, self.head_dim))
+
+    def forward(self, x: torch.Tensor):
+        """
+        Args:
+            x (torch.Tensor): input tensor. B x (s_dim_1 * ... * s_dim_n) x C
 
-    def forward(self, x):
+        Return:
+            torch.Tensor: B x (s_dim_1 * ... * s_dim_n) x C
+        """
         output = self.input_rearrange(self.qkv(x))
         q, k, v = output[0], output[1], output[2]
-        att_mat = (torch.einsum("blxd,blyd->blxy", q, k) * self.scale).softmax(dim=-1)
+        att_mat = torch.einsum("blxd,blyd->blxy", q, k) * self.scale
+
+        if self.use_rel_pos:
+            batch = x.shape[0]
+            h, w = self.input_size
+            att_mat = add_decomposed_rel_pos(
+                att_mat.view(batch * self.num_heads, h * w, h * w),
+                q.view(batch * self.num_heads, h * w, -1),
+                self.rel_pos_h,
+                self.rel_pos_w,
+                (h, w),
+                (h, w),
+            )
+            att_mat = att_mat.reshape(batch, self.num_heads, h * w, h * w)
+
+        att_mat = att_mat.softmax(dim=-1)
+
         if self.save_attn:
             # no gradients and new tensor;
             # https://pytorch.org/docs/stable/generated/torch.Tensor.detach.html
@@ -78,3 +119,74 @@ def forward(self, x):
         x = self.out_proj(x)
         x = self.drop_output(x)
         return x
+
+
+def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+    """
+    Get relative positional embeddings according to the relative positions of
+    query and key sizes.
+    Args:
+        q_size (int): size of query q.
+        k_size (int): size of key k.
+        rel_pos (Tensor): relative position embeddings (L, C).
+
+    Returns:
+        Extracted positional embeddings according to relative positions.
+    """
+    rel_pos_resized: torch.Tensor = torch.Tensor()
+    max_rel_dist = int(2 * max(q_size, k_size) - 1)
+    # Interpolate rel pos if needed.
+    if rel_pos.shape[0] != max_rel_dist:
+        # Interpolate rel pos.
+        rel_pos_resized = F.interpolate(
+            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), size=max_rel_dist, mode="linear"
+        )
+        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
+    else:
+        rel_pos_resized = rel_pos
+
+    # Scale the coords with short length if shapes for q and k are different.
+    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
+
+    return rel_pos_resized[relative_coords.long()]
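
A quick check one could run against get_rel_pos above (illustrative sizes, not from the original): with q_size = k_size = 4 the table holds 2 * 4 - 1 = 7 learnable offsets, and the returned tensor is indexed by the relative coordinate (i - j) + 3.

import torch

head_dim = 8
rel_pos = torch.randn(2 * 4 - 1, head_dim)  # (L, C) with L = 2 * max(q_size, k_size) - 1
out = get_rel_pos(4, 4, rel_pos)            # lengths match, so no interpolation happens
print(out.shape)                            # torch.Size([4, 4, 8])
assert torch.equal(out[0, 0], rel_pos[3])   # offset 0 picks the centre embedding
assert torch.equal(out[3, 0], rel_pos[6])   # offset +3 picks the last embedding
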
+
+
+def add_decomposed_rel_pos(
+    attn: torch.Tensor,
+    q: torch.Tensor,
+    rel_pos_h: torch.Tensor,
+    rel_pos_w: torch.Tensor,
+    q_size: Tuple[int, int],
+    k_size: Tuple[int, int],
+) -> torch.Tensor:
+    """
+    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
+    Args:
+        attn (Tensor): attention map.
+        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
+        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
+        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
+        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
+
+    Returns:
+        attn (Tensor): attention map with added relative positional embeddings.
+    """
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    rh = get_rel_pos(q_h, k_h, rel_pos_h)
+    rw = get_rel_pos(q_w, k_w, rel_pos_w)
+
+    batch, _, dim = q.shape
+    r_q = q.reshape(batch, q_h, q_w, dim)
+    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, rh)
+    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, rw)
+
+    attn = (attn.view(batch, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]).view(
+        batch, q_h * q_w, k_h * k_w
+    )
+
+    return attn
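
And a corresponding shape check for add_decomposed_rel_pos, mirroring how forward calls it (again with assumed, illustrative sizes; attn is flattened over batch and heads, and zero-initialised embeddings add a zero bias):

import torch

b_heads, h, w, head_dim = 8, 14, 14, 24        # e.g. batch=2 and num_heads=4
attn = torch.zeros(b_heads, h * w, h * w)      # (B * num_heads, L, L) attention logits
q = torch.randn(b_heads, h * w, head_dim)      # queries flattened the same way
rel_pos_h = torch.zeros(2 * h - 1, head_dim)   # (Lh, C), as initialised in __init__
rel_pos_w = torch.zeros(2 * w - 1, head_dim)   # (Lw, C)

attn = add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, (h, w), (h, w))
print(attn.shape)  # torch.Size([8, 196, 196])
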