@@ -154,8 +154,19 @@ def forward(
         if self.config.scale_embeddings:
             x = x * torch.tensor(self.config.n_embd**0.5, dtype=x.dtype)
 
-        for block in self.transformer.h:
-            x = block(x, cos, sin, mask, input_pos, input_pos_maxp1)
+        for block_idx, block in enumerate(self.transformer.h):
+            if self.config.rope_indices is not None:
+                x = block(
+                    x,
+                    cos[..., self.config.rope_indices[block_idx]],
+                    sin[..., self.config.rope_indices[block_idx]],
+                    mask,
+                    input_pos,
+                    input_pos_maxp1,
+                )
+            else:
+                x = block(x, cos, sin, mask, input_pos, input_pos_maxp1)
+
         x = self.transformer.ln_f(x)
         clamp_head = (
             partial(do_softcapping, thresh=self.config.final_logit_softcapping)
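The per-block indexing above suggests that several precomputed RoPE tables are stacked along the last dimension of `cos`/`sin`, with `config.rope_indices` mapping each transformer block to one of them. A minimal sketch of that selection, with assumed, illustrative shapes and names (`seq_len`, `head_dim`, two stacked variants):

```python
import torch

seq_len, head_dim = 8, 4

# Two hypothetical RoPE cos tables (e.g. different rope base for local vs.
# global attention layers), stacked on the last dim -> (seq_len, head_dim, 2).
cos_a = torch.cos(torch.randn(seq_len, head_dim))
cos_b = torch.cos(torch.randn(seq_len, head_dim))
cos = torch.stack((cos_a, cos_b), dim=-1)

# One entry per transformer block: which variant that block should use.
rope_indices = [0, 1, 0, 1]

for block_idx in range(len(rope_indices)):
    # Mirrors `cos[..., self.config.rope_indices[block_idx]]` in the diff:
    # selecting along the last axis recovers a per-block (seq_len, head_dim) table.
    cos_block = cos[..., rope_indices[block_idx]]
    assert cos_block.shape == (seq_len, head_dim)
```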
@@ -215,7 +226,10 @@ def set_kv_cache(
         dtype: Optional[torch.dtype] = None,
     ) -> None:
         if rope_cache_length is None:
-            rope_cache_length = self.cos.size(-1)
+            if len(self.cos.shape) == 2:
+                rope_cache_length = self.cos.size(-1)
+            else:
+                rope_cache_length = self.cos.size(-2)
 
         if max_seq_length is None:
             max_seq_length = self.max_seq_length
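The `set_kv_cache` change follows from that same layout: with a single RoPE table the cache is 2-D and the rotary dimension is the last axis, while with stacked per-block variants it is 3-D and the rotary dimension moves to the second-to-last axis. A small illustration with assumed shapes:

```python
import torch

cos_2d = torch.ones(16, 32)     # (seq_len, rope_n_elem): single RoPE table
cos_3d = torch.ones(16, 32, 2)  # (seq_len, rope_n_elem, n_variants): stacked tables

def rope_cache_length(cos: torch.Tensor) -> int:
    # Mirrors the diff: pick whichever axis holds the rotary dimension.
    return cos.size(-1) if cos.dim() == 2 else cos.size(-2)

assert rope_cache_length(cos_2d) == 32
assert rope_cache_length(cos_3d) == 32
```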