
Commit 6eba533

feat: add linear rope type (#1982)
1 parent e484a83 commit 6eba533

2 files changed (+69, -12 lines)


litgpt/model.py

Lines changed: 15 additions & 12 deletions
@@ -588,19 +588,22 @@ def build_rope_cache(
     theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device).float() / n_elem))
 
     if extra_config is not None:
-        orig_context_len = extra_config["original_max_seq_len"]
         factor = extra_config["factor"]
-        low_freq_factor = extra_config["low_freq_factor"]
-        high_freq_factor = extra_config["high_freq_factor"]
-
-        wavelen = 2 * torch.pi / theta
-        ratio = orig_context_len / wavelen
-        smooth_factor = (ratio - low_freq_factor) / (high_freq_factor - low_freq_factor)
-        smooth_factor = torch.clamp(smooth_factor, min=0.0, max=1.0)
-
-        # Compute adjusted_theta without masked indexing
-        adjusted_theta = (1 - smooth_factor) * (theta / factor) + smooth_factor * theta
-        theta = adjusted_theta
+        if "original_max_seq_len" in extra_config:
+            orig_context_len = extra_config["original_max_seq_len"]
+            low_freq_factor = extra_config["low_freq_factor"]
+            high_freq_factor = extra_config["high_freq_factor"]
+
+            wavelen = 2 * torch.pi / theta
+            ratio = orig_context_len / wavelen
+            smooth_factor = (ratio - low_freq_factor) / (high_freq_factor - low_freq_factor)
+            smooth_factor = torch.clamp(smooth_factor, min=0.0, max=1.0)
+
+            # Compute adjusted_theta without masked indexing
+            adjusted_theta = (1 - smooth_factor) * (theta / factor) + smooth_factor * theta
+            theta = adjusted_theta
+        else:
+            theta = theta / factor
 
     # Create position indices `[0, 1, ..., seq_len - 1]`
     seq_idx = torch.arange(seq_len, device=device) / condense_ratio
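
A note on the new else branch: when extra_config carries only a "factor" (the "linear" rope type this commit adds), the inverse frequencies are divided by that factor, which is equivalent to compressing the position indices by the same factor. The snippet below is a standalone illustration of that equivalence, not litgpt code; the names n_elem, base, and factor simply mirror the arguments above.

import torch

# Minimal sketch of linear RoPE scaling: dividing theta (the inverse
# frequencies) by `factor` yields the same rotation angles as dividing
# the position indices by `factor`.
n_elem, base, factor, seq_len = 32, 50_000, 8.0, 10

theta = 1.0 / (base ** (torch.arange(0, n_elem, 2).float() / n_elem))
positions = torch.arange(seq_len).float()

angles_scaled_theta = torch.outer(positions, theta / factor)      # what the else branch computes
angles_scaled_positions = torch.outer(positions / factor, theta)  # positional-interpolation view

torch.testing.assert_close(angles_scaled_theta, angles_scaled_positions)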

tests/test_rope.py

Lines changed: 54 additions & 0 deletions
@@ -1,5 +1,6 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
 
+import pytest
 import torch
 from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXRotaryEmbedding
 from transformers.models.gpt_neox.modeling_gpt_neox import apply_rotary_pos_emb as apply_rotary_pos_emb_gptneo

@@ -218,6 +219,59 @@ def test_rope_llama_3_2():
     torch.testing.assert_close(theirs_k_rot, ours_k_rot)
 
 
+# See https://huggingface.co/google/gemma-3-27b-it/blob/main/config.json for settings
+# TODO: update HF transformers version to support Gemma3 and fix errors that causes after the update
+@pytest.mark.skip(reason="This test fails due to the HF transformers version not supporting Gemma3")
+@torch.inference_mode()
+def test_rope_gemma_3():
+    from transformers.models.gemma3.configuration_gemma3 import Gemma3TextConfig
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3RotaryEmbedding, apply_rotary_pos_emb
+
+    head_dim = 32
+    rope_theta = 50_000
+    their_rope_config = {
+        "factor": 8.0,
+        "rope_type": "linear",
+    }
+
+    our_rope_config = {"factor": 8.0}
+
+    ##################################
+    # Compare cos and sin
+    ##################################
+    # transformer rope
+    config = Gemma3TextConfig(rope_theta=rope_theta, rope_scaling=their_rope_config, head_dim=head_dim)
+    rot_emb = Gemma3RotaryEmbedding(config=config)
+    batch_size, seq_len = 1, 10
+    qk_tensor = torch.randn(batch_size, seq_len, head_dim)
+    position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0)
+    theirs_cos, theirs_sin = rot_emb(qk_tensor, position_ids)
+
+    # our rope
+    ours_cos, ours_sin = build_rope_cache(seq_len, n_elem=head_dim, base=rope_theta, extra_config=our_rope_config)
+    ours_cos = ours_cos.unsqueeze(0)
+    ours_sin = ours_sin.unsqueeze(0)
+    torch.testing.assert_close(theirs_cos, ours_cos)
+    torch.testing.assert_close(theirs_sin, ours_sin)
+
+    ##################################
+    # Compare rotated tensors
+    ##################################
+    # Settings
+    num_heads = 4
+
+    # Dummy query and key tensors
+    torch.manual_seed(123)
+    queries = torch.randn(batch_size, num_heads, seq_len, head_dim)
+    keys = torch.randn(batch_size, num_heads, seq_len, head_dim)
+
+    ours_q_rot = apply_rope(queries, ours_cos, ours_sin)
+    ours_k_rot = apply_rope(keys, ours_cos, ours_sin)
+    theirs_q_rot, theirs_k_rot = apply_rotary_pos_emb(queries, keys, theirs_cos, theirs_sin)
+    torch.testing.assert_close(theirs_q_rot, ours_q_rot)
+    torch.testing.assert_close(theirs_k_rot, ours_k_rot)
+
+
 @torch.inference_mode()
 def test_rope_cos_sin_shapes_if_rope_n_elem_is_odd():
     bs, seq_len, n_head, n_embed = 1, 6, 2, 8
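
While the test stays skipped, the reference values it targets are easy to approximate by hand: the "linear" rope type is plain positional interpolation, i.e. positions are divided by the scaling factor before the cos/sin tables are built. Below is a rough standalone sketch of that computation, written for this note rather than taken from the HF or litgpt implementations; head_dim, rope_theta, and factor mirror the test's settings.

import torch

# Rough reference for the cos/sin tables compared in test_rope_gemma_3:
# "linear" RoPE scaling divides the positions by `factor`.
head_dim, rope_theta, factor, seq_len = 32, 50_000, 8.0, 10

inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
positions = torch.arange(seq_len).float() / factor  # linear scaling step

angles = torch.outer(positions, inv_freq)        # (seq_len, head_dim // 2)
emb = torch.cat((angles, angles), dim=-1)        # (seq_len, head_dim)
cos, sin = emb.cos(), emb.sin()
print(cos.shape, sin.shape)  # torch.Size([10, 32]) torch.Size([10, 32])

Once the TODO above is resolved, the test itself can be run with: pytest tests/test_rope.py -k test_rope_gemma_3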
