
Commit c12230b

Refine paged_attn with compile (#3641)
1 parent 44eb98a


7 files changed: 157 additions, 73 deletions

csrc/cpu/aten/PagedAttention.cpp

Lines changed: 10 additions & 7 deletions
@@ -13,7 +13,7 @@ IPEX_DEFINE_DISPATCH(flash_attn_var_len_kernel_stub);
 /*
  *Caculate the masked multihead attention for decoder layer in decoder only
  */
-void single_query_cached_kv_attention_forward_cpu(
+at::Tensor single_query_cached_kv_attention_forward_cpu(
     at::Tensor& out, // [num_seqs, num_heads, head_size]
     at::Tensor& query, // [num_seqs, num_heads, head_size]
     at::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size]
@@ -29,7 +29,7 @@ void single_query_cached_kv_attention_forward_cpu(
     const double k_scale,
     const double v_scale,
     const double softcap) {
-  return single_query_cached_kv_attention_kernel_stub(
+  single_query_cached_kv_attention_kernel_stub(
       kCPU,
       out,
       query,
@@ -46,9 +46,10 @@ void single_query_cached_kv_attention_forward_cpu(
       k_scale,
       v_scale,
       softcap);
+  return out;
 }

-void reshape_and_cache_cpu(
+std::tuple<at::Tensor, at::Tensor> reshape_and_cache_cpu(
     at::Tensor& key,
     at::Tensor& value,
     at::Tensor& key_cache,
@@ -57,7 +58,7 @@ void reshape_and_cache_cpu(
     const std::string& kv_cache_dtype,
     const double k_scale,
     const double v_scale) {
-  return reshape_and_cache_kernel_stub(
+  reshape_and_cache_kernel_stub(
       kCPU,
       key,
       value,
@@ -67,9 +68,10 @@ void reshape_and_cache_cpu(
       kv_cache_dtype,
       k_scale,
       v_scale);
+  return std::make_tuple(key_cache, value_cache);
 }

-void flash_attn_varlen_cpu(
+at::Tensor flash_attn_varlen_cpu(
     at::Tensor& out,
     at::Tensor& query,
     at::Tensor& key,
@@ -84,11 +86,11 @@ void flash_attn_varlen_cpu(
     const c10::optional<at::Tensor>& alibi_slopes,
     int64_t window_size_left,
     int64_t window_size_right,
-    const std::string& kv_cache_dtype,
+    const std::string_view& kv_cache_dtype,
     const double k_scale,
     const double v_scale,
     const double softcap) {
-  return flash_attn_var_len_kernel_stub(
+  flash_attn_var_len_kernel_stub(
       kCPU,
       out,
       query,
@@ -108,6 +110,7 @@ void flash_attn_varlen_cpu(
       k_scale,
       v_scale,
       softcap);
+  return out;
 }

 } // namespace cpu
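
Note: the wrappers above still write into caller-provided buffers; the change is that they now also return them (`return out;`, `return std::make_tuple(key_cache, value_cache);`), so each op produces a value that a captured graph can carry. A minimal Python sketch of the same calling convention, using an invented `demo::scaled_copy` op (not an IPEX op) registered through torch.library:

import torch

# Sketch only: the "demo" namespace, op name, and schema are assumptions made
# for illustration; IPEX registers its real ops in C++ under torch_ipex::*.
lib = torch.library.Library("demo", "DEF")
lib.define("scaled_copy(Tensor(a!) out, Tensor src, float alpha) -> Tensor(a!)")

def scaled_copy_cpu(out, src, alpha):
    out.copy_(src * alpha)  # kernel mutates the pre-allocated output buffer...
    return out              # ...and returns it, mirroring `return out;` above

def scaled_copy_meta(out, src, alpha):
    return out              # meta kernel: shape/dtype only, no computation

lib.impl("scaled_copy", scaled_copy_cpu, "CPU")
lib.impl("scaled_copy", scaled_copy_meta, "Meta")

out = torch.empty(4)
result = torch.ops.demo.scaled_copy(out, torch.ones(4), 2.0)  # returns `out`

The Meta registration is what lets shape propagation proceed without running the CPU kernel; the `_meta_registrations.py` changes further down do the analogous thing for the real `torch_ipex` ops.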

csrc/cpu/aten/PagedAttention.h

Lines changed: 7 additions & 6 deletions
@@ -8,7 +8,7 @@ namespace cpu {

 namespace {

-void single_query_cached_kv_attention(
+at::Tensor single_query_cached_kv_attention_forward_cpu(
     at::Tensor& out, // [num_seqs, num_heads, head_size]
     at::Tensor& query, // [num_seqs, num_heads, head_size]
     at::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size]
@@ -24,9 +24,8 @@ void single_query_cached_kv_attention(
     const double k_scale,
     const double v_scale,
     const double softcap);
-}

-void reshape_and_cache(
+std::tuple<at::Tensor, at::Tensor> reshape_and_cache_cpu(
     at::Tensor& key,
     at::Tensor& value,
     at::Tensor& key_cache,
@@ -36,7 +35,7 @@ void reshape_and_cache(
     const double k_scale,
     const double v_scale);

-void flash_attn_varlen(
+at::Tensor flash_attn_varlen_cpu(
     at::Tensor& out,
     at::Tensor& query,
     at::Tensor& key,
@@ -51,11 +50,13 @@ void flash_attn_varlen(
     const c10::optional<at::Tensor>& alibi_slopes,
     int64_t window_size_left,
     int64_t window_size_right,
-    const std::string& kv_cache_dtype,
+    const std::string_view& kv_cache_dtype,
     const double k_scale,
     const double v_scale,
     const double softcap);

+} // namespace
+
 using single_query_cached_kv_attention_fn = void (*)(
     at::Tensor& out, // [num_seqs, num_heads, head_size]
     at::Tensor& query, // [num_seqs, num_heads, head_size]
@@ -98,7 +99,7 @@ using flash_attn_var_len_fn = void (*)(
     const c10::optional<at::Tensor>& alibi_slopes,
     int64_t window_size_left,
     int64_t window_size_right,
-    const std::string& kv_cache_dtype,
+    const std::string_view& kv_cache_dtype,
     const double k_scale,
     const double v_scale,
     const double softcap);

csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp

Lines changed: 1 addition & 1 deletion
@@ -2016,7 +2016,7 @@ void flash_attn_varlen_cpu_kernel_impl(
     const c10::optional<at::Tensor>& alibi_slopes,
     int64_t window_size_left,
     int64_t window_size_right,
-    const std::string& kv_cache_dtype,
+    const std::string_view& kv_cache_dtype,
     const double k_scale,
     const double v_scale,
     const double softcap) {

intel_extension_for_pytorch/_meta_registrations.py

Lines changed: 28 additions & 3 deletions
@@ -132,9 +132,9 @@ def is_channels_last_3d(ten):

 @register_meta("reshape_and_cache")
 def meta_reshape_and_cache(
-    key, value, key_cache, value_cache, slot_mapping, k_scale, v_scale
+    key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, k_scale, v_scale
 ):
-    return None
+    return key_cache, value_cache


 @register_meta("single_query_cached_kv_attention")
@@ -153,8 +153,33 @@ def meta_single_query_cached_kv_attention(
     window_size,
     k_scale,
     v_scale,
+    softcap,
 ):
-    return None
+    return output
+
+
+@register_meta("flash_attn_varlen_func")
+def meta_flash_attn_varlen_func(
+    output,
+    query,
+    k_cache,
+    v_cache,
+    cu_seq_lens_q,
+    cu_seq_lens_kv,
+    max_seq_len_q,
+    max_seq_len_kv,
+    scale,
+    is_causal,
+    block_table,
+    alibi_slopes,
+    window_size_left,
+    window_size_right,
+    kv_cache_dtype,
+    k_scale,
+    v_scale,
+    softcap,
+):
+    return output


 @register_meta("convolution_forward")

intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py

Lines changed: 4 additions & 4 deletions
@@ -368,7 +368,7 @@ def reshape_and_cache(
         elif kv_cache_dtype != "auto":
             raise TypeError("unsupported kv_cache_dtype")

-        torch.ops.torch_ipex.reshape_and_cache(
+        return torch.ops.torch_ipex.reshape_and_cache(
             key,
             value,
             key_cache,
@@ -391,7 +391,7 @@ def reshape_and_cache_flash(
         k_scale=1.0,
         v_scale=1.0,
     ):
-        torch.ops.torch_ipex.reshape_and_cache(
+        return torch.ops.torch_ipex.reshape_and_cache(
             key,
             value,
             key_cache,
@@ -421,7 +421,7 @@ def single_query_cached_kv_attention(
         v_scale=1.0,
         softcap=-1.0,
     ):
-        torch.ops.torch_ipex.single_query_cached_kv_attention(
+        return torch.ops.torch_ipex.single_query_cached_kv_attention(
             output,
             query,
             key_cache,
@@ -469,7 +469,7 @@ def flash_attn_varlen_func(
             raise TypeError("only float8_e5m2 supported")
         elif kv_cache_dtype != "auto":
             raise TypeError("unsupported kv_cache_dtype")
-        torch.ops.torch_ipex.flash_attn_varlen_func(
+        return torch.ops.torch_ipex.flash_attn_varlen_func(
             output,
             query,
             k_cache,
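
Note: the only change to these fusion wrappers is that each now returns the value produced by the underlying `torch_ipex` op instead of discarding it, matching the new op signatures. A trivial sketch of the pattern, with a stock out-variant op (`torch.add`) standing in for the extension op:

import torch

def out_variant_wrapper(out, a, b):
    # Before: torch.add(a, b, out=out)   (called only for the side effect)
    # After: return the op's result so callers and compiled graphs see a value.
    return torch.add(a, b, out=out)

compiled = torch.compile(out_variant_wrapper)
buf = torch.empty(3)
print(compiled(buf, torch.ones(3), torch.ones(3)))  # tensor([2., 2., 2.])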

tests/cpu/test_flash_attention_varlen.py

Lines changed: 47 additions & 23 deletions
@@ -76,7 +76,7 @@ def mha_ref(q, k, v, scale, is_causal, window_size, softcap):


 class TestFlashAttnVarLen(TestCase):
-    @torch.inference_mode()
+    @torch.no_grad()
     def _test_flash_attn_varlen(
         self,
         num_heads: int,
@@ -86,6 +86,7 @@ def _test_flash_attn_varlen(
         is_causal: bool,
         dtype: torch.dtype,
         softcap: float,
+        is_compile: bool,
     ) -> None:
         random.seed(0)
         torch.manual_seed(0)
@@ -163,7 +164,11 @@ def _test_flash_attn_varlen(
             output_ref[cu_seq_lens_q[i] : cu_seq_lens_q[i + 1]] = output_i

         output = torch.empty_like(query)
-        ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
+        if is_compile:
+            f_c = torch.compile(ipex.llm.modules.PagedAttention.flash_attn_varlen_func)
+        else:
+            f_c = ipex.llm.modules.PagedAttention.flash_attn_varlen_func
+        f_c(
             output,
             query,
             k_cache,
@@ -185,7 +190,7 @@ def _test_flash_attn_varlen(
             output_ref, output, atol=1e-6 if dtype == torch.float else 5e-2
         )

-    @torch.inference_mode()
+    @torch.no_grad()
     def _test_flash_attn_varlen_fp8(
         self,
         num_heads: int,
@@ -195,6 +200,7 @@ def _test_flash_attn_varlen_fp8(
         is_causal: bool,
         dtype: torch.dtype,
         softcap: float,
+        is_compile: bool,
     ) -> None:
         random.seed(0)
         torch.manual_seed(0)
@@ -255,6 +261,7 @@ def _test_flash_attn_varlen_fp8(
         scale = float(1.0 / (head_size**0.5))

         output_ref = torch.empty_like(query)
+
         ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
             output_ref,
             query,
@@ -272,9 +279,12 @@ def _test_flash_attn_varlen_fp8(
             window_size[1],
             softcap=softcap,
         )
-
+        if is_compile:
+            f_c = torch.compile(ipex.llm.modules.PagedAttention.flash_attn_varlen_func)
+        else:
+            f_c = ipex.llm.modules.PagedAttention.flash_attn_varlen_func
         output = torch.empty_like(query)
-        ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
+        f_c(
             output,
             query,
             k_cache.to(torch.float8_e5m2),
@@ -297,6 +307,9 @@ def _test_flash_attn_varlen_fp8(
         )

     def test_flash_attn_varlen(self):
+        COMPILE_TEST = (
+            1  # test torch.compile function for once, avoiding recompile in CI
+        )
         for (
             num_heads,
             num_queries_per_kv,
@@ -314,17 +327,24 @@ def test_flash_attn_varlen(self):
             DTYPES,
             SOFTCAP,
         ):
-            self._test_flash_attn_varlen(
-                num_heads,
-                num_queries_per_kv,
-                head_size,
-                window_size,
-                is_causal,
-                dtype,
-                softcap,
-            )
+            COMPILE = [True, False] if COMPILE_TEST == 1 else [False]
+            COMPILE_TEST = COMPILE_TEST - 1
+            for is_compile in COMPILE:
+                self._test_flash_attn_varlen(
+                    num_heads,
+                    num_queries_per_kv,
+                    head_size,
+                    window_size,
+                    is_causal,
+                    dtype,
+                    softcap,
+                    is_compile,
+                )

     def test_flash_attn_varlen_fp8(self):
+        COMPILE_TEST = (
+            1  # test torch.compile function for once, avoiding recompile in CI
+        )
         for (
             num_heads,
             num_queries_per_kv,
@@ -342,15 +362,19 @@ def test_flash_attn_varlen_fp8(self):
             [torch.float, torch.bfloat16],
             SOFTCAP,
         ):
-            self._test_flash_attn_varlen_fp8(
-                num_heads,
-                num_queries_per_kv,
-                head_size,
-                window_size,
-                is_causal,
-                dtype,
-                softcap,
-            )
+            COMPILE = [True, False] if COMPILE_TEST == 1 else [False]
+            COMPILE_TEST = COMPILE_TEST - 1
+            for is_compile in COMPILE:
+                self._test_flash_attn_varlen_fp8(
+                    num_heads,
+                    num_queries_per_kv,
+                    head_size,
+                    window_size,
+                    is_causal,
+                    dtype,
+                    softcap,
+                    is_compile,
+                )


 if __name__ == "__main__":
