Commit 638a7d2

fix accuracy issue on reshape_and_cache by selecting correct index (#3307)
1 parent 91639fa commit 638a7d2

2 files changed: +33 -6 lines changed

csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp

Lines changed: 9 additions & 5 deletions
@@ -555,20 +555,24 @@ void reshape_and_cache_kernel(
   auto cache_strideN = key_cache.stride(0);
   auto cache_strideP = key_cache.stride(2);
   auto cache_strideH = key_cache.stride(1);
-  auto state_strideN = key.stride(0);
-  auto state_strideH = key.stride(1);
+  auto key_state_strideN = key.stride(0);
+  auto key_state_strideH = key.stride(1);
+  auto value_state_strideN = value.stride(0);
+  auto value_state_strideH = value.stride(1);
 #pragma omp parallel for collapse(2)
   for (auto ti = 0; ti < num_tokens; ti++) {
     for (auto hi = 0; hi < head_num; hi++) {
       auto physical_block_id = slot_mapping_ptr[ti] / block_size;
       auto block_offset = slot_mapping_ptr[ti] % block_size;
       auto cache_offset = physical_block_id * cache_strideN +
           block_offset * cache_strideP + hi * cache_strideH;
-      auto state_offset = ti * state_strideN + hi * state_strideH;
+      auto key_state_offset = ti * key_state_strideN + hi * key_state_strideH;
+      auto value_state_offset =
+          ti * value_state_strideN + hi * value_state_strideH;
       auto key_cache_start = key_cache_ptr + cache_offset;
-      auto key_ptr_start = key_ptr + state_offset;
+      auto key_ptr_start = key_ptr + key_state_offset;
       auto value_cache_start = value_cache_ptr + cache_offset;
-      auto value_ptr_start = value_ptr + state_offset;
+      auto value_ptr_start = value_ptr + value_state_offset;
       torch_ipex::cpu::kernel::move_ker<DST_T, SRC_T>(
           key_cache_start, key_ptr_start, head_size);
       torch_ipex::cpu::kernel::move_ker<DST_T, SRC_T>(
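
The deleted lines computed a single state_offset from key's strides and used it for both the key and the value source pointers, which is only correct when the two tensors share a memory layout. As a minimal sketch (illustrative sizes only, not part of this commit), the snippet below shows how key and value unbound from one packed qkv buffer can end up with different strides once one of them goes through a layout change, the case the separate key_state_offset and value_state_offset now handle:

    import torch

    # Hypothetical sizes for illustration only.
    num_token, num_head, head_size = 4, 8, 64
    qkv = torch.randn(num_token, 3, num_head, head_size)
    _, key, value = qkv.unbind(dim=1)   # strided views into qkv
    print(key.stride())                 # (1536, 64, 1)

    # Rebuild value in a [num_head, num_token, head_size]-contiguous layout and
    # view it back: same shape and values as before, but different strides.
    value = value.transpose(0, 1).contiguous().transpose(0, 1)
    print(value.stride())               # (64, 256, 1)

    # Indexing this value tensor with key's strides (the old shared state_offset)
    # reads the wrong elements; deriving each offset from its own tensor's strides
    # copies the intended rows into the cache.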

tests/cpu/test_paged_attention.py

Lines changed: 24 additions & 1 deletion
@@ -252,6 +252,8 @@ def _test_reshape_and_cache_func(
         num_blocks: int,
         dtype: torch.dtype,
         seed: int,
+        key_is_contiguous: bool,
+        value_is_contiguous: bool,
     ) -> None:
         random.seed(seed)
         torch.random.manual_seed(seed)
@@ -264,6 +266,13 @@ def _test_reshape_and_cache_func(
 
         qkv = torch.randn(num_token, 3, num_head, head_size, dtype=dtype, device="cpu")
         _, key, value = qkv.unbind(dim=1)
+        if key.shape[0] != 1:
+            if not key_is_contiguous:
+                key = key.transpose(0, 1).contiguous()
+                key = key.transpose(0, 1)
+            if not value_is_contiguous:
+                value = value.transpose(0, 1).contiguous()
+                value = value.transpose(0, 1)
         # Create the KV caches.
         key_caches, value_caches = self.create_kv_caches(
             num_blocks, block_size, 1, num_head, head_size, dtype, seed
@@ -300,6 +309,8 @@ def test_reshape_and_cache(self):
         head_sizes = [64, 80, 128, 96, 112, 128, 256]
         block_sizes = [16, 32]
         dtypes = [torch.bfloat16, torch.float]
+        key_modes = [True, False]
+        value_modes = [True, False]
         if core.onednn_has_fp16_support():
             dtypes.append(torch.float16)
         seeds = [0]
@@ -310,16 +321,28 @@ def test_reshape_and_cache(self):
             block_size,
             dtype,
             seed,
+            key_is_contiguous,
+            value_is_contiguous,
         ) in product(
             num_tokens,
             num_kv_heads,
             head_sizes,
             block_sizes,
             dtypes,
             seeds,
+            key_modes,
+            value_modes,
        ):
             self._test_reshape_and_cache_func(
-                num_token, num_kv_head, head_size, block_size, num_blocks, dtype, seed
+                num_token,
+                num_kv_head,
+                head_size,
+                block_size,
+                num_blocks,
+                dtype,
+                seed,
+                key_is_contiguous,
+                value_is_contiguous,
             )
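
The new key_modes and value_modes flags drive the transpose/contiguous/transpose round trip added above: it keeps a tensor's shape and values but changes its memory layout, so the parameterized test now covers every combination of matching and mismatching key/value layouts, including the mismatch that previously produced wrong cache contents. A minimal sketch of what that round trip does (standalone example, not taken from the test):

    import torch

    key = torch.randn(4, 8, 64)
    reordered = key.transpose(0, 1).contiguous().transpose(0, 1)
    assert reordered.shape == key.shape        # shape preserved
    assert reordered.stride() != key.stride()  # memory layout changed
    assert torch.equal(reordered, key)         # values unchanged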