sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs #12858


Merged
17 commits merged on May 9, 2025
Changes from 2 commits
17 changes: 9 additions & 8 deletions ggml/src/ggml-sycl/backend.hpp
@@ -13,23 +13,24 @@
 #ifndef GGML_SYCL_BACKEND_HPP
 #define GGML_SYCL_BACKEND_HPP
 
-#include "concat.hpp"
 #include "common.hpp"
+#include "concat.hpp"
 #include "conv.hpp"
 #include "convert.hpp"
+#include "cpy.hpp"
 #include "dequantize.hpp"
 #include "dmmv.hpp"
+#include "element_wise.hpp"
+#include "gla.hpp"
+#include "im2col.hpp"
 #include "mmq.hpp"
 #include "mmvq.hpp"
-#include "rope.hpp"
 #include "norm.hpp"
+#include "outprod.hpp"
+#include "quants.hpp"
+#include "rope.hpp"
 #include "softmax.hpp"
 #include "tsembd.hpp"
-#include "im2col.hpp"
 #include "wkv.hpp"
-#include "outprod.hpp"
-#include "element_wise.hpp"
-#include "cpy.hpp"
-#include "gla.hpp"
 
-#endif // GGML_SYCL_BACKEND_HPP
+#endif // GGML_SYCL_BACKEND_HPP
5 changes: 5 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
@@ -776,4 +776,9 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
 }
 
 bool gpu_has_xmx(sycl::device &dev);
+
+constexpr size_t safe_div(const size_t m, const size_t n) {
+    return (m + n - 1) / n;
+}
+
 #endif // GGML_SYCL_COMMON_HPP
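The safe_div helper added above is ceiling division. A minimal standalone sketch of the typical use, assuming a made-up element count and work-group size (neither value is from this PR):

#include <cstddef>
#include <cstdio>

constexpr size_t safe_div(const size_t m, const size_t n) {
    return (m + n - 1) / n;
}

int main() {
    const size_t n_elements = 1000;  // hypothetical problem size
    const size_t wg_size    = 256;   // hypothetical work-group size
    // Plain integer division (1000 / 256 == 3) would drop the tail elements;
    // safe_div rounds up so all 1000 elements are covered by 4 work-groups.
    std::printf("%zu\n", safe_div(n_elements, wg_size));  // prints 4
    return 0;
}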
45 changes: 33 additions & 12 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2887,6 +2887,15 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     return false;
 }
 
+inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -2906,13 +2915,14 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
-static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
-    int64_t min_compute_capability = INT_MAX;
+static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                              ggml_tensor * dst) {
+    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
+    int64_t min_compute_capability = INT_MAX;
 
     if (split) {
-        ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
+        ggml_backend_sycl_split_buffer_type_context * buft_ctx =
+            (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
         for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
             // skip devices that are not going to do any work:
@@ -2925,7 +2935,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
             }
         }
     } else {
-        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
+        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -2947,9 +2957,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX
 
+    const bool reorder = static_cast<ggml_tensor_extra_gpu *>(dst->src[0]->extra) &&
+                         static_cast<ggml_tensor_extra_gpu *>(dst->src[0]->extra)->optimized_feature.reorder;
+
     // mmvq path is faster in the CUDA backend.
-    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
+        // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
+        // is enabled takes precedence over DMMV, the current if-else implementation
+        // requires disabling DMMV if both conditions are met
+        || (reorder && ggml_sycl_supports_reorder_mmvq(src0->type))) {
Collaborator:

This PR is named for Intel GPUs, so why change the code for CUDA? In fact, the src0 reorder won't happen for non-Intel GPUs, so this code has no impact. I suggest removing it.

Collaborator (Author):

Your comment aligns with my suspicion that this change is obscure. This line switches the kernel from DMMV to MMVQ when the reorder is enabled and supported, so it is no longer only for CUDA devices.

I need to rethink how the dispatcher does this work.

Collaborator:

Yes, please ignore this comment.

Collaborator:

Another comment: the reorder behavior impacts the code path in this PR: use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;

This code works well for CUDA but not for Intel GPUs; that is why it was limited to the CUDA backend only. Some cases (models) will benefit from it, while others will get worse on Intel GPUs.

I suggest removing this behavior: only optimize the OPs by reorder, without changing the code path.

Collaborator:

From what we have measured, the new MMVQ code path with the reorder optimization is faster on Intel devices as well (cf. the PR description). Can you let us know if you find a model or device where this causes a performance regression? That's why we suggest enabling it by default now.

Collaborator (@NeoZhangJianyu, Apr 22, 2025):

Driver and OS only impact the performance in general. You must also consider the shape of the tensors. For example, some code works well for 32 * n but badly for 24 * n.

I tested it on Linux.

Collaborator (Author):

Thanks for the clarification. It's hard to understand the issues you are finding because I don't fully know what you are testing. I'll try to replicate the results locally and, depending on the findings, see whether the PR has to be split or the dispatch can be slightly improved.

Collaborator (@NeoZhangJianyu, Apr 23, 2025):

I use the following commands to test:

./examples/sycl/build.sh
./examples/sycl/run-llama2.sh

Collaborator (Author):

Yes, thanks for that. I was able to replicate the issue using an Arc 770 and I'm investigating the root cause, since it seems specific to the 7XX architecture.

Collaborator (Author, @Alcpz, Apr 28, 2025):

I've been trying to find and address all the issues discussed above. After rebasing on top of #13003, I've found no explicit difference between our MMVQ implementation, this PR's implementation, and CUDA's MMVQ implementation: for the same input we get identical outputs.

I've also checked the output of the example scripts multiple times (with and without a set seed):

sampler seed: 0
sampler params:
        repeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000
        dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 4096
        top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, top_n_sigma = -1.000, temp = 0.800
        mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist
generate: n_ctx = 4096, n_batch = 2048, n_predict = 400, n_keep = 1

 Building a website can be done in 10 simple steps:
Step 1: Get domain and hosting
Step 2: Choose a theme
Step 3: Choose your colors
Step 4: Build your homepage
Step 5: Build your pages
Step 6: Build your blog
Step 7: Build your contact page
Step 8: Build your about page
Step 9: Add social media icons
Step 10: Add some copy
How much does it cost to build a website?
Is it easy to create a website?
What are the benefits of building a website?
How can you create a website for free?
There are many different ways to build a website, and the best way for you depends on your goals, budget, and expertise. However, there are some basic steps you can take to get started.
The first step is to choose a domain name and hosting plan. A domain name is your website’s address (e.g., www.example.com), while hosting is where your website files live on the internet. You’ll need to purchase both of these from a third-party provider.
Once you’ve got your domain and hosting, you’ll need to choose a website builder. A website builder is a platform that allows you to create and edit your website without having to know how to code. There are many different website builders to choose from, each with their own set of features and pricing plans.
Once you’ve chosen a website builder, the next step is to choose a theme. A theme is the look and feel of your website. There are many different themes to choose from, and each one will have its own set of features. You can usually find a demo of a theme to help you decide if it’s the right fit for your website.
After you’ve chosen a theme, it’s time to choose your colors. Colors can have a big impact on your website’s look and

So I think almost all comments for this PR are addressed. I still need to rebase to clean up the helper code that @Rbiessy and I added, but it should be in a good state soon.

As for the final question about correctness due to the code path change, I think it's a question of:

Do we want to switch from DMMV to MMVQ?

CUDA, for example, seems to have been using the Q8_1 + fma/dp4a approach, which may sacrifice some precision, but as discussed in #11972 that appears to be a conscious choice (that discussion addresses a different topic, but the underlying issue is the same).
We can discuss what to do with the expected code path in the SYCL discussion.

As for this PR, should we go with (yet another) environment variable? I'm going to advocate for MMVQ as the default path, since I've only seen the precision issues on the Arc 770 and the performance gain is noticeable for text generation. If we agree on that, I'd need to add an entry to the documentation.

I'll continue running tests for other model outputs in the meantime.
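For context on the Q8_1 + fma/dp4a trade-off mentioned above, here is a rough scalar sketch (not the actual ggml kernels) contrasting a DMMV-style dequantize-then-fma accumulation with an MMVQ-style Q8_1 quantization plus integer dot product. The toy_q4_block struct and both function names are hypothetical and heavily simplified; real Q4_0 blocks pack 32 nibble weights per scale.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Toy block: one float scale and four already-unpacked 4-bit weights in [-8, 7].
struct toy_q4_block { float d; int8_t q[4]; };

// DMMV-style: dequantize each weight to float, then accumulate with fma.
float dot_dmmv(const toy_q4_block & w, const float * x) {
    float acc = 0.0f;
    for (int i = 0; i < 4; ++i) {
        acc = std::fma(w.d * w.q[i], x[i], acc);
    }
    return acc;
}

// MMVQ-style: quantize the activations to int8 with one shared scale (Q8_1-like),
// accumulate an integer dot product (what dp4a does in hardware), scale once at the end.
float dot_mmvq(const toy_q4_block & w, const float * x) {
    float amax = 0.0f;
    for (int i = 0; i < 4; ++i) {
        amax = std::fmax(amax, std::fabs(x[i]));
    }
    const float d_x = amax > 0.0f ? amax / 127.0f : 1.0f;
    int32_t acc = 0;
    for (int i = 0; i < 4; ++i) {
        const int32_t xq = (int32_t) std::lround(x[i] / d_x);  // activation rounded to 8 bits
        acc += (int32_t) w.q[i] * xq;
    }
    return w.d * d_x * (float) acc;
}

int main() {
    const toy_q4_block w = { 0.05f, { 3, -7, 1, 5 } };
    const float x[4] = { 0.11f, -0.42f, 0.33f, 0.07f };
    // The two paths agree closely but not bit-for-bit: MMVQ rounds x to 8 bits first,
    // which is the precision cost traded for the faster integer dot product.
    std::printf("dmmv: %f\nmmvq: %f\n", dot_dmmv(w, x), dot_mmvq(w, x));
    return 0;
}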

         use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+    }
 
     if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // TODO: Refactor and cleanup of mul mat dispatching.
@@ -2968,14 +2986,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-        // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
+        constexpr bool convert_src1_to_q8_1 = false;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
     } else if (use_mul_mat_vec_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
     } else if (use_mul_mat_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
     } else {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+        constexpr bool convert_src1_to_q8_1 = false;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
     }
 }
 