sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs #12858
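For readers unfamiliar with the optimization the title refers to, below is a minimal, illustrative sketch of the idea behind a "reordered" Q4_0 buffer: the interleaved per-block layout (one scale plus 16 bytes of packed 4-bit quants) is split into two contiguous arrays so a matrix-vector kernel can read quants and scales with more coalesced accesses. The struct and function names are hypothetical stand-ins, not the backend's actual types; the real SYCL backend does the equivalent repacking on the device buffer and records it in the `optimized_feature.reorder` flag checked further down in the diff.

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical mirror of a Q4_0 block: 32 weights stored as one fp16 scale
// plus 16 bytes of packed 4-bit quants, interleaved block by block.
struct block_q4_0_like {
    uint16_t d;       // scale, fp16 bit pattern
    uint8_t  qs[16];  // 32 x 4-bit quantized weights
};

// "Reordering" regroups the buffer: all quant bytes first, then all scales,
// so neighbouring work-items read neighbouring bytes of the same array.
static void reorder_q4_0_like(const block_q4_0_like * blocks, size_t nblocks,
                              uint8_t * qs_out, uint16_t * d_out) {
    for (size_t i = 0; i < nblocks; ++i) {
        std::memcpy(qs_out + i * sizeof(blocks[i].qs), blocks[i].qs, sizeof(blocks[i].qs));
        d_out[i] = blocks[i].d;
    }
}

int main() {
    std::vector<block_q4_0_like> blocks(4);        // 4 blocks = 128 weights
    std::vector<uint8_t>  qs(blocks.size() * 16);  // contiguous quants
    std::vector<uint16_t> d(blocks.size());        // contiguous scales
    reorder_q4_0_like(blocks.data(), blocks.size(), qs.data(), d.data());
    return 0;
}
```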
@@ -2887,6 +2887,15 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     return false;
 }
 
+inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
@@ -2906,13 +2915,14 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     }
 }
 
-static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-
-    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
-    int64_t min_compute_capability = INT_MAX;
+static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                              ggml_tensor * dst) {
+    const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
+    int64_t min_compute_capability = INT_MAX;
 
     if (split) {
-        ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
+        ggml_backend_sycl_split_buffer_type_context * buft_ctx =
+            (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
         auto & tensor_split = buft_ctx->tensor_split;
         for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
             // skip devices that are not going to do any work:
@@ -2925,7 +2935,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
             }
         }
     } else {
-        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
+        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
     }
 
     // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -2947,9 +2957,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX
 
+    const bool reorder = static_cast<ggml_tensor_extra_gpu *>(dst->src[0]->extra) &&
+                         static_cast<ggml_tensor_extra_gpu *>(dst->src[0]->extra)->optimized_feature.reorder;
+
     // mmvq path is faster in the CUDA backend.
-    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
+        // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
+        // is enabled takes precedence over DMMV, the current if-else implementation
+        // requires disabling DMMV if both conditions are met
+        || (reorder && ggml_sycl_supports_reorder_mmvq(src0->type))) {
Review thread attached to this change:

- This PR is named for Intel GPUs.
- Your comment aligns with my suspicion that this change is obscure. This line switches the kernel from DMMV to MMVQ whenever reorder is enabled and supported, so it is no longer only for CUDA devices. I need to rethink how the dispatcher does the work.
- Yes, please ignore this comment.
- Another comment: this code works well for CUDA, not for Intel GPUs. I suggest removing this behavior.
- From what we have measured, the new MMVQ code path with the reorder optimization is faster on Intel devices as well (cf. the PR description), which is why we suggest enabling it by default now. Can you let us know if you find a model or device where this causes a performance regression?
- Driver and OS only impact the performance in general. I test it on Linux.
- Thanks for the clarification. It's hard to understand the issues you are finding because I don't fully know what you are testing. I'll try to replicate the results locally and, depending on the findings, see whether the PR has to be split or the dispatch can be slightly improved.
- I use the following command to test: ./examples/sycl/build.sh
- Yes, thanks for that. I was able to replicate the issue using an Arc 770 and I'm investigating the root cause, since it seems specific to the 7XX architecture.
- I've been trying to find and address all the issues discussed above. After rebasing on top of #13003 I've found no explicit difference between our MMVQ implementation, this PR's implementation, and CUDA's MMVQ implementation: for the same input we get identical outputs. I've also checked the output of the example scripts multiple times (with and without a set seed). So I think almost all comments are addressed for this PR. I still need to rebase to clean up the helper code @Rbiessy and I added, but it should be in a good state soon. As for the final question about correctness due to the code path change, I think it comes down to: do we want to switch from DMMV to MMVQ? CUDA, for example, has been using the Q8_1 + fma/dp4a approach, which may sacrifice some precision, but as discussed in #11972 that seems to be a conscious choice (that discussion addresses a different topic, but the underlying issue is the same). As for this PR, should we go with (yet another) environment variable? I'm going to advocate for MMVQ as the default path, since I've only seen the precision issues on the Arc 770 and the performance gain is noticeable for text generation. If we agree, I'd need to add an entry to the documentation. I'll continue running tests for other model outputs in the meantime.
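On the precision question raised in the thread (Q8_1 + fma/dp4a versus the dequantize-to-float DMMV path), the sketch below is a standalone illustration, not backend code: it shows how rounding the activation vector to 8-bit blocks before the dot product can shift the result slightly relative to using the float activations directly. The block size and rounding follow the common Q8-style convention (32 values sharing one scale); all names are illustrative.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Illustrative Q8-style block: 32 int8 values sharing one float scale.
struct q8_block_like {
    float  d;
    int8_t qs[32];
};

// Round a chunk of 32 floats to int8 with a single per-block scale.
static q8_block_like quantize_q8_like(const float * x) {
    float amax = 0.0f;
    for (int i = 0; i < 32; ++i) {
        amax = std::max(amax, std::fabs(x[i]));
    }
    q8_block_like b{};
    b.d = amax / 127.0f;
    const float id = b.d != 0.0f ? 1.0f / b.d : 0.0f;
    for (int i = 0; i < 32; ++i) {
        b.qs[i] = (int8_t) std::lround(x[i] * id);
    }
    return b;
}

int main() {
    float x[32], w[32];  // toy activation and (already dequantized) weight chunk
    for (int i = 0; i < 32; ++i) {
        x[i] = 0.011f * i - 0.17f;
        w[i] = 0.23f - 0.013f * i;
    }

    // DMMV-like accumulation: activations kept in floating point.
    double ref = 0.0;
    for (int i = 0; i < 32; ++i) {
        ref += (double) w[i] * x[i];
    }

    // MMVQ-like accumulation: activations first rounded to a Q8 block.
    // (The real kernels also keep the weights quantized and use integer dot
    // products such as dp4a; the extra rounding of src1 shown here is the
    // source of the small numerical difference discussed above.)
    const q8_block_like qx = quantize_q8_like(x);
    double approx = 0.0;
    for (int i = 0; i < 32; ++i) {
        approx += (double) w[i] * (qx.qs[i] * qx.d);
    }

    std::printf("float src1: %.8f  q8 src1: %.8f  |diff|: %.2e\n", ref, approx, std::fabs(ref - approx));
    return 0;
}
```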
         use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+    }
 
     if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // TODO: Refactor and cleanup of mul mat dispatching.
@@ -2968,14 +2986,17 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-        // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
+        constexpr bool convert_src1_to_q8_1 = false;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
     } else if (use_mul_mat_vec_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
     } else if (use_mul_mat_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
     } else {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+        constexpr bool convert_src1_to_q8_1 = false;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
     }
 }
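To make the resulting dispatch easier to follow outside of the diff context, here is a simplified sketch of the precedence it implements, ignoring the F16 and batched special cases that come first in the real function: when both DMMV and MMVQ are viable, MMVQ wins on the CUDA backend and also whenever the reordered layout is in use for a supported type; otherwise the DMMV → MMVQ → MMQ → generic order applies. The struct and function names are stand-ins, not the backend's real API.

```cpp
#include <cstdio>

enum class mul_mat_kernel { dmmv, mmvq, mmq, generic };

// Stand-ins for the flags and capability checks computed in ggml_sycl_mul_mat.
struct dispatch_state {
    bool use_dequantize_mul_mat_vec;  // DMMV applicable for these tensors
    bool use_mul_mat_vec_q;           // MMVQ applicable
    bool use_mul_mat_q;               // MMQ applicable
    bool reorder;                     // src0 carries the reorder optimization
    bool supports_reorder_mmvq;       // e.g. true for Q4_0 in this PR
    bool backend_is_cuda;             // running on the oneAPI CUDA backend
};

static mul_mat_kernel pick_mul_mat_kernel(dispatch_state s) {
    // Mirrors the diff: on CUDA, or when the reordered layout is usable by
    // MMVQ, DMMV is dropped in favour of MMVQ if both are viable.
    if (s.backend_is_cuda || (s.reorder && s.supports_reorder_mmvq)) {
        s.use_dequantize_mul_mat_vec = s.use_dequantize_mul_mat_vec && !s.use_mul_mat_vec_q;
    }
    if (s.use_dequantize_mul_mat_vec) { return mul_mat_kernel::dmmv; }
    if (s.use_mul_mat_vec_q)          { return mul_mat_kernel::mmvq; }
    if (s.use_mul_mat_q)              { return mul_mat_kernel::mmq;  }
    return mul_mat_kernel::generic;
}

int main() {
    // A reordered Q4_0 mat-vec on an Intel GPU: MMVQ is chosen over DMMV.
    dispatch_state s{ /*dmmv*/ true, /*mmvq*/ true, /*mmq*/ false,
                      /*reorder*/ true, /*supports*/ true, /*cuda*/ false };
    std::printf("kernel = %d\n", (int) pick_mul_mat_kernel(s));  // prints 1 (mmvq)
    return 0;
}
```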