Open
Description
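The SLP vectorizer change in 88e7b8b (#125725) introduced a performance regression: a group of scalar `i8` phis that are fed by `extractelement`s of a single vector and reassembled with `insertelement`s is no longer vectorized into one `<4 x i8>` phi.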
Minimal reproducible example (LLVM IR):
; Reproducer input: a <4 x i8> value is split into its four lanes, each lane is
; carried through its own scalar phi, and the lanes are then reassembled into a
; <4 x i8> and bitcast back to i32.
target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
define ptx_kernel void @test() {
; Entry block (%0): split %vec into four i8 scalars.
%vec = bitcast i32 0 to <4 x i8>
%elem0 = extractelement <4 x i8> %vec, i64 0
%elem1 = extractelement <4 x i8> %vec, i64 1
%elem2 = extractelement <4 x i8> %vec, i64 2
%elem3 = extractelement <4 x i8> %vec, i64 3
br label %1
1: ; preds = %1, %0
; One scalar phi per lane: the extracted element on entry from %0, constant 0
; on the back edge from %1.
%.p0 = phi i8 [ %elem0, %0 ], [ 0, %1 ]
%.p1 = phi i8 [ %elem1, %0 ], [ 0, %1 ]
%.p2 = phi i8 [ %elem2, %0 ], [ 0, %1 ]
%.p3 = phi i8 [ %elem3, %0 ], [ 0, %1 ]
; Rebuild the vector from the four phi values, lane by lane.
%val0 = insertelement <4 x i8> poison, i8 %.p0, i64 0
%val1 = insertelement <4 x i8> %val0, i8 %.p1, i64 1
%val2 = insertelement <4 x i8> %val1, i8 %.p2, i64 2
%val3 = insertelement <4 x i8> %val2, i8 %.p3, i64 3
%val = bitcast <4 x i8> %val3 to i32
; Block %1 branches back to itself.
br label %1
}
SLP vectorizer output before the culprit commit (the four scalar phis are combined into a single `<4 x i8>` phi):
; Vectorized form: a single <4 x i8> phi takes %vec directly, with no
; extractelement or insertelement instructions.
target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
define ptx_kernel void @test() {
%vec = bitcast i32 0 to <4 x i8>
br label %1
1: ; preds = %1, %0
; Whole-vector phi: %vec on entry from %0, all-zero vector on the back edge.
%2 = phi <4 x i8> [ %vec, %0 ], [ zeroinitializer, %1 ]
%val = bitcast <4 x i8> %2 to i32
br label %1
}
https://godbolt.org/z/o1orj4GWh
SLP vectorizer output after the culprit commit (the IR is left unvectorized, identical to the input):
; Unvectorized form: the four extractelements, four scalar i8 phis and the
; insertelement chain are all still present.
target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
define ptx_kernel void @test() {
%vec = bitcast i32 0 to <4 x i8>
%elem0 = extractelement <4 x i8> %vec, i64 0
%elem1 = extractelement <4 x i8> %vec, i64 1
%elem2 = extractelement <4 x i8> %vec, i64 2
%elem3 = extractelement <4 x i8> %vec, i64 3
br label %1
1: ; preds = %1, %0
; Still one scalar phi per lane rather than a single <4 x i8> phi.
%.p0 = phi i8 [ %elem0, %0 ], [ 0, %1 ]
%.p1 = phi i8 [ %elem1, %0 ], [ 0, %1 ]
%.p2 = phi i8 [ %elem2, %0 ], [ 0, %1 ]
%.p3 = phi i8 [ %elem3, %0 ], [ 0, %1 ]
%val0 = insertelement <4 x i8> poison, i8 %.p0, i64 0
%val1 = insertelement <4 x i8> %val0, i8 %.p1, i64 1
%val2 = insertelement <4 x i8> %val1, i8 %.p2, i64 2
%val3 = insertelement <4 x i8> %val2, i8 %.p3, i64 3
%val = bitcast <4 x i8> %val3 to i32
br label %1
}
https://godbolt.org/z/ha797ePhW
Out of curiosity, I also tried reproducing at a more recent commit, 7ec60bf, and the performance regression is still present.
Credit to @metaflow for finding the culprit.