
Commit 376965e

[x86] Lower arithmetic vector reductions
1 parent 90da8ff commit 376965e
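
For context, the reductions in question are the @llvm.vector.reduce.* intrinsics. A minimal IR sketch (function names and values are illustrative, not taken from this commit):

; Illustrative only: an integer add reduction and a reassociable fadd reduction.
define i32 @sum_v4i32(<4 x i32> %v) {
  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
  ret i32 %r
}

define float @sum_v4f32(<4 x float> %v) {
  %r = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %v)
  ret float %r
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)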

File tree: 8 files changed, 697 additions and 409 deletions.

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 248 additions & 177 deletions
Large diffs are not rendered by default.

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 5 additions & 0 deletions
@@ -1058,6 +1058,11 @@ namespace llvm {
   /// functions.
   bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
                                           const MachineFunction &MF);
+
+  /// True if the target supports a fast implementation for the specific
+  /// operation and vector type combination.
+  bool isVectorReductionFast(const X86Subtarget &Subtarget, ISD::NodeType VectorOp, MVT VT);
+
 } // end namespace X86

 //===--------------------------------------------------------------------===//

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 29 additions & 9 deletions
@@ -6576,21 +6576,41 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
 }

 bool llvm::X86TTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+  ISD::NodeType OpCode;
   switch (II->getIntrinsicID()) {
-  default:
-    return true;
-
+  case Intrinsic::vector_reduce_add:
+    OpCode = ISD::VECREDUCE_ADD;
+    break;
+  case Intrinsic::vector_reduce_fadd:
+    OpCode = ISD::VECREDUCE_FADD;
+    break;
+  case Intrinsic::vector_reduce_mul:
+    OpCode = ISD::VECREDUCE_MUL;
+    break;
   case Intrinsic::vector_reduce_umin:
+    OpCode = ISD::VECREDUCE_UMIN;
+    break;
   case Intrinsic::vector_reduce_umax:
+    OpCode = ISD::VECREDUCE_UMAX;
+    break;
   case Intrinsic::vector_reduce_smin:
+    OpCode = ISD::VECREDUCE_SMIN;
+    break;
   case Intrinsic::vector_reduce_smax:
-    auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
-    auto SType = VType->getScalarType();
-    bool CanUsePHMINPOSUW = ST->hasSSE41() && II->getType() == SType &&
-                            (VType->getPrimitiveSizeInBits() % 128) == 0 &&
-                            (SType->isIntegerTy(8) || SType->isIntegerTy(16));
-    return !CanUsePHMINPOSUW;
+    OpCode = ISD::VECREDUCE_SMAX;
+    break;
+
+  default:
+    return true;
   }
+
+  auto *VType = dyn_cast<FixedVectorType>(
+      II->getOperand(II->getIntrinsicID() == Intrinsic::vector_reduce_fadd ? 1
+                                                                           : 0)
+          ->getType());
+  auto VT = EVT::getEVT(VType).getSimpleVT();
+
+  return !X86::isVectorReductionFast(*ST, OpCode, VT);
 }

 bool X86TTIImpl::prefersVectorizedAddressing() const {
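
The rewritten hook maps each supported reduction intrinsic to its ISD::VECREDUCE_* opcode and defers to X86::isVectorReductionFast for the opcode/type pair: returning false keeps the intrinsic intact so SelectionDAG can emit the fast lowering, while returning true asks the generic ExpandReductions pass to turn it into a shuffle-and-binop tree before instruction selection. Roughly, that expansion looks like the following hand-written sketch for a v4i32 add reduction (illustrative; the actual pass output may differ in detail):

define i32 @expanded_sum_v4i32(<4 x i32> %v) {
  ; halve: [a b c d] + [c d x x] -> [a+c b+d x x]
  %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %r1 = add <4 x i32> %v, %s1
  ; halve again: [a+c b+d x x] + [b+d x x x] -> [a+b+c+d x x x]
  %s2 = shufflevector <4 x i32> %r1, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r2 = add <4 x i32> %r1, %s2
  ; the scalar result is lane 0
  %res = extractelement <4 x i32> %r2, i32 0
  ret i32 %res
}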

llvm/test/CodeGen/X86/horizontal-sum.ll

Lines changed: 55 additions & 91 deletions
@@ -833,34 +833,34 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
-; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm3, %xmm5
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSSE3-FAST-NEXT: movaps %xmm2, %xmm6
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm2, %xmm7
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm8
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm1[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm9
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1]
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm10
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm0[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm11
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: addss %xmm11, %xmm0
+; SSSE3-FAST-NEXT: addss %xmm10, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
+; SSSE3-FAST-NEXT: addss %xmm9, %xmm1
+; SSSE3-FAST-NEXT: addss %xmm8, %xmm1
+; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
+; SSSE3-FAST-NEXT: addss %xmm7, %xmm2
+; SSSE3-FAST-NEXT: addss %xmm6, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm3
+; SSSE3-FAST-NEXT: addss %xmm5, %xmm3
 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
 ; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]

@@ -899,28 +899,28 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
 ; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
-; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm7 = xmm2[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm9 = xmm1[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm11 = xmm0[1,0]
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vaddss %xmm0, %xmm11, %xmm0
+; AVX-FAST-NEXT: vaddss %xmm0, %xmm10, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm1, %xmm9, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm1, %xmm8, %xmm1
 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm7, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm6, %xmm1, %xmm1
 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm5, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX-FAST-NEXT: retq
 %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)

@@ -964,21 +964,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSSE3-FAST-NEXT: addps %xmm4, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSSE3-FAST-NEXT: addps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:

@@ -1002,17 +990,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
 ; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
 ; AVX-FAST-NEXT: retq
 %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
 %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)

@@ -1051,17 +1031,9 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
-; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
-; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
+; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:

@@ -1089,17 +1061,9 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
 ; AVX-FAST-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
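
The SSSE3-FAST and AVX-FAST output above now uses plain horizontal adds for these multi-reduction patterns. A sketch of the integer variant being tested, based on the reduce calls visible in the surrounding IR (the insertelement packing of the four scalar results is an assumption, not copied from the test file):

define <4 x i32> @reduction_sum_v4i32_sketch(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
  ; one reduction per input vector ...
  %ra = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  %rb = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
  %rc = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %c)
  %rd = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %d)
  ; ... packed into one result vector
  %v0 = insertelement <4 x i32> poison, i32 %ra, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %rb, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %rc, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %rd, i32 3
  ret <4 x i32> %v3
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

With the new lowering, a function of this shape compiles to three phaddd instructions (haddps for the reassoc float variant), as the SSSE3-FAST and AVX-FAST checks above show.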

llvm/test/CodeGen/X86/optimize-reduction.ll

Lines changed: 1 addition & 0 deletions
@@ -89,6 +89,7 @@ define { i16, i16 } @test_reduce_v16i16_with_add(<16 x i16> %x, <16 x i16> %y) {
 ; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT: vphaddw %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
 ; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
 ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2

llvm/test/CodeGen/X86/vector-reduce-add-mask.ll

Lines changed: 28 additions & 14 deletions
@@ -1006,20 +1006,34 @@ define i16 @test_v16i16_v16i8(<16 x i16> %a0) {
 ; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: test_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
+; AVX1-SLOW-LABEL: test_v16i16_v16i8:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-SLOW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-SLOW-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v16i16_v16i8:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-FAST-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
 ;
 ; AVX2-LABEL: test_v16i16_v16i8:
 ; AVX2: # %bb.0:

llvm/test/CodeGen/X86/vector-reduce-add-zext.ll

Lines changed: 46 additions & 13 deletions
@@ -220,13 +220,21 @@ define i32 @test_v4i32(<4 x i8> %a0) {
 ; SSE41-NEXT: movd %xmm0, %eax
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: test_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4i32:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: retq
 ;
 ; AVX2-LABEL: test_v4i32:
 ; AVX2: # %bb.0:

@@ -257,12 +265,37 @@ define i32 @test_v8i32_v8i8(<8 x i8> %a0) {
 ; SSE-NEXT: movd %xmm1, %eax
 ; SSE-NEXT: retq
 ;
-; AVX-LABEL: test_v8i32_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v8i32_v8i8:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v8i32_v8i8:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-FAST-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v8i32_v8i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm0, %eax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_v8i32_v8i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: retq
 %1 = zext <8 x i8> %a0 to <8 x i32>
 %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
 ret i32 %2
