@@ -833,34 +833,34 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
 ; SSSE3-FAST:       # %bb.0:
-; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
-; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
-; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSSE3-FAST-NEXT:    addss %xmm5, %xmm0
-; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm4
-; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm5
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSSE3-FAST-NEXT:    addss %xmm5, %xmm1
-; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm4
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-FAST-NEXT:    addss %xmm4, %xmm2
-; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
-; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm1
 ; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm4
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
+; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm5
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm6
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
+; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm7
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm8
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,3],xmm1[3,3]
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm9
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1]
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm10
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm10 = xmm10[3,3],xmm0[3,3]
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm11
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
+; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT:    addss %xmm11, %xmm0
+; SSSE3-FAST-NEXT:    addss %xmm10, %xmm0
+; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm1
+; SSSE3-FAST-NEXT:    addss %xmm9, %xmm1
+; SSSE3-FAST-NEXT:    addss %xmm8, %xmm1
+; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm2
+; SSSE3-FAST-NEXT:    addss %xmm7, %xmm2
+; SSSE3-FAST-NEXT:    addss %xmm6, %xmm2
+; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm3
+; SSSE3-FAST-NEXT:    addss %xmm5, %xmm3
 ; SSSE3-FAST-NEXT:    addss %xmm4, %xmm3
 ; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSSE3-FAST-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
@@ -899,28 +899,28 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
 ; AVX-FAST:       # %bb.0:
-; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm4
-; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm5 = xmm0[1,0]
-; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-FAST-NEXT:    vaddss %xmm0, %xmm4, %xmm0
-; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm4
-; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-FAST-NEXT:    vaddss %xmm1, %xmm4, %xmm1
+; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[3,3,3,3]
+; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3]
+; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm7 = xmm2[1,0]
+; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm9 = xmm1[1,0]
+; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm10 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm11 = xmm0[1,0]
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vaddss %xmm0, %xmm11, %xmm0
+; AVX-FAST-NEXT:    vaddss %xmm0, %xmm10, %xmm0
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vaddss %xmm1, %xmm9, %xmm1
+; AVX-FAST-NEXT:    vaddss %xmm1, %xmm8, %xmm1
 ; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
-; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX-FAST-NEXT:    vaddss %xmm4, %xmm1, %xmm1
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vaddss %xmm7, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vaddss %xmm6, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
-; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vaddss %xmm5, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vaddss %xmm4, %xmm1, %xmm1
 ; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX-FAST-NEXT:    retq
   %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
@@ -964,21 +964,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
 ; SSSE3-FAST:       # %bb.0:
-; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSSE3-FAST-NEXT:    addps %xmm4, %xmm0
-; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSSE3-FAST-NEXT:    addps %xmm1, %xmm4
-; SSSE3-FAST-NEXT:    haddps %xmm4, %xmm0
-; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSSE3-FAST-NEXT:    addps %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm2
-; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSSE3-FAST-NEXT:    addps %xmm3, %xmm2
-; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm2
+; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm0
 ; SSSE3-FAST-NEXT:    retq
 ;
 ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
@@ -1002,17 +990,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
 ; AVX-FAST:       # %bb.0:
-; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-FAST-NEXT:    vaddps %xmm4, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-FAST-NEXT:    vaddps %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-FAST-NEXT:    vaddps %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT:    vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT:    vaddps %xmm2, %xmm3, %xmm2
-; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
   %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
   %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
@@ -1051,17 +1031,9 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; SSSE3-FAST:       # %bb.0:
-; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSSE3-FAST-NEXT:    paddd %xmm4, %xmm0
-; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm4
-; SSSE3-FAST-NEXT:    phaddd %xmm4, %xmm0
-; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; SSSE3-FAST-NEXT:    paddd %xmm3, %xmm2
-; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-FAST-NEXT:    phaddd %xmm3, %xmm2
+; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    phaddd %xmm2, %xmm0
 ; SSSE3-FAST-NEXT:    retq
 ;
 ; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
@@ -1089,17 +1061,9 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; AVX-FAST:       # %bb.0:
-; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX-FAST-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; AVX-FAST-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vphaddd %xmm3, %xmm2, %xmm2
 ; AVX-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-FAST-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
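For reference, a minimal sketch of the IR shape the reassoc hunks above exercise. Only the %5 and %6 lines appear verbatim in this diff; the %7/%8 reductions and the insertelement tail are a hypothetical completion of the same pattern. Each lane is an independent reassoc fadd reduction, which is why the FAST run lines can now lower the whole function to three haddps/vhaddps (or phaddd/vphaddd for the i32 variant) instead of the shuffle-and-add chains being removed:

; Hypothetical completion: %5 and %6 are verbatim from the hunks above;
; %7, %8, and the insertelement tail are assumed to follow the same pattern.
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
  ; reduce each input vector to a scalar sum (reassoc permits horizontal-add lowering)
  %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  ; repack the four scalar sums into one <4 x float> result
  %9 = insertelement <4 x float> undef, float %5, i64 0
  %10 = insertelement <4 x float> %9, float %6, i64 1
  %11 = insertelement <4 x float> %10, float %7, i64 2
  %12 = insertelement <4 x float> %11, float %8, i64 3
  ret <4 x float> %12
}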