Skip to content

Commit 22a323e

Browse files
authored
[AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant (#71035)
For a multiply by a constant of the form 2^k + 1 (for example 5 = 2^2 + 1), instead of `v_mul_lo_u32 v0, v0, 5` we should generate `v_lshl_add_u32 v0, v0, 2, v0`.
1 parent abc27bd commit 22a323e

File tree

5 files changed

+66
-13
lines changed

5 files changed

+66
-13
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,16 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
515515
let HasExtVOP3DPP = 0;
516516
}
517517

518+
// Pattern leaf that accepts an i32 immediate V when isPowerOf2_32(V - 1)
// holds, i.e. a constant that is one more than a power of two.
def IsPow2Plus1: PatLeaf<(i32 imm), [{
519+
// The immediate is narrowed to uint32_t first, so V - 1 is evaluated in
// unsigned 32-bit arithmetic (V == 0 wraps around to 0xFFFFFFFF).
uint32_t V = N->getZExtValue();
520+
return isPowerOf2_32(V - 1);
521+
}]>;
522+
523+
// Immediate transform that replaces an immediate V with an i32 target
// constant holding Log2_32(V - 1). Note that, despite the name, the result
// is the log2 of (V - 1), not of V itself.
def Log2_32: SDNodeXForm<imm, [{
524+
// Narrow to 32 bits so that V - 1 is computed as an unsigned 32-bit value.
uint32_t V = N->getZExtValue();
525+
return CurDAG->getTargetConstant(Log2_32(V - 1), SDLoc(N), MVT::i32);
526+
}]>;
527+
518528
let SubtargetPredicate = isGFX9Plus in {
519529
let isCommutable = 1, isReMaterializable = 1 in {
520530
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -612,6 +622,10 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
612622
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
613623
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
614624

625+
// Select an i32 mul (wrapped in DivergentBinFrag) whose second operand is an
// IsPow2Plus1 immediate as V_LSHL_ADD_U32_e64 with operands
// ($src0, Log2_32-transformed $src1, $src0). Per the commit description,
// a multiply by 5 becomes "v_lshl_add_u32 v0, v0, 2, v0".
def : GCNPat<
626+
(DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
627+
(V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
628+
615629
let SubtargetPredicate = isGFX940Plus in
616630
def : GCNPat<
617631
(ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),

llvm/test/CodeGen/AMDGPU/mul.ll

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2644,6 +2644,45 @@ entry:
26442644
ret void
26452645
}
26462646

2647+
; Multiply by 9 (8 + 1). The GFX9, GFX10 and GFX11 check blocks below expect
; v_lshl_add_u32 with a shift amount of 3, while the SI and VI check blocks
; still expect v_mul_lo_u32 by 9.
define i32 @mul_pow2_plus_1(i32 %val) {
2648+
; SI-LABEL: mul_pow2_plus_1:
2649+
; SI: ; %bb.0:
2650+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2651+
; SI-NEXT: v_mul_lo_u32 v0, v0, 9
2652+
; SI-NEXT: s_setpc_b64 s[30:31]
2653+
;
2654+
; VI-LABEL: mul_pow2_plus_1:
2655+
; VI: ; %bb.0:
2656+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2657+
; VI-NEXT: v_mul_lo_u32 v0, v0, 9
2658+
; VI-NEXT: s_setpc_b64 s[30:31]
2659+
;
2660+
; GFX9-LABEL: mul_pow2_plus_1:
2661+
; GFX9: ; %bb.0:
2662+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2663+
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 3, v0
2664+
; GFX9-NEXT: s_setpc_b64 s[30:31]
2665+
;
2666+
; GFX10-LABEL: mul_pow2_plus_1:
2667+
; GFX10: ; %bb.0:
2668+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2669+
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v0
2670+
; GFX10-NEXT: s_setpc_b64 s[30:31]
2671+
;
2672+
; GFX11-LABEL: mul_pow2_plus_1:
2673+
; GFX11: ; %bb.0:
2674+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2675+
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 3, v0
2676+
; GFX11-NEXT: s_setpc_b64 s[30:31]
2677+
;
2678+
; EG-LABEL: mul_pow2_plus_1:
2679+
; EG: ; %bb.0:
2680+
; EG-NEXT: CF_END
2681+
; EG-NEXT: PAD
2682+
%mul = mul i32 %val, 9
2683+
ret i32 %mul
2684+
}
2685+
26472686
declare i32 @llvm.amdgcn.workitem.id.x() #1
26482687

26492688
attributes #0 = { nounwind }

llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,9 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
141141
; SI-NEXT: bb.2.Flow:
142142
; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
143143
; SI-NEXT: {{ $}}
144-
; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4
145-
; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4
146-
; SI-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4
144+
; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %31:vgpr_32, %bb.1, %10, %bb.4
145+
; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %9, %bb.4
146+
; SI-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %34:vgpr_32, %bb.4
147147
; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
148148
; SI-NEXT: S_BRANCH %bb.3
149149
; SI-NEXT: {{ $}}
@@ -158,7 +158,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
158158
; SI-NEXT: successors: %bb.2(0x80000000)
159159
; SI-NEXT: {{ $}}
160160
; SI-NEXT: [[V_MUL_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_MUL_F32_e64 0, [[COPY2]], 0, [[PHI1]], 0, 0, implicit $mode, implicit $exec
161-
; SI-NEXT: [[V_MUL_LO_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32_e64 killed [[PHI1]], 3, implicit $exec
161+
; SI-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 killed [[PHI1]], 1, [[PHI1]], implicit $exec
162162
; SI-NEXT: S_BRANCH %bb.2
163163
; SI-NEXT: {{ $}}
164164
; SI-NEXT: bb.5.if.end:

llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -92,32 +92,32 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
9292
; SI-NEXT: .LBB2_1: ; %if.end
9393
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
9494
; SI-NEXT: s_or_b32 exec_lo, exec_lo, s2
95-
; SI-NEXT: v_add_nc_u32_e32 v2, 1, v0
95+
; SI-NEXT: v_add_nc_u32_e32 v2, 1, v3
9696
; SI-NEXT: s_add_i32 s1, s1, 1
9797
; SI-NEXT: s_cmp_lt_i32 s1, s0
9898
; SI-NEXT: s_cbranch_scc0 .LBB2_6
9999
; SI-NEXT: .LBB2_2: ; %for.body
100100
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
101-
; SI-NEXT: ; implicit-def: $vgpr0
102101
; SI-NEXT: ; implicit-def: $vgpr3
102+
; SI-NEXT: ; implicit-def: $vgpr0
103103
; SI-NEXT: s_and_saveexec_b32 s2, vcc_lo
104104
; SI-NEXT: s_xor_b32 s2, exec_lo, s2
105105
; SI-NEXT: ; %bb.3: ; %else
106106
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
107-
; SI-NEXT: v_mul_lo_u32 v0, v2, 3
108-
; SI-NEXT: v_mul_f32_e32 v3, v1, v2
107+
; SI-NEXT: v_mul_f32_e32 v0, v1, v2
108+
; SI-NEXT: v_lshl_add_u32 v3, v2, 1, v2
109109
; SI-NEXT: ; implicit-def: $vgpr2
110110
; SI-NEXT: ; %bb.4: ; %Flow
111111
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
112112
; SI-NEXT: s_andn2_saveexec_b32 s2, s2
113113
; SI-NEXT: s_cbranch_execz .LBB2_1
114114
; SI-NEXT: ; %bb.5: ; %if
115115
; SI-NEXT: ; in Loop: Header=BB2_2 Depth=1
116-
; SI-NEXT: v_mul_f32_e32 v3, s1, v1
117-
; SI-NEXT: v_add_nc_u32_e32 v0, 1, v2
116+
; SI-NEXT: v_mul_f32_e32 v0, s1, v1
117+
; SI-NEXT: v_add_nc_u32_e32 v3, 1, v2
118118
; SI-NEXT: s_branch .LBB2_1
119119
; SI-NEXT: .LBB2_6: ; %for.end
120-
; SI-NEXT: v_add_f32_e32 v0, v0, v3
120+
; SI-NEXT: v_add_f32_e32 v0, v3, v0
121121
; SI-NEXT: ; return to shader part epilog
122122
entry:
123123
; %break = icmp sgt i32 %bound, 0

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,7 +1536,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
15361536
; GFX9-W64-NEXT: ; %bb.2: ; %Flow
15371537
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
15381538
; GFX9-W64-NEXT: ; %bb.3: ; %IF
1539-
; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3
1539+
; GFX9-W64-NEXT: v_lshl_add_u32 v0, v5, 1, v5
15401540
; GFX9-W64-NEXT: ; %bb.4: ; %END
15411541
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
15421542
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
@@ -1566,7 +1566,7 @@ define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i3
15661566
; GFX10-W32-NEXT: ; %bb.2: ; %Flow
15671567
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
15681568
; GFX10-W32-NEXT: ; %bb.3: ; %IF
1569-
; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3
1569+
; GFX10-W32-NEXT: v_lshl_add_u32 v0, v5, 1, v5
15701570
; GFX10-W32-NEXT: ; %bb.4: ; %END
15711571
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
15721572
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12

0 commit comments

Comments
 (0)