Commit a850dbc

[AArch64] Sink vscale calls into loops for better isel (#70304)
For more recent SVE-capable CPUs it is beneficial to use the inc* instructions to increment a value by vscale (potentially shifted or multiplied), even in short loops. This patch tells CodeGenPrepare to sink appropriate vscale calls into the blocks where they are used, so that isel can match them.
1 parent fd48044 commit a850dbc
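As a rough sketch of what the sinking does (illustrative IR only; the value names below are invented, and the real transform is exercised by the new CodeGenPrepare test in this commit), a loop step computed from vscale in the entry block:

  entry:
    %vs = call i64 @llvm.vscale.i64()   ; illustrative names, not from the patch
    %step = shl i64 %vs, 2              ; vscale * 4
    br label %vector.body

  vector.body:
    ; ...
    %index.next = add i64 %index, %step

is duplicated into the block that uses it, so the vscale call, the shift and the add all sit together in vector.body:

  vector.body:
    ; ...
    %vs = call i64 @llvm.vscale.i64()
    %step = shl i64 %vs, 2
    %index.next = add i64 %index, %step

With the whole pattern visible in one block, AArch64 instruction selection can fold it into a single scalable increment instead of keeping the step live in a register across the loop.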

6 files changed: 229 additions, 33 deletions
llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 0 additions & 1 deletion
@@ -8230,7 +8230,6 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
       return true;
     }
-    return false;
   }
 
   if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 29 additions & 0 deletions
@@ -14552,6 +14552,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
   return true;
 }
 
+/// We want to sink following cases:
+/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
+  if (match(Op, m_VScale()))
+    return true;
+  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+    return true;
+  }
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14668,6 +14681,22 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
+  // Sink vscales closer to uses for better isel
+  switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:
+  case Instruction::Add:
+  case Instruction::Sub:
+    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
+      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
+        Ops.push_back(&I->getOperandUse(Op));
+        return true;
+      }
+    }
+    break;
+  default:
+    break;
+  }
+
   if (!I->getType()->isVectorTy())
     return false;
 
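For context, the point of having the vscale computation next to its user is that SVE has element-count increment forms that do the whole step in one instruction. A hand-written sketch (these exact instructions are not taken from the test diffs below):

  incw x8             // x8 += vscale * 4, i.e. the number of 32-bit elements
  addvl x9, x9, #2    // x9 += 2 * (vscale * 16) bytes, i.e. two vector lengths

Sinking lets isel pick such forms instead of materialising the step in a separate register outside the loop.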

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll

Lines changed: 21 additions & 21 deletions
@@ -18,10 +18,10 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: cntd x9
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: neg x10, x9
-; CHECK-NEXT: mov w11, #100 // =0x64
+; CHECK-NEXT: neg x9, x9
+; CHECK-NEXT: mov w10, #100 // =0x64
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x10, x11
+; CHECK-NEXT: and x10, x9, x10
 ; CHECK-NEXT: rdvl x11, #2
 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
@@ -33,7 +33,7 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: subs x10, x10, x9
+; CHECK-NEXT: adds x10, x10, x9
 ; CHECK-NEXT: add x8, x8, x11
 ; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -106,11 +106,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: mov z1.d, #0 // =0x0
 ; CHECK-NEXT: fmov d2, #2.00000000
 ; CHECK-NEXT: cntd x9
-; CHECK-NEXT: mov w11, #100 // =0x64
+; CHECK-NEXT: mov w10, #100 // =0x64
 ; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: neg x10, x9
+; CHECK-NEXT: neg x9, x9
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x10, x11
+; CHECK-NEXT: and x10, x9, x10
 ; CHECK-NEXT: rdvl x11, #2
 ; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d
 ; CHECK-NEXT: mov z1.d, p0/m, z2.d
@@ -125,7 +125,7 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: subs x10, x10, x9
+; CHECK-NEXT: adds x10, x10, x9
 ; CHECK-NEXT: add x8, x8, x11
 ; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -193,34 +193,34 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: cntw x9
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: neg x10, x9
-; CHECK-NEXT: mov w11, #1000 // =0x3e8
-; CHECK-NEXT: rdvl x13, #2
+; CHECK-NEXT: neg x9, x9
+; CHECK-NEXT: mov w10, #1000 // =0x3e8
+; CHECK-NEXT: rdvl x12, #2
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x10, x11
+; CHECK-NEXT: and x10, x9, x10
 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT: rdvl x11, #4
-; CHECK-NEXT: add x12, x1, x13
-; CHECK-NEXT: add x13, x0, x13
+; CHECK-NEXT: add x11, x1, x12
+; CHECK-NEXT: add x12, x0, x12
+; CHECK-NEXT: rdvl x13, #4
 ; CHECK-NEXT: mov z2.d, z1.d
 ; CHECK-NEXT: mov z3.d, z0.d
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: add x14, x0, x8
-; CHECK-NEXT: add x15, x13, x8
+; CHECK-NEXT: add x15, x12, x8
 ; CHECK-NEXT: add x16, x1, x8
-; CHECK-NEXT: add x17, x12, x8
+; CHECK-NEXT: add x17, x11, x8
 ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8]
 ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl]
-; CHECK-NEXT: ld1b { z6.b }, p1/z, [x13, x8]
+; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8]
 ; CHECK-NEXT: ld1d { z7.d }, p0/z, [x15, #1, mul vl]
 ; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1, x8]
 ; CHECK-NEXT: ld1d { z17.d }, p0/z, [x16, #1, mul vl]
-; CHECK-NEXT: ld1b { z18.b }, p1/z, [x12, x8]
+; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
 ; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
-; CHECK-NEXT: subs x10, x10, x9
-; CHECK-NEXT: add x8, x8, x11
+; CHECK-NEXT: adds x10, x10, x9
+; CHECK-NEXT: add x8, x8, x13
 ; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z4.d, #0
 ; CHECK-NEXT: fcmla z0.d, p0/m, z17.d, z5.d, #0
 ; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0

llvm/test/CodeGen/AArch64/sve-int-arith.ll

Lines changed: 7 additions & 7 deletions
@@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
 ; CHECK-NEXT: b.lt .LBB70_3
 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
 ; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z0.s, #1 // =0x1
-; CHECK-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NEXT: whilelo p1.s, xzr, x9
 ; CHECK-NEXT: mov x8, xzr
 ; CHECK-NEXT: cntw x10
 ; CHECK-NEXT: .LBB70_2: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
-; CHECK-NEXT: mad z1.s, p1/m, z2.s, z0.s
-; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NEXT: ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
+; CHECK-NEXT: mad z1.s, p0/m, z2.s, z0.s
+; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
 ; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p0.s, x8, x9
+; CHECK-NEXT: whilelo p1.s, x8, x9
 ; CHECK-NEXT: b.mi .LBB70_2
 ; CHECK-NEXT: .LBB70_3: // %for.cond.cleanup
 ; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll

Lines changed: 4 additions & 4 deletions
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT: whilelt p0.s, wzr, w0
 ; CHECK-NEXT: b.pl .LBB0_3
 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w9, wzr
-; CHECK-NEXT: cntw x8
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: cntw x9
 ; CHECK-NEXT: .LBB0_2: // %for.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: whilelt p0.s, w9, w0
-; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: whilelt p0.s, w8, w0
+; CHECK-NEXT: add w8, w8, w9
 ; CHECK-NEXT: b.mi .LBB0_2
 ; CHECK-NEXT: .LBB0_3: // %exit
 ; CHECK-NEXT: ret
Lines changed: 168 additions & 0 deletions
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT: [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT: [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT: [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
+; CHECK-LABEL: define void @gep
+; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr nocapture noundef writeonly [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 4
+; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 4
+; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 4
+; CHECK-NEXT: [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 4
+  br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph.new
+  %lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
+  %ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %1
+  %lsr.iv.next = add i32 %lsr.iv, -4
+  %cmp = icmp eq i32 %lsr.iv.next, 0
+  br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
