Commit a850dbc

[AArch64] Sink vscale calls into loops for better isel (#70304)
For more recent SVE-capable CPUs it is beneficial to use the inc* instructions to increment a value by vscale (potentially shifted or multiplied), even in short loops. This patch tells CodeGenPrepare to sink appropriate vscale calls into the blocks where they are used, so that isel can match them.
1 parent fd48044 commit a850dbc
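As a rough sketch of what the sinking does (illustrative IR only; the value names below are invented, and the real transform is exercised by the new CodeGenPrepare test in this commit), a loop step computed from vscale in the entry block:

  entry:
    %vs = call i64 @llvm.vscale.i64()   ; illustrative names, not from the patch
    %step = shl i64 %vs, 2              ; vscale * 4
    br label %vector.body

  vector.body:
    ; ...
    %index.next = add i64 %index, %step

is duplicated into the block that uses it, so the vscale call, the shift and the add all sit together in vector.body:

  vector.body:
    ; ...
    %vs = call i64 @llvm.vscale.i64()
    %step = shl i64 %vs, 2
    %index.next = add i64 %index, %step

With the whole pattern visible in one block, AArch64 instruction selection can fold it into a single scalable increment instead of keeping the step live in a register across the loop.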

6 files changed: 229 additions, 33 deletions
llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 0 additions & 1 deletion
@@ -8230,7 +8230,6 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     if (tryUnmergingGEPsAcrossIndirectBr(GEPI, TTI)) {
       return true;
     }
-    return false;
   }
 
   if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 29 additions & 0 deletions
@@ -14552,6 +14552,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
   return true;
 }
 
+/// We want to sink following cases:
+/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
+  if (match(Op, m_VScale()))
+    return true;
+  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+    return true;
+  }
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14668,6 +14681,22 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
+  // Sink vscales closer to uses for better isel
+  switch (I->getOpcode()) {
+  case Instruction::GetElementPtr:
+  case Instruction::Add:
+  case Instruction::Sub:
+    for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
+      if (shouldSinkVScale(I->getOperand(Op), Ops)) {
+        Ops.push_back(&I->getOperandUse(Op));
+        return true;
+      }
+    }
+    break;
+  default:
+    break;
+  }
+
   if (!I->getType()->isVectorTy())
     return false;
 
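For context, the point of having the vscale computation next to its user is that SVE has element-count increment forms that do the whole step in one instruction. A hand-written sketch (these exact instructions are not taken from the test diffs below):

  incw x8             // x8 += vscale * 4, i.e. the number of 32-bit elements
  addvl x9, x9, #2    // x9 += 2 * (vscale * 16) bytes, i.e. two vector lengths

Sinking lets isel pick such forms instead of materialising the step in a separate register outside the loop.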

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll

Lines changed: 21 additions & 21 deletions
@@ -18,10 +18,10 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: cntd x9
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: neg x10, x9
-; CHECK-NEXT: mov w11, #100 // =0x64
+; CHECK-NEXT: neg x9, x9
+; CHECK-NEXT: mov w10, #100 // =0x64
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x10, x11
+; CHECK-NEXT: and x10, x9, x10
 ; CHECK-NEXT: rdvl x11, #2
 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
@@ -33,7 +33,7 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: subs x10, x10, x9
+; CHECK-NEXT: adds x10, x10, x9
 ; CHECK-NEXT: add x8, x8, x11
 ; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -106,11 +106,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: mov z1.d, #0 // =0x0
 ; CHECK-NEXT: fmov d2, #2.00000000
 ; CHECK-NEXT: cntd x9
-; CHECK-NEXT: mov w11, #100 // =0x64
+; CHECK-NEXT: mov w10, #100 // =0x64
 ; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: neg x10, x9
+; CHECK-NEXT: neg x9, x9
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x10, x11
+; CHECK-NEXT: and x10, x9, x10
 ; CHECK-NEXT: rdvl x11, #2
 ; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d
 ; CHECK-NEXT: mov z1.d, p0/m, z2.d
@@ -125,7 +125,7 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
 ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
 ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
 ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: subs x10, x10, x9
+; CHECK-NEXT: adds x10, x10, x9
 ; CHECK-NEXT: add x8, x8, x11
 ; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
 ; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -193,34 +193,34 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
 ; CHECK-NEXT: ptrue p1.b
 ; CHECK-NEXT: cntw x9
 ; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: neg x10, x9
-; CHECK-NEXT: mov w11, #1000 // =0x3e8
-; CHECK-NEXT: rdvl x13, #2
+; CHECK-NEXT: neg x9, x9
+; CHECK-NEXT: mov w10, #1000 // =0x3e8
+; CHECK-NEXT: rdvl x12, #2
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x10, x11
+; CHECK-NEXT: and x10, x9, x10
 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
 ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
-; CHECK-NEXT: rdvl x11, #4
-; CHECK-NEXT: add x12, x1, x13
-; CHECK-NEXT: add x13, x0, x13
+; CHECK-NEXT: add x11, x1, x12
+; CHECK-NEXT: add x12, x0, x12
+; CHECK-NEXT: rdvl x13, #4
 ; CHECK-NEXT: mov z2.d, z1.d
 ; CHECK-NEXT: mov z3.d, z0.d
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: add x14, x0, x8
-; CHECK-NEXT: add x15, x13, x8
+; CHECK-NEXT: add x15, x12, x8
 ; CHECK-NEXT: add x16, x1, x8
-; CHECK-NEXT: add x17, x12, x8
+; CHECK-NEXT: add x17, x11, x8
 ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8]
 ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl]
-; CHECK-NEXT: ld1b { z6.b }, p1/z, [x13, x8]
+; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8]
 ; CHECK-NEXT: ld1d { z7.d }, p0/z, [x15, #1, mul vl]
 ; CHECK-NEXT: ld1b { z16.b }, p1/z, [x1, x8]
 ; CHECK-NEXT: ld1d { z17.d }, p0/z, [x16, #1, mul vl]
-; CHECK-NEXT: ld1b { z18.b }, p1/z, [x12, x8]
+; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
 ; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
-; CHECK-NEXT: subs x10, x10, x9
-; CHECK-NEXT: add x8, x8, x11
+; CHECK-NEXT: adds x10, x10, x9
+; CHECK-NEXT: add x8, x8, x13
 ; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z4.d, #0
 ; CHECK-NEXT: fcmla z0.d, p0/m, z17.d, z5.d, #0
 ; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0

llvm/test/CodeGen/AArch64/sve-int-arith.ll

Lines changed: 7 additions & 7 deletions
@@ -770,19 +770,19 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) {
 ; CHECK-NEXT: b.lt .LBB70_3
 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
 ; CHECK-NEXT: mov w9, w3
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: mov z0.s, #1 // =0x1
-; CHECK-NEXT: whilelo p0.s, xzr, x9
+; CHECK-NEXT: whilelo p1.s, xzr, x9
 ; CHECK-NEXT: mov x8, xzr
 ; CHECK-NEXT: cntw x10
 ; CHECK-NEXT: .LBB70_2: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
-; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
-; CHECK-NEXT: mad z1.s, p1/m, z2.s, z0.s
-; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2]
+; CHECK-NEXT: ld1w { z2.s }, p1/z, [x2, x8, lsl #2]
+; CHECK-NEXT: mad z1.s, p0/m, z2.s, z0.s
+; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2]
 ; CHECK-NEXT: add x8, x8, x10
-; CHECK-NEXT: whilelo p0.s, x8, x9
+; CHECK-NEXT: whilelo p1.s, x8, x9
 ; CHECK-NEXT: b.mi .LBB70_2
 ; CHECK-NEXT: .LBB70_3: // %for.cond.cleanup
 ; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll

Lines changed: 4 additions & 4 deletions
@@ -11,12 +11,12 @@ define void @test_sink_ptrue_into_ptest(i32 %n) {
 ; CHECK-NEXT: whilelt p0.s, wzr, w0
 ; CHECK-NEXT: b.pl .LBB0_3
 ; CHECK-NEXT: // %bb.1: // %for.body.preheader
-; CHECK-NEXT: mov w9, wzr
-; CHECK-NEXT: cntw x8
+; CHECK-NEXT: mov w8, wzr
+; CHECK-NEXT: cntw x9
 ; CHECK-NEXT: .LBB0_2: // %for.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: whilelt p0.s, w9, w0
-; CHECK-NEXT: add w9, w9, w8
+; CHECK-NEXT: whilelt p0.s, w8, w0
+; CHECK-NEXT: add w8, w8, w9
 ; CHECK-NEXT: b.mi .LBB0_2
 ; CHECK-NEXT: .LBB0_3: // %exit
 ; CHECK-NEXT: ret
Lines changed: 168 additions & 0 deletions
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT: [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT: [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT: [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @gep(i32 noundef %first, i32 noundef %N, ptr nocapture noundef writeonly %ptr, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %val) #0 {
+; CHECK-LABEL: define void @gep
+; CHECK-SAME: (i32 noundef [[FIRST:%.*]], i32 noundef [[N:%.*]], ptr nocapture noundef writeonly [[PTR:%.*]], <vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[PTR_ADDR:%.*]] = phi ptr [ [[PTR]], [[ENTRY]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[PTR_ADDR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 4
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_ADDR]], i64 [[TMP1]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 4
+; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[TMP3]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_1]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 4
+; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[TMP5]]
+; CHECK-NEXT: tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[VAL]], ptr [[ADD_PTR_2]], i32 1, <vscale x 16 x i1> [[PG]])
+; CHECK-NEXT: [[TMP6:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 4
+; CHECK-NEXT: [[ADD_PTR_3]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 [[TMP7]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], -4
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl i64 %0, 4
+  br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph.new
+  %lsr.iv = phi i32 [ %N, %entry ], [ %lsr.iv.next, %for.body ]
+  %ptr.addr = phi ptr [ %ptr, %entry ], [ %add.ptr.3, %for.body ]
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %ptr.addr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr = getelementptr inbounds i8, ptr %ptr.addr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.1, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %1
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %val, ptr %add.ptr.2, i32 1, <vscale x 16 x i1> %pg)
+  %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %1
+  %lsr.iv.next = add i32 %lsr.iv, -4
+  %cmp = icmp eq i32 %lsr.iv.next, 0
+  br i1 %cmp, label %for.exit, label %for.body
+
+for.exit:
+  ret void
+}
+
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr nocapture, i32 immarg, <vscale x 16 x i1>)
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
