Skip to content

SLP Vectorizer could vectorize and/shr with constant on 3 out of 4 lanes #110740

Open
@MatzeB

Description

@MatzeB

We have a library (fbgemm) with kernels that break apart an 8-bit integer into 4, 2-bit integer values. Ideally we would like to vectorize this pattern, which effectively looks like vectorizing this pattern/vector (with x being a uint8_t value):
{x & 3, (x >> 2) & 3, (x >> 4) & 3, x >> 6 }
Unfortunately the pattern is not identical on each lane, as the >> 0 on the first lane and the & 3 on the last lane were optimized away because they are not necessary. However, when vectorizing, the resulting code would be best when choosing {x, x, x, x} >> {0, 2, 4, 6} & {3, 3, 3, 3}.

Reproducer:

target triple = "x86_64--"

; Reproducer: one i8 load is split into four 2-bit fields; each field is
; converted to float and combined via llvm.fma.f32 into four consecutive
; floats at %0.  The four lanes are NOT syntactically identical: lane 0
; has no shift (a ">> 0" would be redundant) and lane 3 has no mask
; (after ">> 6" on a zext'd i8, "& 3" is redundant) -- which is what
; currently blocks SLP vectorization of the pattern.
define dso_local void @_Z3fooPfPKhff(ptr %0, ptr %1, float %2, float %3) local_unnamed_addr #0 {
  ; Addresses of the output lanes out[1..3]; out[0] is %0 itself.
  %5 = getelementptr inbounds float, ptr %0, i64 1
  %6 = getelementptr inbounds float, ptr %0, i64 2
  %7 = getelementptr inbounds float, ptr %0, i64 3
  ; Load the packed byte and widen it once.
  %8 = load i8, ptr %1, align 1
  %9 = zext i8 %8 to i32
  ; Lane 0: x & 3 (no shift -- folded away).
  %10 = and i32 %9, 3
  %11 = sitofp i32 %10 to float
  ; Lane 1: (x >> 2) & 3.
  %12 = lshr i32 %9, 2
  %13 = and i32 %12, 3
  %14 = sitofp i32 %13 to float
  ; Lane 2: (x >> 4) & 3.
  %15 = lshr i32 %9, 4
  %16 = and i32 %15, 3
  %17 = sitofp i32 %16 to float
  ; Lane 3: x >> 6 (no mask -- folded away).
  %18 = lshr i32 %9, 6
  %19 = sitofp i32 %18 to float
  ; For each lane i: out[i] = fma(%2, lane_i, out[i] + %3).
  %20 = load float, ptr %0, align 4
  %21 = fadd float %20, %3
  %22 = tail call noundef float @llvm.fma.f32(float %2, float %11, float %21)
  store float %22, ptr %0, align 4
  %23 = load float, ptr %5, align 4
  %24 = fadd float %23, %3
  %25 = tail call noundef float @llvm.fma.f32(float %2, float %14, float %24)
  store float %25, ptr %5, align 4
  %26 = load float, ptr %6, align 4
  %27 = fadd float %26, %3
  %28 = tail call noundef float @llvm.fma.f32(float %2, float %17, float %27)
  store float %28, ptr %6, align 4
  %29 = load float, ptr %7, align 4
  %30 = fadd float %29, %3
  %31 = tail call noundef float @llvm.fma.f32(float %2, float %19, float %30)
  store float %31, ptr %7, align 4
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #1

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+avx,+avx2,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"uwtable", i32 2}
!2 = !{!"clang version 20.0.0git"}

(tested with github/main 32ffc9f from 2024-10-01)

Documenting this problem here while I am looking at the SLP vectorizer code to see what to do about this.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions