SLP Vectorizer could vectorize and/shr with constant on 3 out of 4 lanes

We have a library (fbgemm) with kernels that break apart an 8-bit integer into four 2-bit integer values. Ideally we would like to vectorize this pattern, which effectively computes the following vector (with `x` being a `uint8_t` value):
     {x & 3, (x >> 2) & 3, (x >> 4) & 3, x >> 6 }
Unfortunately the pattern is not identical on each lane, because the `>> 0` on the first lane and the `& 3` on the last lane were optimized away as unnecessary. However, when vectorizing, the resulting code would be best if the vectorizer chose `({x, x, x, x} >> {0, 2, 4, 6}) & {3, 3, 3, 3}`.

Reproducer:
```
target triple = "x86_64--"

; Reproducer kernel; the mangled name demangles to foo(float*, unsigned char const*, float, float).
; Loads one byte from %1, splits it into four 2-bit fields, and for each of the
; four consecutive floats at %0 stores fma(%2, field_i, old_i + %3).
; Only lanes 1 and 2 carry the full lshr+and pair: lane 0 has no lshr and
; lane 3 has no and, so the four lanes are not isomorphic.
define dso_local void @_Z3fooPfPKhff(ptr %0, ptr %1, float %2, float %3) local_unnamed_addr #0 {
  ; Addresses of output elements 1..3 (element 0 is %0 itself).
  %5 = getelementptr inbounds float, ptr %0, i64 1
  %6 = getelementptr inbounds float, ptr %0, i64 2
  %7 = getelementptr inbounds float, ptr %0, i64 3
  ; Load the packed byte and zero-extend it to i32.
  %8 = load i8, ptr %1, align 1
  %9 = zext i8 %8 to i32
  ; Lane 0: bits 1:0. Mask only; there is no shift instruction on this lane.
  %10 = and i32 %9, 3
  %11 = sitofp i32 %10 to float
  ; Lane 1: bits 3:2. Shift then mask.
  %12 = lshr i32 %9, 2
  %13 = and i32 %12, 3
  %14 = sitofp i32 %13 to float
  ; Lane 2: bits 5:4. Shift then mask.
  %15 = lshr i32 %9, 4
  %16 = and i32 %15, 3
  %17 = sitofp i32 %16 to float
  ; Lane 3: bits 7:6. Shift only; %9 is a zero-extended i8, so after the
  ; shift by 6 only two bits can be set and no mask instruction is present.
  %18 = lshr i32 %9, 6
  %19 = sitofp i32 %18 to float
  ; Element 0: out[0] = fma(%2, lane0, out[0] + %3).
  %20 = load float, ptr %0, align 4
  %21 = fadd float %20, %3
  %22 = tail call noundef float @llvm.fma.f32(float %2, float %11, float %21)
  store float %22, ptr %0, align 4
  ; Element 1: out[1] = fma(%2, lane1, out[1] + %3).
  %23 = load float, ptr %5, align 4
  %24 = fadd float %23, %3
  %25 = tail call noundef float @llvm.fma.f32(float %2, float %14, float %24)
  store float %25, ptr %5, align 4
  ; Element 2: out[2] = fma(%2, lane2, out[2] + %3).
  %26 = load float, ptr %6, align 4
  %27 = fadd float %26, %3
  %28 = tail call noundef float @llvm.fma.f32(float %2, float %17, float %27)
  store float %28, ptr %6, align 4
  ; Element 3: out[3] = fma(%2, lane3, out[3] + %3).
  %29 = load float, ptr %7, align 4
  %30 = fadd float %29, %3
  %31 = tail call noundef float @llvm.fma.f32(float %2, float %19, float %30)
  store float %31, ptr %7, align 4
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
; Scalar FMA intrinsic called four times by @_Z3fooPfPKhff. The reproducer is
; the pre-vectorization input, so the f32 overload is the one that must be
; declared; the v4f32 overload is not referenced anywhere in this module.
declare float @llvm.fma.f32(float, float, float) #1

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+avx,+avx2,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"uwtable", i32 2}
!2 = !{!"clang version 20.0.0git"}
```

(tested with github/main 32ffc9fdc2cd422c88c926b862adb3de726e3888 from 2024-10-01)

Documenting this problem here while I am looking at the SLP vectorizer code to see what to do about this.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

SLP Vectorizer could vectorize and/shr with constant on 3 out of 4 lanes #110740

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

SLP Vectorizer could vectorize and/shr with constant on 3 out of 4 lanes #110740

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions