Open
Description
We have a library (fbgemm) with kernels that break an 8-bit integer apart into four 2-bit integer values. Ideally we would like to vectorize this pattern, which effectively means vectorizing the following vector (with x being a uint8_t value):
{x & 3, (x >> 2) & 3, (x >> 4) & 3, x >> 6 }
Unfortunately the pattern is not identical on each lane: the >> 0 on the first lane and the & 3 on the last lane were optimized away because they are not necessary. However, when vectorizing, the resulting code would be best when choosing {x, x, x, x} >> {0, 2, 4, 6} & {3, 3, 3, 3}.
Reproducer:
; x86-64 target triple with the vendor and OS components left empty.
target triple = "x86_64--"
; Reproducer: unpack one byte into four 2-bit fields and accumulate each field
; into one of four consecutive floats.
;
; The mangled name demangles to foo(float*, const unsigned char*, float, float).
;   %0 - pointer to four consecutive floats, read and updated in place
;   %1 - pointer to the packed input byte
;   %2 - multiplier applied to each unpacked field
;   %3 - addend applied to each existing float before the fma
;
; Per lane i in 0..3:  %0[i] = fma(%2, float(field_i), %0[i] + %3)
; where field_i is bits [2*i+1 : 2*i] of the byte loaded from %1.
define dso_local void @_Z3fooPfPKhff(ptr %0, ptr %1, float %2, float %3) local_unnamed_addr #0 {
; Addresses of elements 1, 2 and 3 of the float array (element 0 is %0 itself).
%5 = getelementptr inbounds float, ptr %0, i64 1
%6 = getelementptr inbounds float, ptr %0, i64 2
%7 = getelementptr inbounds float, ptr %0, i64 3
; Load the packed byte once and zero-extend it; all four lanes derive from %9.
%8 = load i8, ptr %1, align 1
%9 = zext i8 %8 to i32
; Lane 0: bits 1:0. There is no shift here (it would be a shift by 0).
%10 = and i32 %9, 3
%11 = sitofp i32 %10 to float
; Lane 1: bits 3:2.
%12 = lshr i32 %9, 2
%13 = and i32 %12, 3
%14 = sitofp i32 %13 to float
; Lane 2: bits 5:4.
%15 = lshr i32 %9, 4
%16 = and i32 %15, 3
%17 = sitofp i32 %16 to float
; Lane 3: bits 7:6. There is no mask here: %9 is a zero-extended i8, so only
; two bits remain after shifting right by 6.
%18 = lshr i32 %9, 6
%19 = sitofp i32 %18 to float
; Lane 0 update: %0[0] = fma(%2, field0, %0[0] + %3).
%20 = load float, ptr %0, align 4
%21 = fadd float %20, %3
%22 = tail call noundef float @llvm.fma.f32(float %2, float %11, float %21)
store float %22, ptr %0, align 4
; Lane 1 update.
%23 = load float, ptr %5, align 4
%24 = fadd float %23, %3
%25 = tail call noundef float @llvm.fma.f32(float %2, float %14, float %24)
store float %25, ptr %5, align 4
; Lane 2 update.
%26 = load float, ptr %6, align 4
%27 = fadd float %26, %3
%28 = tail call noundef float @llvm.fma.f32(float %2, float %17, float %27)
store float %28, ptr %6, align 4
; Lane 3 update.
%29 = load float, ptr %7, align 4
%30 = fadd float %29, %3
%31 = tail call noundef float @llvm.fma.f32(float %2, float %19, float %30)
store float %31, ptr %7, align 4
ret void
}
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
; Scalar FMA intrinsic: this is the form @_Z3fooPfPKhff actually calls, so it
; is declared explicitly rather than left for the IR parser to resolve.
declare float @llvm.fma.f32(float, float, float) #1
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
; Four-lane vector form of the same intrinsic. The scalar reproducer does not
; call it; the declaration is kept as it appeared in the original report.
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
; Attribute group #0 is attached to @_Z3fooPfPKhff. It selects the "haswell"
; target CPU and a feature list that includes AVX2 and FMA.
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+avx,+avx2,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }
; Attribute group #1 is attached to the llvm.fma intrinsic declaration.
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; Module-level metadata: two module flags (!0, !1) and the producer ident (!2).
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"uwtable", i32 2}
!2 = !{!"clang version 20.0.0git"}
(tested with github/main 32ffc9f from 2024-10-01)
Documenting this problem here while I am looking at the SLP vectorizer code to see what to do about this.