diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index fe13fc676e303..71b204f9c3fec 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1870,6 +1870,13 @@ class TargetTransformInfo { /// false, but it shouldn't matter what it returns anyway. bool hasArmWideBranch(bool Thumb) const; + /// Returns a bitmask constructed from the target-features or fmv-features + /// metadata of a function. + uint64_t getFeatureMask(const Function &F) const; + + /// Returns true if this is an instance of a function with multiple versions. + bool isMultiversionedFunction(const Function &F) const; + /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; @@ -2312,6 +2319,8 @@ class TargetTransformInfo::Concept { virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; + virtual uint64_t getFeatureMask(const Function &F) const = 0; + virtual bool isMultiversionedFunction(const Function &F) const = 0; virtual unsigned getMaxNumArgs() const = 0; virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const = 0; @@ -3144,6 +3153,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.hasArmWideBranch(Thumb); } + uint64_t getFeatureMask(const Function &F) const override { + return Impl.getFeatureMask(F); + } + + bool isMultiversionedFunction(const Function &F) const override { + return Impl.isMultiversionedFunction(F); + } + unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 7ac3063ca9a37..dcef4a1abcfa3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1039,6 +1039,10 @@ class TargetTransformInfoImplBase { bool hasArmWideBranch(bool) const { return false; } + uint64_t getFeatureMask(const Function &F) const { return 0; } + + bool isMultiversionedFunction(const Function &F) const { return false; } + unsigned getMaxNumArgs() const { return UINT_MAX; } unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 63f06a3a69298..0338770593bc4 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -270,13 +270,16 @@ void fillValidCPUArchList(SmallVectorImpl &Values); bool isX18ReservedByDefault(const Triple &TT); -// Return the priority for a given set of FMV features. +// For a given set of feature names, which can be either target-features, or +// fmv-features metadata, expand their dependencies and then return a bitmask +// corresponding to the entries of AArch64::FeatPriorities. uint64_t getFMVPriority(ArrayRef Features); -// For given feature names, return a bitmask corresponding to the entries of -// AArch64::CPUFeatures. The values in CPUFeatures are not bitmasks themselves, -// they are sequential (0, 1, 2, 3, ...). The resulting bitmask is used at -// runtime to test whether a certain FMV feature is available on the host. +// For a given set of FMV feature names, expand their dependencies and then +// return a bitmask corresponding to the entries of AArch64::CPUFeatures. +// The values in CPUFeatures are not bitmasks themselves, they are sequential +// (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether +// a certain FMV feature is available on the host. uint64_t getCpuSupportsMask(ArrayRef Features); void PrintSupportedExtensions(); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index df42dc2746daf..8b9722d047edc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1383,6 +1383,14 @@ bool TargetTransformInfo::hasArmWideBranch(bool Thumb) const { return TTIImpl->hasArmWideBranch(Thumb); } +uint64_t TargetTransformInfo::getFeatureMask(const Function &F) const { + return TTIImpl->getFeatureMask(F); +} + +bool TargetTransformInfo::isMultiversionedFunction(const Function &F) const { + return TTIImpl->isMultiversionedFunction(F); +} + unsigned TargetTransformInfo::getMaxNumArgs() const { return TTIImpl->getMaxNumArgs(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 932a6f9ce23fd..7f10bfed739b4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" +#include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" #include @@ -248,6 +249,19 @@ static bool hasPossibleIncompatibleOps(const Function *F) { return false; } +uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const { + StringRef AttributeStr = + isMultiversionedFunction(F) ? "fmv-features" : "target-features"; + StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString(); + SmallVector Features; + FeatureStr.split(Features, ","); + return AArch64::getFMVPriority(Features); +} + +bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const { + return F.hasFnAttribute("fmv-features"); +} + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 8e7e590c173ff..1eb805ae00b1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -89,6 +89,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const; + uint64_t getFeatureMask(const Function &F) const; + + bool isMultiversionedFunction(const Function &F) const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp index 34ca03a47e0a4..e13c6e6d28c2b 100644 --- a/llvm/lib/TargetParser/AArch64TargetParser.cpp +++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp @@ -48,12 +48,33 @@ std::optional AArch64::ArchInfo::findBySubArch(StringRef SubA return {}; } +std::optional lookupFMVByID(AArch64::ArchExtKind ExtID) { + for (const AArch64::FMVInfo &Info : AArch64::getFMVInfo()) + if (Info.ID && *Info.ID == ExtID) + return Info; + return {}; +} + uint64_t AArch64::getFMVPriority(ArrayRef Features) { - uint64_t Priority = 0; - for (StringRef Feature : Features) - if (std::optional Info = parseFMVExtension(Feature)) - Priority |= (1ULL << Info->PriorityBit); - return Priority; + // Transitively enable the Arch Extensions which correspond to each feature. + ExtensionSet FeatureBits; + for (const StringRef Feature : Features) { + std::optional FMV = parseFMVExtension(Feature); + if (!FMV) { + if (std::optional Info = targetFeatureToExtension(Feature)) + FMV = lookupFMVByID(Info->ID); + } + if (FMV && FMV->ID) + FeatureBits.enable(*FMV->ID); + } + + // Construct a bitmask for all the transitively enabled Arch Extensions. + uint64_t PriorityMask = 0; + for (const FMVInfo &Info : getFMVInfo()) + if (Info.ID && FeatureBits.Enabled.test(*Info.ID)) + PriorityMask |= (1ULL << Info.PriorityBit); + + return PriorityMask; } uint64_t AArch64::getCpuSupportsMask(ArrayRef Features) { diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 78cd249c9c16a..bf0cacc6224be 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2641,6 +2641,165 @@ DeleteDeadIFuncs(Module &M, return Changed; } +// Follows the use-def chain of \p V backwards until it finds a Function, +// in which case it collects in \p Versions. Return true on successful +// use-def chain traversal, false otherwise. +static bool collectVersions(TargetTransformInfo &TTI, Value *V, + SmallVectorImpl &Versions) { + if (auto *F = dyn_cast(V)) { + if (!TTI.isMultiversionedFunction(*F)) + return false; + Versions.push_back(F); + } else if (auto *Sel = dyn_cast(V)) { + if (!collectVersions(TTI, Sel->getTrueValue(), Versions)) + return false; + if (!collectVersions(TTI, Sel->getFalseValue(), Versions)) + return false; + } else if (auto *Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + if (!collectVersions(TTI, Phi->getIncomingValue(I), Versions)) + return false; + } else { + // Unknown instruction type. Bail. + return false; + } + return true; +} + +// Bypass the IFunc Resolver of MultiVersioned functions when possible. To +// deduce whether the optimization is legal we need to compare the target +// features between caller and callee versions. The criteria for bypassing +// the resolver are the following: +// +// * If the callee's feature set is a subset of the caller's feature set, +// then the callee is a candidate for direct call. +// +// * Among such candidates the one of highest priority is the best match +// and it shall be picked, unless there is a version of the callee with +// higher priority than the best match which cannot be picked from a +// higher priority caller (directly or through the resolver). +// +// * For every higher priority callee version than the best match, there +// is a higher priority caller version whose feature set availability +// is implied by the callee's feature set. +// +static bool OptimizeNonTrivialIFuncs( + Module &M, function_ref GetTTI) { + bool Changed = false; + + // Cache containing the mask constructed from a function's target features. + DenseMap FeatureMask; + + for (GlobalIFunc &IF : M.ifuncs()) { + if (IF.isInterposable()) + continue; + + Function *Resolver = IF.getResolverFunction(); + if (!Resolver) + continue; + + if (Resolver->isInterposable()) + continue; + + TargetTransformInfo &TTI = GetTTI(*Resolver); + + // Discover the callee versions. + SmallVector Callees; + if (any_of(*Resolver, [&TTI, &Callees](BasicBlock &BB) { + if (auto *Ret = dyn_cast_or_null(BB.getTerminator())) + if (!collectVersions(TTI, Ret->getReturnValue(), Callees)) + return true; + return false; + })) + continue; + + assert(!Callees.empty() && "Expecting successful collection of versions"); + + // Cache the feature mask for each callee. + for (Function *Callee : Callees) { + auto [It, Inserted] = FeatureMask.try_emplace(Callee); + if (Inserted) + It->second = TTI.getFeatureMask(*Callee); + } + + // Sort the callee versions in decreasing priority order. + sort(Callees, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + // Find the callsites and cache the feature mask for each caller. + SmallVector Callers; + DenseMap> CallSites; + for (User *U : IF.users()) { + if (auto *CB = dyn_cast(U)) { + if (CB->getCalledOperand() == &IF) { + Function *Caller = CB->getFunction(); + auto [FeatIt, FeatInserted] = FeatureMask.try_emplace(Caller); + if (FeatInserted) + FeatIt->second = TTI.getFeatureMask(*Caller); + auto [CallIt, CallInserted] = CallSites.try_emplace(Caller); + if (CallInserted) + Callers.push_back(Caller); + CallIt->second.push_back(CB); + } + } + } + + // Sort the caller versions in decreasing priority order. + sort(Callers, [&](auto *LHS, auto *RHS) { + return FeatureMask[LHS] > FeatureMask[RHS]; + }); + + auto implies = [](uint64_t A, uint64_t B) { return (A & B) == B; }; + + // Index to the highest priority candidate. + unsigned I = 0; + // Now try to redirect calls starting from higher priority callers. + for (Function *Caller : Callers) { + assert(I < Callees.size() && "Found callers of equal priority"); + + Function *Callee = Callees[I]; + uint64_t CallerBits = FeatureMask[Caller]; + uint64_t CalleeBits = FeatureMask[Callee]; + + // In the case of FMV callers, we know that all higher priority callers + // than the current one did not get selected at runtime, which helps + // reason about the callees (if they have versions that mandate presence + // of the features which we already know are unavailable on this target). + if (TTI.isMultiversionedFunction(*Caller)) { + // If the feature set of the caller implies the feature set of the + // highest priority candidate then it shall be picked. In case of + // identical sets advance the candidate index one position. + if (CallerBits == CalleeBits) + ++I; + else if (!implies(CallerBits, CalleeBits)) { + // Keep advancing the candidate index as long as the caller's + // features are a subset of the current candidate's. + while (implies(CalleeBits, CallerBits)) { + if (++I == Callees.size()) + break; + CalleeBits = FeatureMask[Callees[I]]; + } + continue; + } + } else { + // We can't reason much about non-FMV callers. Just pick the highest + // priority callee if it matches, otherwise bail. + if (I > 0 || !implies(CallerBits, CalleeBits)) + continue; + } + auto &Calls = CallSites[Caller]; + for (CallBase *CS : Calls) + CS->setCalledOperand(Callee); + Changed = true; + } + if (IF.use_empty() || + all_of(IF.users(), [](User *U) { return isa(U); })) + NumIFuncsResolved++; + } + return Changed; +} + static bool optimizeGlobalsInModule(Module &M, const DataLayout &DL, function_ref GetTLI, @@ -2707,6 +2866,9 @@ optimizeGlobalsInModule(Module &M, const DataLayout &DL, // Optimize IFuncs whose callee's are statically known. LocalChange |= OptimizeStaticIFuncs(M); + // Optimize IFuncs based on the target features of the caller. + LocalChange |= OptimizeNonTrivialIFuncs(M, GetTTI); + // Remove any IFuncs that are now dead. LocalChange |= DeleteDeadIFuncs(M, NotDiscardableComdats); diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll new file mode 100644 index 0000000000000..90bd98a9b0d38 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -0,0 +1,365 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call i32 @(test_single_bb_resolver|test_multi_bb_resolver|test_caller_feats_not_implied|test_non_fmv_caller|test_priority|test_alternative_names)" --version 4 +; RUN: opt --passes=globalopt -o - -S < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +$test_single_bb_resolver.resolver = comdat any +$test_multi_bb_resolver.resolver = comdat any +$test_caller_feats_not_implied.resolver = comdat any +$test_non_fmv_caller.resolver = comdat any +$test_priority.resolver = comdat any +$test_alternative_names.resolver = comdat any + +@__aarch64_cpu_features = external local_unnamed_addr global { i64 } + +@test_single_bb_resolver = weak_odr ifunc i32 (), ptr @test_single_bb_resolver.resolver +@test_multi_bb_resolver = weak_odr ifunc i32 (), ptr @test_multi_bb_resolver.resolver +@test_caller_feats_not_implied = weak_odr ifunc i32 (), ptr @test_caller_feats_not_implied.resolver +@test_non_fmv_caller = weak_odr ifunc i32 (), ptr @test_non_fmv_caller.resolver +@test_priority = weak_odr ifunc i32 (), ptr @test_priority.resolver +@test_alternative_names = weak_odr ifunc i32 (), ptr @test_alternative_names.resolver + +declare void @__init_cpu_features_resolver() local_unnamed_addr + +declare i32 @test_single_bb_resolver.default() #0 +declare i32 @test_single_bb_resolver._Msve() #1 +declare i32 @test_single_bb_resolver._Msve2() #2 + +define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_single_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 68719476736 + %.not = icmp eq i64 %1, 0 + %2 = and i64 %0, 1073741824 + %.not3 = icmp eq i64 %2, 0 + %test_single_bb_resolver._Msve.test_single_bb_resolver.default = select i1 %.not3, ptr @test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve + %common.ret.op = select i1 %.not, ptr %test_single_bb_resolver._Msve.test_single_bb_resolver.default, ptr @test_single_bb_resolver._Msve2 + ret ptr %common.ret.op +} + +define i32 @caller1._Msve() #1 { +; CHECK-LABEL: define i32 @caller1._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1._Msve2() #2 { +; CHECK-LABEL: define i32 @caller1._Msve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver._Msve2() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +define i32 @caller1.default() #0 { +; CHECK-LABEL: define i32 @caller1.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_single_bb_resolver.default() +; +entry: + %call = tail call i32 @test_single_bb_resolver() + ret i32 %call +} + +declare i32 @test_multi_bb_resolver._Mmops() #3 +declare i32 @test_multi_bb_resolver._Msve2() #2 +declare i32 @test_multi_bb_resolver._Msve() #1 +declare i32 @test_multi_bb_resolver.default() #0 + +define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_multi_bb_resolver.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_multi_bb_resolver._Mmops, %resolver_entry ], [ @test_multi_bb_resolver._Msve2, %resolver_else ], [ %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 68719476736 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_multi_bb_resolver._Msve.test_multi_bb_resolver.default = select i1 %.not6, ptr @test_multi_bb_resolver.default, ptr @test_multi_bb_resolver._Msve + br label %common.ret +} + +define i32 @caller2._MmopsMsve2() #4 { +; CHECK-LABEL: define i32 @caller2._MmopsMsve2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR4:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Mmops() #3 { +; CHECK-LABEL: define i32 @caller2._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver._Mmops() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2._Msve() #1 { +; CHECK-LABEL: define i32 @caller2._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +define i32 @caller2.default() #0 { +; CHECK-LABEL: define i32 @caller2.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_multi_bb_resolver.default() +; +entry: + %call = tail call i32 @test_multi_bb_resolver() + ret i32 %call +} + +declare i32 @test_caller_feats_not_implied._Mmops() #3 +declare i32 @test_caller_feats_not_implied._Msme() #5 +declare i32 @test_caller_feats_not_implied._Msve() #1 +declare i32 @test_caller_feats_not_implied.default() #0 + +define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_caller_feats_not_implied.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 576460752303423488 + %.not = icmp eq i64 %1, 0 + br i1 %.not, label %resolver_else, label %common.ret + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_caller_feats_not_implied._Mmops, %resolver_entry ], [ @test_caller_feats_not_implied._Msme, %resolver_else ], [ %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %2 = and i64 %0, 4398046511104 + %.not5 = icmp eq i64 %2, 0 + br i1 %.not5, label %resolver_else2, label %common.ret + +resolver_else2: ; preds = %resolver_else + %3 = and i64 %0, 1073741824 + %.not6 = icmp eq i64 %3, 0 + %test_caller_feats_not_implied._Msve.test_caller_feats_not_implied.default = select i1 %.not6, ptr @test_caller_feats_not_implied.default, ptr @test_caller_feats_not_implied._Msve + br label %common.ret +} + +define i32 @caller3._Mmops() #3 { +; CHECK-LABEL: define i32 @caller3._Mmops( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR3]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied._Mmops() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3._Msve() #1 { +; CHECK-LABEL: define i32 @caller3._Msve( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR1]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +define i32 @caller3.default() #0 { +; CHECK-LABEL: define i32 @caller3.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_caller_feats_not_implied() +; +entry: + %call = tail call i32 @test_caller_feats_not_implied() + ret i32 %call +} + +declare i32 @test_non_fmv_caller._Maes() #6 +declare i32 @test_non_fmv_caller._Msm4() #7 +declare i32 @test_non_fmv_caller.default() #0 + +define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_non_fmv_caller.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 32768 + %.not = icmp eq i64 %1, 0 + %test_non_fmv_caller._Maes.test_non_fmv_caller.default = select i1 %.not, ptr @test_non_fmv_caller.default, ptr @test_non_fmv_caller._Maes + ret ptr %test_non_fmv_caller._Maes.test_non_fmv_caller.default +} + +define i32 @caller4() #8 { +; CHECK-LABEL: define i32 @caller4( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +define i32 @caller5() #9 { +; CHECK-LABEL: define i32 @caller5( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR8:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; +entry: + %call = tail call i32 @test_non_fmv_caller() + ret i32 %call +} + +declare i32 @test_priority._Msve2-sha3() #10 +declare i32 @test_priority._Mls64Mssbs() #11 +declare i32 @test_priority._MflagmMlseMrng() #12 +declare i32 @test_priority.default() #0 + +define weak_odr ptr @test_priority.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_priority.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 131 + %2 = icmp eq i64 %1, 131 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_priority._MflagmMlseMrng, %resolver_entry ], [ @test_priority._Mls64Mssbs, %resolver_else ], [ %test_priority._Msve2-sha3.test_priority.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 9570149208162304 + %4 = icmp eq i64 %3, 9570149208162304 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 1099511627776 + %.not = icmp eq i64 %5, 0 + %test_priority._Msve2-sha3.test_priority.default = select i1 %.not, ptr @test_priority.default, ptr @test_priority._Msve2-sha3 + br label %common.ret +} + +define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3() #13 { +; CHECK-LABEL: define i32 @caller6._MflagmMls64MlseMrngMssbsMsve2-sha3( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR12:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_priority._Mls64Mssbs() +; +entry: + %call = tail call i32 @test_priority() + ret i32 %call +} + +declare i32 @test_alternative_names._Mdpb2Mfrintts() #14 +declare i32 @test_alternative_names._Mflagm2Mfrintts() #15 +declare i32 @test_alternative_names._Mrcpc2() #16 +declare i32 @test_alternative_names.default() #0 + +define weak_odr ptr @test_alternative_names.resolver() comdat { +; CHECK-LABEL: define weak_odr ptr @test_alternative_names.resolver() comdat { +resolver_entry: + tail call void @__init_cpu_features_resolver() + %0 = load i64, ptr @__aarch64_cpu_features, align 8 + %1 = and i64 %0, 17563904 + %2 = icmp eq i64 %1, 17563904 + br i1 %2, label %common.ret, label %resolver_else + +common.ret: ; preds = %resolver_else2, %resolver_else, %resolver_entry + %common.ret.op = phi ptr [ @test_alternative_names._Mdpb2Mfrintts, %resolver_entry ], [ @test_alternative_names._Mflagm2Mfrintts, %resolver_else ], [ %test_alternative_names._Mrcpc2.test_alternative_names.default, %resolver_else2 ] + ret ptr %common.ret.op + +resolver_else: ; preds = %resolver_entry + %3 = and i64 %0, 16777478 + %4 = icmp eq i64 %3, 16777478 + br i1 %4, label %common.ret, label %resolver_else2 + +resolver_else2: ; preds = %resolver_else + %5 = and i64 %0, 12582912 + %6 = icmp eq i64 %5, 12582912 + %test_alternative_names._Mrcpc2.test_alternative_names.default = select i1 %6, ptr @test_alternative_names._Mrcpc2, ptr @test_alternative_names.default + br label %common.ret +} + +define i32 @caller7._Mdpb2Mfrintts() #14 { +; CHECK-LABEL: define i32 @caller7._Mdpb2Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR13:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mdpb2Mfrintts() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mfrintts() #17 { +; CHECK-LABEL: define i32 @caller7._Mfrintts( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR16:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7._Mrcpc2() #16 { +; CHECK-LABEL: define i32 @caller7._Mrcpc2( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR15:[0-9]+]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names._Mrcpc2() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +define i32 @caller7.default() #0 { +; CHECK-LABEL: define i32 @caller7.default( +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0]] { +; CHECK: [[CALL:%.*]] = tail call i32 @test_alternative_names.default() +; +entry: + %call = tail call i32 @test_alternative_names() + ret i32 %call +} + +attributes #0 = { "fmv-features" } +attributes #1 = { "fmv-features"="sve" } +attributes #2 = { "fmv-features"="sve2" } +attributes #3 = { "fmv-features"="mops" } +attributes #4 = { "fmv-features"="mops,sve2" } +attributes #5 = { "fmv-features"="sme" } +attributes #6 = { "fmv-features"="aes" } +attributes #7 = { "fmv-features"="sm4" } +attributes #8 = { "target-features"="+aes,+fp-armv8,+neon,+outline-atomics,+v8a" } +attributes #9 = { "target-features"="+fp-armv8,+neon,+outline-atomics,+v8a,+sm4" } +attributes #10 = { "fmv-features"="sve2-sha3" } +attributes #11 = { "fmv-features"="ls64,ssbs" } +attributes #12 = { "fmv-features"="flagm,lse,rng" } +attributes #13 = { "fmv-features"="flagm,ls64,lse,rng,ssbs,sve2-sha3" } +attributes #14 = { "fmv-features"="dpb2,frintts" } +attributes #15 = { "fmv-features"="flagm2,frintts" } +attributes #16 = { "fmv-features"="rcpc2" } +attributes #17 = { "fmv-features"="frintts" }