//===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass tries to expand memcmp() calls into optimally-sized loads and // compares for the target. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ExpandMemCmp.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LazyBlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include <optional> usingnamespacellvm; usingnamespacellvm::PatternMatch; namespace llvm { class TargetLowering; } #define DEBUG_TYPE … STATISTIC(NumMemCmpCalls, "Number of memcmp calls"); STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size"); STATISTIC(NumMemCmpGreaterThanMax, "Number of memcmp calls with size greater than max size"); STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls"); static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock( "memcmp-num-loads-per-block", cl::Hidden, cl::init(1), cl::desc("The number of loads per basic block for inline expansion of " "memcmp that is only being compared against zero.")); static cl::opt<unsigned> MaxLoadsPerMemcmp( "max-loads-per-memcmp", cl::Hidden, cl::desc("Set maximum number of loads used in expanded memcmp")); static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize( "max-loads-per-memcmp-opt-size", cl::Hidden, cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz")); namespace { // This class provides helper functions to expand a memcmp library call into an // inline expansion. class MemCmpExpansion { … }; MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeGreedyLoadSequence( uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes, const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) { … } MemCmpExpansion::LoadEntryVector MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size, const unsigned MaxLoadSize, const unsigned MaxNumLoads, unsigned &NumLoadsNonOneByte) { … } void MemCmpExpansion::optimiseLoadSequence( LoadEntryVector &LoadSequence, const TargetTransformInfo::MemCmpExpansionOptions &Options, bool IsUsedForZeroCmp) { … } // Initialize the basic block structure required for expansion of memcmp call // with given maximum load size and memcmp size parameter. // This structure includes: // 1. A list of load compare blocks - LoadCmpBlocks. // 2. An EndBlock, split from original instruction point, which is the block to // return from. // 3. ResultBlock, block to branch to for early exit when a // LoadCmpBlock finds a difference. MemCmpExpansion::MemCmpExpansion( CallInst *const CI, uint64_t Size, const TargetTransformInfo::MemCmpExpansionOptions &Options, const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout, DomTreeUpdater *DTU) : … { … } unsigned MemCmpExpansion::getNumBlocks() { … } void MemCmpExpansion::createLoadCmpBlocks() { … } void MemCmpExpansion::createResultBlock() { … } MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType, Type *BSwapSizeType, Type *CmpSizeType, unsigned OffsetBytes) { … } // This function creates the IR instructions for loading and comparing 1 byte. // It loads 1 byte from each source of the memcmp parameters with the given // GEPIndex. It then subtracts the two loaded values and adds this result to the // final phi node for selecting the memcmp result. void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex, unsigned OffsetBytes) { … } /// Generate an equality comparison for one or more pairs of loaded values. /// This is used in the case where the memcmp() call is compared equal or not /// equal to zero. Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex, unsigned &LoadIndex) { … } void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(unsigned BlockIndex, unsigned &LoadIndex) { … } // This function creates the IR intructions for loading and comparing using the // given LoadSize. It loads the number of bytes specified by LoadSize from each // source of the memcmp parameters. It then does a subtract to see if there was // a difference in the loaded values. If a difference is found, it branches // with an early exit to the ResultBlock for calculating which source was // larger. Otherwise, it falls through to the either the next LoadCmpBlock or // the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with // a special case through emitLoadCompareByteBlock. The special handling can // simply subtract the loaded values and add it to the result phi node. void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) { … } // This function populates the ResultBlock with a sequence to calculate the // memcmp result. It compares the two loaded source values and returns -1 if // src1 < src2 and 1 if src1 > src2. void MemCmpExpansion::emitMemCmpResultBlock() { … } void MemCmpExpansion::setupResultBlockPHINodes() { … } void MemCmpExpansion::setupEndBlockPHINodes() { … } Value *MemCmpExpansion::getMemCmpExpansionZeroCase() { … } /// A memcmp expansion that compares equality with 0 and only has one block of /// load and compare can bypass the compare, branch, and phi IR that is required /// in the general case. Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() { … } /// A memcmp expansion that only has one block of load and compare can bypass /// the compare, branch, and phi IR that is required in the general case. /// This function also analyses users of memcmp, and if there is only one user /// from which we can conclude that only 2 out of 3 memcmp outcomes really /// matter, then it generates more efficient code with only one comparison. Value *MemCmpExpansion::getMemCmpOneBlock() { … } // This function expands the memcmp call into an inline expansion and returns // the memcmp result. Returns nullptr if the memcmp is already replaced. Value *MemCmpExpansion::getMemCmpExpansion() { … } // This function checks to see if an expansion of memcmp can be generated. // It checks for constant compare size that is less than the max inline size. // If an expansion cannot occur, returns false to leave as a library call. // Otherwise, the library call is replaced with a new IR instruction sequence. /// We want to transform: /// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15) /// To: /// loadbb: /// %0 = bitcast i32* %buffer2 to i8* /// %1 = bitcast i32* %buffer1 to i8* /// %2 = bitcast i8* %1 to i64* /// %3 = bitcast i8* %0 to i64* /// %4 = load i64, i64* %2 /// %5 = load i64, i64* %3 /// %6 = call i64 @llvm.bswap.i64(i64 %4) /// %7 = call i64 @llvm.bswap.i64(i64 %5) /// %8 = sub i64 %6, %7 /// %9 = icmp ne i64 %8, 0 /// br i1 %9, label %res_block, label %loadbb1 /// res_block: ; preds = %loadbb2, /// %loadbb1, %loadbb /// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ] /// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ] /// %10 = icmp ult i64 %phi.src1, %phi.src2 /// %11 = select i1 %10, i32 -1, i32 1 /// br label %endblock /// loadbb1: ; preds = %loadbb /// %12 = bitcast i32* %buffer2 to i8* /// %13 = bitcast i32* %buffer1 to i8* /// %14 = bitcast i8* %13 to i32* /// %15 = bitcast i8* %12 to i32* /// %16 = getelementptr i32, i32* %14, i32 2 /// %17 = getelementptr i32, i32* %15, i32 2 /// %18 = load i32, i32* %16 /// %19 = load i32, i32* %17 /// %20 = call i32 @llvm.bswap.i32(i32 %18) /// %21 = call i32 @llvm.bswap.i32(i32 %19) /// %22 = zext i32 %20 to i64 /// %23 = zext i32 %21 to i64 /// %24 = sub i64 %22, %23 /// %25 = icmp ne i64 %24, 0 /// br i1 %25, label %res_block, label %loadbb2 /// loadbb2: ; preds = %loadbb1 /// %26 = bitcast i32* %buffer2 to i8* /// %27 = bitcast i32* %buffer1 to i8* /// %28 = bitcast i8* %27 to i16* /// %29 = bitcast i8* %26 to i16* /// %30 = getelementptr i16, i16* %28, i16 6 /// %31 = getelementptr i16, i16* %29, i16 6 /// %32 = load i16, i16* %30 /// %33 = load i16, i16* %31 /// %34 = call i16 @llvm.bswap.i16(i16 %32) /// %35 = call i16 @llvm.bswap.i16(i16 %33) /// %36 = zext i16 %34 to i64 /// %37 = zext i16 %35 to i64 /// %38 = sub i64 %36, %37 /// %39 = icmp ne i64 %38, 0 /// br i1 %39, label %res_block, label %loadbb3 /// loadbb3: ; preds = %loadbb2 /// %40 = bitcast i32* %buffer2 to i8* /// %41 = bitcast i32* %buffer1 to i8* /// %42 = getelementptr i8, i8* %41, i8 14 /// %43 = getelementptr i8, i8* %40, i8 14 /// %44 = load i8, i8* %42 /// %45 = load i8, i8* %43 /// %46 = zext i8 %44 to i32 /// %47 = zext i8 %45 to i32 /// %48 = sub i32 %46, %47 /// br label %endblock /// endblock: ; preds = %res_block, /// %loadbb3 /// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ] /// ret i32 %phi.res static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, const TargetLowering *TLI, const DataLayout *DL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, DomTreeUpdater *DTU, const bool IsBCmp) { … } // Returns true if a change was made. static bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering *TL, const DataLayout &DL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, DomTreeUpdater *DTU); static PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering *TL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, DominatorTree *DT); class ExpandMemCmpLegacyPass : public FunctionPass { … }; bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering *TL, const DataLayout &DL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, DomTreeUpdater *DTU) { … } PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, const TargetLowering *TL, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, DominatorTree *DT) { … } } // namespace PreservedAnalyses ExpandMemCmpPass::run(Function &F, FunctionAnalysisManager &FAM) { … } char ExpandMemCmpLegacyPass::ID = …; INITIALIZE_PASS_BEGIN(ExpandMemCmpLegacyPass, DEBUG_TYPE, "Expand memcmp() to load/stores", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(ExpandMemCmpLegacyPass, DEBUG_TYPE, "Expand memcmp() to load/stores", false, false) FunctionPass *llvm::createExpandMemCmpLegacyPass() { … }