//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // \file // Interface file for the IRSimilarityIdentifier for identifying similarities in // IR including the IRInstructionMapper, which maps an Instruction to unsigned // integers. // // Two sequences of instructions are called "similar" if they perform the same // series of operations for all inputs. // // \code // %1 = add i32 %a, 10 // %2 = add i32 %a, %1 // %3 = icmp slt icmp %1, %2 // \endcode // // and // // \code // %1 = add i32 11, %a // %2 = sub i32 %a, %1 // %3 = icmp sgt icmp %2, %1 // \endcode // // ultimately have the same result, even if the inputs, and structure are // slightly different. // // For instructions, we do not worry about operands that do not have fixed // semantic meaning to the program. We consider the opcode that the instruction // has, the types, parameters, and extra information such as the function name, // or comparison predicate. These are used to create a hash to map instructions // to integers to be used in similarity matching in sequences of instructions // // Terminology: // An IRSimilarityCandidate is a region of IRInstructionData (wrapped // Instructions), usually used to denote a region of similarity has been found. // // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally // similar to one another. // //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include <optional> namespace llvm { namespace IRSimilarity { struct IRInstructionDataList; /// This represents what is and is not supported when finding similarity in /// Instructions. /// /// Legal Instructions are considered when looking at similarity between /// Instructions. /// /// Illegal Instructions cannot be considered when looking for similarity /// between Instructions. They act as boundaries between similarity regions. /// /// Invisible Instructions are skipped over during analysis. // TODO: Shared with MachineOutliner enum InstrType { … }; /// This provides the utilities for hashing an Instruction to an unsigned /// integer. Two IRInstructionDatas produce the same hash value when their /// underlying Instructions perform the same operation (even if they don't have /// the same input operands.) /// As a more concrete example, consider the following: /// /// \code /// %add1 = add i32 %a, %b /// %add2 = add i32 %c, %d /// %add3 = add i64 %e, %f /// \endcode /// // Then the IRInstructionData wrappers for these Instructions may be hashed like /// so: /// /// \code /// ; These two adds have the same types and operand types, so they hash to the /// ; same number. /// %add1 = add i32 %a, %b ; Hash: 1 /// %add2 = add i32 %c, %d ; Hash: 1 /// ; This add produces an i64. This differentiates it from %add1 and %add2. So, /// ; it hashes to a different number. /// %add3 = add i64 %e, %f; Hash: 2 /// \endcode /// /// /// This hashing scheme will be used to represent the program as a very long /// string. This string can then be placed in a data structure which can be used /// for similarity queries. /// /// TODO: Handle types of Instructions which can be equal even with different /// operands. (E.g. comparisons with swapped predicates.) /// TODO: Handle CallInsts, which are only checked for function type /// by \ref isSameOperationAs. /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the /// exact same, and some do not. struct IRInstructionData : ilist_node<IRInstructionData, ilist_sentinel_tracking<true>> { … }; struct IRInstructionDataList : simple_ilist<IRInstructionData, ilist_sentinel_tracking<true>> { … }; /// Compare one IRInstructionData class to another IRInstructionData class for /// whether they are performing a the same operation, and can mapped to the /// same value. For regular instructions if the hash value is the same, then /// they will also be close. /// /// \param A - The first IRInstructionData class to compare /// \param B - The second IRInstructionData class to compare /// \returns true if \p A and \p B are similar enough to be mapped to the same /// value. bool isClose(const IRInstructionData &A, const IRInstructionData &B); struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> { … }; /// Helper struct for converting the Instructions in a Module into a vector of /// unsigned integers. This vector of unsigned integers can be thought of as a /// "numeric string". This numeric string can then be queried by, for example, /// data structures that find repeated substrings. /// /// This hashing is done per BasicBlock in the module. To hash Instructions /// based off of their operations, each Instruction is wrapped in an /// IRInstructionData struct. The unsigned integer for an IRInstructionData /// depends on: /// - The hash provided by the IRInstructionData. /// - Which member of InstrType the IRInstructionData is classified as. // See InstrType for more details on the possible classifications, and how they // manifest in the numeric string. /// /// The numeric string for an individual BasicBlock is terminated by an unique /// unsigned integer. This prevents data structures which rely on repetition /// from matching across BasicBlocks. (For example, the SuffixTree.) /// As a concrete example, if we have the following two BasicBlocks: /// \code /// bb0: /// %add1 = add i32 %a, %b /// %add2 = add i32 %c, %d /// %add3 = add i64 %e, %f /// bb1: /// %sub = sub i32 %c, %d /// \endcode /// We may hash the Instructions like this (via IRInstructionData): /// \code /// bb0: /// %add1 = add i32 %a, %b ; Hash: 1 /// %add2 = add i32 %c, %d; Hash: 1 /// %add3 = add i64 %e, %f; Hash: 2 /// bb1: /// %sub = sub i32 %c, %d; Hash: 3 /// %add4 = add i32 %c, %d ; Hash: 1 /// \endcode /// And produce a "numeric string representation" like so: /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2 /// /// TODO: This is very similar to the MachineOutliner, and should be /// consolidated into the same interface. struct IRInstructionMapper { … }; /// This is a class that wraps a range of IRInstructionData from one point to /// another in the vector of IRInstructionData, which is a region of the /// program. It is also responsible for defining the structure within this /// region of instructions. /// /// The structure of a region is defined through a value numbering system /// assigned to each unique value in a region at the creation of the /// IRSimilarityCandidate. /// /// For example, for each Instruction we add a mapping for each new /// value seen in that Instruction. /// IR: Mapping Added: /// %add1 = add i32 %a, c1 %add1 -> 3, %a -> 1, c1 -> 2 /// %add2 = add i32 %a, %1 %add2 -> 4 /// %add3 = add i32 c2, c1 %add3 -> 6, c2 -> 5 /// /// We can compare IRSimilarityCandidates against one another. /// The \ref isSimilar function compares each IRInstructionData against one /// another and if we have the same sequences of IRInstructionData that would /// create the same hash, we have similar IRSimilarityCandidates. /// /// We can also compare the structure of IRSimilarityCandidates. If we can /// create a mapping of registers in the region contained by one /// IRSimilarityCandidate to the region contained by different /// IRSimilarityCandidate, they can be considered structurally similar. /// /// IRSimilarityCandidate1: IRSimilarityCandidate2: /// %add1 = add i32 %a, %b %add1 = add i32 %d, %e /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 /// /// Can have the following mapping from candidate to candidate of: /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4 /// and can be considered similar. /// /// IRSimilarityCandidate1: IRSimilarityCandidate2: /// %add1 = add i32 %a, %b %add1 = add i32 %d, c4 /// %add2 = add i32 %a, %c %add2 = add i32 %d, %f /// %add3 = add i32 c1, c2 %add3 = add i32 c3, c4 /// /// We cannot create the same mapping since the use of c4 is not used in the /// same way as %b or c2. class IRSimilarityCandidate { … }; CandidateGVNMapping; SimilarityGroup; SimilarityGroupList; /// This class puts all the pieces of the IRInstructionData, /// IRInstructionMapper, IRSimilarityCandidate together. /// /// It first feeds the Module or vector of Modules into the IRInstructionMapper, /// and puts all the mapped instructions into a single long list of /// IRInstructionData. /// /// The list of unsigned integers is given to the Suffix Tree or similar data /// structure to find repeated subsequences. We construct an /// IRSimilarityCandidate for each instance of the subsequence. We compare them /// against one another since These repeated subsequences can have different /// structure. For each different kind of structure found, we create a /// similarity group. /// /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are /// structurally similar to one another, while C is different we would have two /// SimilarityGroups: /// /// SimilarityGroup 1: SimilarityGroup 2 /// A, B, D C /// /// A list of the different similarity groups is then returned after /// analyzing the module. class IRSimilarityIdentifier { … }; } // end namespace IRSimilarity /// An analysis pass based on legacy pass manager that runs and returns /// IRSimilarityIdentifier run on the Module. class IRSimilarityIdentifierWrapperPass : public ModulePass { … }; /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the /// Module. class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> { … }; /// Printer pass that uses \c IRSimilarityAnalysis. class IRSimilarityAnalysisPrinterPass : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> { … }; } // end namespace llvm #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H