llvm/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h

//===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// Interface file for the IRSimilarityIdentifier for identifying similarities in
// IR including the IRInstructionMapper, which maps an Instruction to unsigned
// integers.
//
// Two sequences of instructions are called "similar" if they perform the same
// series of operations for all inputs.
//
// \code
// %1 = add i32 %a, 10
// %2 = add i32 %a, %1
// %3 = icmp slt icmp %1, %2
// \endcode
//
// and
//
// \code
// %1 = add i32 11, %a
// %2 = sub i32 %a, %1
// %3 = icmp sgt icmp %2, %1
// \endcode
//
// ultimately have the same result, even if the inputs, and structure are
// slightly different.
//
// For instructions, we do not worry about operands that do not have fixed
// semantic meaning to the program.  We consider the opcode that the instruction
// has, the types, parameters, and extra information such as the function name,
// or comparison predicate.  These are used to create a hash to map instructions
// to integers to be used in similarity matching in sequences of instructions
//
// Terminology:
// An IRSimilarityCandidate is a region of IRInstructionData (wrapped
// Instructions), usually used to denote a region of similarity has been found.
//
// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
// similar to one another.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
#define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H

#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
#include <optional>

namespace llvm {

namespace IRSimilarity {

struct IRInstructionDataList;

/// This represents what is and is not supported when finding similarity in
/// Instructions.
///
/// Legal Instructions are considered when looking at similarity between
/// Instructions.
///
/// Illegal Instructions cannot be considered when looking for similarity
/// between Instructions. They act as boundaries between similarity regions.
///
/// Invisible Instructions are skipped over during analysis.
// TODO: Shared with MachineOutliner
enum InstrType {};

/// This provides the utilities for hashing an Instruction to an unsigned
/// integer. Two IRInstructionDatas produce the same hash value when their
/// underlying Instructions perform the same operation (even if they don't have
/// the same input operands.)
/// As a more concrete example, consider the following:
///
/// \code
/// %add1 = add i32 %a, %b
/// %add2 = add i32 %c, %d
/// %add3 = add i64 %e, %f
/// \endcode
///
// Then the IRInstructionData wrappers for these Instructions may be hashed like
/// so:
///
/// \code
/// ; These two adds have the same types and operand types, so they hash to the
/// ; same number.
/// %add1 = add i32 %a, %b ; Hash: 1
/// %add2 = add i32 %c, %d ; Hash: 1
/// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
/// ; it hashes to a different number.
/// %add3 = add i64 %e, %f; Hash: 2
/// \endcode
///
///
/// This hashing scheme will be used to represent the program as a very long
/// string. This string can then be placed in a data structure which can be used
/// for similarity queries.
///
/// TODO: Handle types of Instructions which can be equal even with different
/// operands. (E.g. comparisons with swapped predicates.)
/// TODO: Handle CallInsts, which are only checked for function type
/// by \ref isSameOperationAs.
/// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
/// exact same, and some do not.
struct IRInstructionData
    : ilist_node<IRInstructionData, ilist_sentinel_tracking<true>> {};

struct IRInstructionDataList
    : simple_ilist<IRInstructionData, ilist_sentinel_tracking<true>> {};

/// Compare one IRInstructionData class to another IRInstructionData class for
/// whether they are performing a the same operation, and can mapped to the
/// same value. For regular instructions if the hash value is the same, then
/// they will also be close.
///
/// \param A - The first IRInstructionData class to compare
/// \param B - The second IRInstructionData class to compare
/// \returns true if \p A and \p B are similar enough to be mapped to the same
/// value.
bool isClose(const IRInstructionData &A, const IRInstructionData &B);

struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {};

/// Helper struct for converting the Instructions in a Module into a vector of
/// unsigned integers. This vector of unsigned integers can be thought of as a
/// "numeric string". This numeric string can then be queried by, for example,
/// data structures that find repeated substrings.
///
/// This hashing is done per BasicBlock in the module. To hash Instructions
/// based off of their operations, each Instruction is wrapped in an
/// IRInstructionData struct. The unsigned integer for an IRInstructionData
/// depends on:
/// - The hash provided by the IRInstructionData.
/// - Which member of InstrType the IRInstructionData is classified as.
// See InstrType for more details on the possible classifications, and how they
// manifest in the numeric string.
///
/// The numeric string for an individual BasicBlock is terminated by an unique
/// unsigned integer. This prevents data structures which rely on repetition
/// from matching across BasicBlocks. (For example, the SuffixTree.)
/// As a concrete example, if we have the following two BasicBlocks:
/// \code
/// bb0:
/// %add1 = add i32 %a, %b
/// %add2 = add i32 %c, %d
/// %add3 = add i64 %e, %f
/// bb1:
/// %sub = sub i32 %c, %d
/// \endcode
/// We may hash the Instructions like this (via IRInstructionData):
/// \code
/// bb0:
/// %add1 = add i32 %a, %b ; Hash: 1
/// %add2 = add i32 %c, %d; Hash: 1
/// %add3 = add i64 %e, %f; Hash: 2
/// bb1:
/// %sub = sub i32 %c, %d; Hash: 3
/// %add4 = add i32 %c, %d ; Hash: 1
/// \endcode
/// And produce a "numeric string representation" like so:
/// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
///
/// TODO: This is very similar to the MachineOutliner, and should be
/// consolidated into the same interface.
struct IRInstructionMapper {};

/// This is a class that wraps a range of IRInstructionData from one point to
/// another in the vector of IRInstructionData, which is a region of the
/// program.  It is also responsible for defining the structure within this
/// region of instructions.
///
/// The structure of a region is defined through a value numbering system
/// assigned to each unique value in a region at the creation of the
/// IRSimilarityCandidate.
///
/// For example, for each Instruction we add a mapping for each new
/// value seen in that Instruction.
/// IR:                    Mapping Added:
/// %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
/// %add2 = add i32 %a, %1    %add2 -> 4
/// %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
///
/// We can compare IRSimilarityCandidates against one another.
/// The \ref isSimilar function compares each IRInstructionData against one
/// another and if we have the same sequences of IRInstructionData that would
/// create the same hash, we have similar IRSimilarityCandidates.
///
/// We can also compare the structure of IRSimilarityCandidates. If we can
/// create a mapping of registers in the region contained by one
/// IRSimilarityCandidate to the region contained by different
/// IRSimilarityCandidate, they can be considered structurally similar.
///
/// IRSimilarityCandidate1:   IRSimilarityCandidate2:
/// %add1 = add i32 %a, %b    %add1 = add i32 %d, %e
/// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
/// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
///
/// Can have the following mapping from candidate to candidate of:
/// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
/// and can be considered similar.
///
/// IRSimilarityCandidate1:   IRSimilarityCandidate2:
/// %add1 = add i32 %a, %b    %add1 = add i32 %d, c4
/// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
/// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
///
/// We cannot create the same mapping since the use of c4 is not used in the
/// same way as %b or c2.
class IRSimilarityCandidate {};

CandidateGVNMapping;
SimilarityGroup;
SimilarityGroupList;

/// This class puts all the pieces of the IRInstructionData,
/// IRInstructionMapper, IRSimilarityCandidate together.
///
/// It first feeds the Module or vector of Modules into the IRInstructionMapper,
/// and puts all the mapped instructions into a single long list of
/// IRInstructionData.
///
/// The list of unsigned integers is given to the Suffix Tree or similar data
/// structure to find repeated subsequences.  We construct an
/// IRSimilarityCandidate for each instance of the subsequence.  We compare them
/// against one another since  These repeated subsequences can have different
/// structure.  For each different kind of structure found, we create a
/// similarity group.
///
/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
/// structurally similar to one another, while C is different we would have two
/// SimilarityGroups:
///
/// SimilarityGroup 1:  SimilarityGroup 2
/// A, B, D             C
///
/// A list of the different similarity groups is then returned after
/// analyzing the module.
class IRSimilarityIdentifier {};

} // end namespace IRSimilarity

/// An analysis pass based on legacy pass manager that runs and returns
/// IRSimilarityIdentifier run on the Module.
class IRSimilarityIdentifierWrapperPass : public ModulePass {};

/// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
/// Module.
class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {};

/// Printer pass that uses \c IRSimilarityAnalysis.
class IRSimilarityAnalysisPrinterPass
    : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {};

} // end namespace llvm

#endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H