Clustering.cpp | Explore in Territory

//===-- Clustering.cpp ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "Clustering.h"
#include "Error.h"
#include "SchedClassResolution.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include <algorithm>
#include <deque>
#include <string>
#include <vector>

namespace llvm {
namespace exegesis {

// The clustering problem has the following characteristics:
//  (A) - Low dimension (dimensions are typically proc resource units,
//    typically < 10).
//  (B) - Number of points : ~thousands (points are measurements of an MCInst)
//  (C) - Number of clusters: ~tens.
//  (D) - The number of clusters is not known /a priory/.
//  (E) - The amount of noise is relatively small.
// The problem is rather small. In terms of algorithms, (D) disqualifies
// k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable.
//
// We've used DBSCAN here because it's simple to implement. This is a pretty
// straightforward and inefficient implementation of the pseudocode in [2].
//
// [1] https://en.wikipedia.org/wiki/DBSCAN
// [2] https://en.wikipedia.org/wiki/OPTICS_algorithm

// Finds the points at distance less than sqrt(EpsilonSquared) of Q (not
// including Q).
void BenchmarkClustering::rangeQuery(
    const size_t Q, std::vector<size_t> &Neighbors) const { … }

// Given a set of points, checks that all the points are neighbours
// up to AnalysisClusteringEpsilon. This is O(2*N).
bool BenchmarkClustering::areAllNeighbours(
    ArrayRef<size_t> Pts) const { … }

BenchmarkClustering::BenchmarkClustering(
    const std::vector<Benchmark> &Points,
    const double AnalysisClusteringEpsilonSquared)
    : … { … }

Error BenchmarkClustering::validateAndSetup() { … }

void BenchmarkClustering::clusterizeDbScan(const size_t MinPts) { … }

void BenchmarkClustering::clusterizeNaive(
    const MCSubtargetInfo &SubtargetInfo, const MCInstrInfo &InstrInfo) { … }

// Given an instruction Opcode, we can make benchmarks (measurements) of the
// instruction characteristics/performance. Then, to facilitate further analysis
// we group the benchmarks with *similar* characteristics into clusters.
// Now, this is all not entirely deterministic. Some instructions have variable
// characteristics, depending on their arguments. And thus, if we do several
// benchmarks of the same instruction Opcode, we may end up with *different*
// performance characteristics measurements. And when we then do clustering,
// these several benchmarks of the same instruction Opcode may end up being
// clustered into *different* clusters. This is not great for further analysis.
// We shall find every opcode with benchmarks not in just one cluster, and move
// *all* the benchmarks of said Opcode into one new unstable cluster per Opcode.
void BenchmarkClustering::stabilize(unsigned NumOpcodes) { … }

Expected<BenchmarkClustering> BenchmarkClustering::create(
    const std::vector<Benchmark> &Points, const ModeE Mode,
    const size_t DbscanMinPts, const double AnalysisClusteringEpsilon,
    const MCSubtargetInfo *SubtargetInfo, const MCInstrInfo *InstrInfo) { … }

void SchedClassClusterCentroid::addPoint(ArrayRef<BenchmarkMeasure> Point) { … }

std::vector<BenchmarkMeasure> SchedClassClusterCentroid::getAsPoint() const { … }

bool SchedClassClusterCentroid::validate(
    Benchmark::ModeE Mode) const { … }

} // namespace exegesis
} // namespace llvm
llvm/llvm/tools/llvm-exegesis/lib/Clustering.cpp