FuzzyMatch.cpp | Explore in Territory

//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'),
// we consider the possible partial match states:
//
//     u n i q u e _ p t r
//   +---------------------
//   |A . . . . . . . . . .
//  u|
//   |. . . . . . . . . . .
//  _|
//   |. . . . . . . O . . .
//  p|
//   |. . . . . . . . . . B
//
// Each dot represents some prefix of the pattern being matched against some
// prefix of the word.
//   - A is the initial state: '' matched against ''
//   - O is an intermediate state: 'u_' matched against 'unique_'
//   - B is the target state: 'u_p' matched against 'unique_ptr'
//
// We aim to find the best path from A->B.
//  - Moving right (consuming a word character)
//    Always legal: not all word characters must match.
//  - Moving diagonally (consuming both a word and pattern character)
//    Legal if the characters match.
//  - Moving down (consuming a pattern character) is never legal.
//    Never legal: all pattern characters must match something.
// Characters are matched case-insensitively.
// The first pattern character may only match the start of a word segment.
//
// The scoring is based on heuristics:
//  - when matching a character, apply a bonus or penalty depending on the
//    match quality (does case match, do word segments align, etc)
//  - when skipping a character, apply a penalty if it hurts the match
//    (it starts a word segment, or splits the matched region, etc)
//
// These heuristics require the ability to "look backward" one character, to
// see whether it was matched or not. Therefore the dynamic-programming matrix
// has an extra dimension (last character matched).
// Each entry also has an additional flag indicating whether the last-but-one
// character matched, which is needed to trace back through the scoring table
// and reconstruct the match.
//
// We treat strings as byte-sequences, so only ASCII has first-class support.
//
// This algorithm was inspired by VS code's client-side filtering, and aims
// to be mostly-compatible.
//
//===----------------------------------------------------------------------===//

#include "FuzzyMatch.h"
#include "llvm/Support/Format.h"
#include <optional>

namespace clang {
namespace clangd {

constexpr int FuzzyMatcher::MaxPat;
constexpr int FuzzyMatcher::MaxWord;

static char lower(char C) { … }
// A "negative infinity" score that won't overflow.
// We use this to mark unreachable states and forbidden solutions.
// Score field is 15 bits wide, min value is -2^14, we use half of that.
static constexpr int AwfulScore = …;
static bool isAwful(int S) { … }
static constexpr int PerfectBonus = …; // Perfect per-pattern-char score.

FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern)
    : … { … }

std::optional<float> FuzzyMatcher::match(llvm::StringRef Word) { … }

// We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte.
// The top 6 bits of the char select the byte, the bottom 2 select the offset.
// e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower.
constexpr static uint8_t CharTypes[] = …;

// The Role can be determined from the Type of a character and its neighbors:
//
//   Example  | Chars | Type | Role
//   ---------+--------------+-----
//   F(o)oBar | Foo   | Ull  | Tail
//   Foo(B)ar | oBa   | lUl  | Head
//   (f)oo    | ^fo   | Ell  | Head
//   H(T)TP   | HTT   | UUU  | Tail
//
// Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role.
// A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset.
// e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head.
constexpr static uint8_t CharRoles[] = …;

template <typename T> static T packedLookup(const uint8_t *Data, int I) { … }
CharTypeSet calculateRoles(llvm::StringRef Text,
                           llvm::MutableArrayRef<CharRole> Roles) { … }

// Sets up the data structures matching Word.
// Returns false if we can cheaply determine that no match is possible.
bool FuzzyMatcher::init(llvm::StringRef NewWord) { … }

// The forwards pass finds the mappings of Pattern onto Word.
// Score = best score achieved matching Word[..W] against Pat[..P].
// Unlike other tables, indices range from 0 to N *inclusive*
// Matched = whether we chose to match Word[W] with Pat[P] or not.
//
// Points are mostly assigned to matched characters, with 1 being a good score
// and 3 being a great one. So we treat the score range as [0, 3 * PatN].
// This range is not strict: we can apply larger bonuses/penalties, or penalize
// non-matched characters.
void FuzzyMatcher::buildGraph() { … }

bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const { … }

int FuzzyMatcher::skipPenalty(int W, Action Last) const { … }

int FuzzyMatcher::matchBonus(int P, int W, Action Last) const { … }

llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const { … }

} // namespace clangd
} // namespace clang
llvm/clang-tools-extra/clangd/FuzzyMatch.cpp