//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'), // we consider the possible partial match states: // // u n i q u e _ p t r // +--------------------- // |A . . . . . . . . . . // u| // |. . . . . . . . . . . // _| // |. . . . . . . O . . . // p| // |. . . . . . . . . . B // // Each dot represents some prefix of the pattern being matched against some // prefix of the word. // - A is the initial state: '' matched against '' // - O is an intermediate state: 'u_' matched against 'unique_' // - B is the target state: 'u_p' matched against 'unique_ptr' // // We aim to find the best path from A->B. // - Moving right (consuming a word character) // Always legal: not all word characters must match. // - Moving diagonally (consuming both a word and pattern character) // Legal if the characters match. // - Moving down (consuming a pattern character) is never legal. // Never legal: all pattern characters must match something. // Characters are matched case-insensitively. // The first pattern character may only match the start of a word segment. // // The scoring is based on heuristics: // - when matching a character, apply a bonus or penalty depending on the // match quality (does case match, do word segments align, etc) // - when skipping a character, apply a penalty if it hurts the match // (it starts a word segment, or splits the matched region, etc) // // These heuristics require the ability to "look backward" one character, to // see whether it was matched or not. Therefore the dynamic-programming matrix // has an extra dimension (last character matched). // Each entry also has an additional flag indicating whether the last-but-one // character matched, which is needed to trace back through the scoring table // and reconstruct the match. // // We treat strings as byte-sequences, so only ASCII has first-class support. // // This algorithm was inspired by VS code's client-side filtering, and aims // to be mostly-compatible. // //===----------------------------------------------------------------------===// #include "FuzzyMatch.h" #include "llvm/Support/Format.h" #include <optional> namespace clang { namespace clangd { constexpr int FuzzyMatcher::MaxPat; constexpr int FuzzyMatcher::MaxWord; static char lower(char C) { … } // A "negative infinity" score that won't overflow. // We use this to mark unreachable states and forbidden solutions. // Score field is 15 bits wide, min value is -2^14, we use half of that. static constexpr int AwfulScore = …; static bool isAwful(int S) { … } static constexpr int PerfectBonus = …; // Perfect per-pattern-char score. FuzzyMatcher::FuzzyMatcher(llvm::StringRef Pattern) : … { … } std::optional<float> FuzzyMatcher::match(llvm::StringRef Word) { … } // We get CharTypes from a lookup table. Each is 2 bits, 4 fit in each byte. // The top 6 bits of the char select the byte, the bottom 2 select the offset. // e.g. 'q' = 010100 01 = byte 28 (55), bits 3-2 (01) -> Lower. constexpr static uint8_t CharTypes[] = …; // The Role can be determined from the Type of a character and its neighbors: // // Example | Chars | Type | Role // ---------+--------------+----- // F(o)oBar | Foo | Ull | Tail // Foo(B)ar | oBa | lUl | Head // (f)oo | ^fo | Ell | Head // H(T)TP | HTT | UUU | Tail // // Our lookup table maps a 6 bit key (Prev, Curr, Next) to a 2-bit Role. // A byte packs 4 Roles. (Prev, Curr) selects a byte, Next selects the offset. // e.g. Lower, Upper, Lower -> 01 10 01 -> byte 6 (aa), bits 3-2 (10) -> Head. constexpr static uint8_t CharRoles[] = …; template <typename T> static T packedLookup(const uint8_t *Data, int I) { … } CharTypeSet calculateRoles(llvm::StringRef Text, llvm::MutableArrayRef<CharRole> Roles) { … } // Sets up the data structures matching Word. // Returns false if we can cheaply determine that no match is possible. bool FuzzyMatcher::init(llvm::StringRef NewWord) { … } // The forwards pass finds the mappings of Pattern onto Word. // Score = best score achieved matching Word[..W] against Pat[..P]. // Unlike other tables, indices range from 0 to N *inclusive* // Matched = whether we chose to match Word[W] with Pat[P] or not. // // Points are mostly assigned to matched characters, with 1 being a good score // and 3 being a great one. So we treat the score range as [0, 3 * PatN]. // This range is not strict: we can apply larger bonuses/penalties, or penalize // non-matched characters. void FuzzyMatcher::buildGraph() { … } bool FuzzyMatcher::allowMatch(int P, int W, Action Last) const { … } int FuzzyMatcher::skipPenalty(int W, Action Last) const { … } int FuzzyMatcher::matchBonus(int P, int W, Action Last) const { … } llvm::SmallString<256> FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const { … } } // namespace clangd } // namespace clang