// FuzzyMatch.h | Explore in Territory

//===--- FuzzyMatch.h - Approximate identifier matching  ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements fuzzy-matching of strings against identifiers.
// It indicates both the existence and quality of a match:
// 'eb' matches both 'emplace_back' and 'embed', the former has a better score.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H
#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

namespace clang {
namespace clangd {

// Utilities for word segmentation.
// FuzzyMatcher already incorporates this logic, so most users don't need this.
//
// A name like "fooBar_baz" consists of several parts: foo, bar, baz.
// Aligning segmentation of word and pattern improves the fuzzy-match.
// For example: [lol] matches "LaughingOutLoud" better than "LionPopulation".
//
// First we classify each character into types (uppercase, lowercase, etc).
// Then we look at the sequence: e.g. [upper, lower] is the start of a segment.

// Character classes that influence segmentation.
// Digits have no obvious segmentation of their own, so they are treated as
// lowercase letters.
// UTF-8 is not decoded: bytes over 127 are treated as lowercase too, which
// means those characters only match exactly (case-sensitively).
enum CharType : unsigned char { … };
// A CharTypeSet is a bitfield representing all the character types in a word.
// Its bits are 1<<Empty, 1<<Lower, etc. (one bit per CharType value).
using CharTypeSet = unsigned char;

// Every character plays exactly one Role: it is the Head of a segment, part
// of a segment's Tail, or a Separator between segments.
// In the diagram below '+' marks a Head, '-' a Tail, and a blank a Separator.
// e.g. XMLHttpRequest_Async
//      +--+---+------ +----
//      ^Head   ^Tail ^Separator
enum CharRole : unsigned char { … };

// Computes the segmentation of Text.
// Character roles are stored in Roles; Roles.size() must equal Text.size().
// Returns the set of character types encountered. This may inform heuristics
// for dealing with poorly-segmented identifiers like "strndup".
CharTypeSet calculateRoles(llvm::StringRef Text,
                           llvm::MutableArrayRef<CharRole> Roles);

// Matches and scores strings against a single pattern.
// It is optimized for matching that one pattern against many strings:
// match() does not allocate.
class FuzzyMatcher { … };

} // namespace clangd
} // namespace clang

#endif
// llvm/clang-tools-extra/clangd/FuzzyMatch.h