ngram_extractor.h | Explore in Territory

// Copyright 2016 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
#define COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_

#include <stddef.h>

#include <iterator>
#include <string_view>
#include <type_traits>

#include "base/check_op.h"
#include "base/memory/raw_ref.h"
#include "base/strings/string_util.h"

namespace url_pattern_index {

// The class used to iteratively extract N-grams from strings. An N-gram is a
// string consisting of N (up to 8) non-special characters, which are stored in
// the lowest N non-zero bytes, lower bytes corresponding to later symbols. The
// size of the integer type limits the maximum value of N. For example, a
// uint64_t can store up to 8-grams.
//
// Note: If used for UTF-8 strings, the N-grams can have partial byte sequences.
//
// Template parameters:
//  * N - the size of N-grams.
//  * NGramType - the integer type used to encode N-grams.
//  * CasePolicy - whether or not to lower-case the N-grams. Assumes ASCII.
//  * IsSeparator - the type of a bool(char) functor.
// NGramCaseExtraction is the type of the |CasePolicy| template parameter of
// NGramExtractor and CreateNGramExtractor; it selects whether or not the
// extracted N-grams are lower-cased.
enum class NGramCaseExtraction { … };
// Iteratively extracts N-grams from a string. The encoding of an N-gram and
// the meaning of each template parameter are described in the comment block
// that precedes the NGramCaseExtraction declaration.
template <size_t N,                        // Size of the N-grams.
          typename NGramType,              // Integer type encoding an N-gram.
          NGramCaseExtraction CasePolicy,  // Whether to lower-case (ASCII).
          typename IsSeparator>            // Type of a bool(char) functor.
class NGramExtractor { … };

// A helper function used to create an NGramExtractor for a |string| without
// knowing the direct type of the |is_separator| functor.
//
// Typical usage:
//   const char* str = "no*abacaba*abcd";
//   auto extractor =
//     CreateNGramExtractor<5, uint64_t, NGramCaseExtraction::kLowercase>(
//       str, [](char c) { return c == '*'; });
//   for (uint64_t ngram : extractor) {
//     ... process the |ngram| ...
//   }
//
// |N|, |NGramType| and |CasePolicy| are specified explicitly by the caller,
// while |IsSeparator| is deduced from the |is_separator| argument. Returns, by
// value, an NGramExtractor instantiated with those four template arguments.
template <size_t N,
          typename NGramType,
          NGramCaseExtraction CasePolicy,
          typename IsSeparator>
NGramExtractor<N, NGramType, CasePolicy, IsSeparator> CreateNGramExtractor(
    std::string_view string,     // The string to extract N-grams from.
    IsSeparator is_separator) { … }  // A bool(char) functor.

}  // namespace url_pattern_index

#endif  // COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_
chromium/components/url_pattern_index/ngram_extractor.h