// Copyright 2016 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_ #define COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_ #include <stddef.h> #include <iterator> #include <string_view> #include <type_traits> #include "base/check_op.h" #include "base/memory/raw_ref.h" #include "base/strings/string_util.h" namespace url_pattern_index { // The class used to iteratively extract N-grams from strings. An N-gram is a // string consisting of N (up to 8) non-special characters, which are stored in // the lowest N non-zero bytes, lower bytes corresponding to later symbols. The // size of the integer type limits the maximum value of N. For example an // uint64_t can store up to 8-grams. // // Note: If used for UTF-8 strings, the N-grams can have partial byte sequences. // // Template parameters: // * N - the size of N-grams. // * NGramType - the integer type used to encode N-grams. // * CasePolicy - whether or not to lower-case the N-grams. Assumes ASCII. // * IsSeparator - the type of a bool(char) functor. enum class NGramCaseExtraction { … }; template <size_t N, typename NGramType, NGramCaseExtraction CasePolicy, typename IsSeparator> class NGramExtractor { … }; // A helper function used to create an NGramExtractor for a |string| without // knowing the direct type of the |is_separator| functor. // // Typical usage: // const char* str = "no*abacaba*abcd"; // auto extractor = // CreateNGramExtractor<5, uint64_t, NGramCaseExtraction::kLowercase>( // str, [](char c) { return c == '*'; }); // for (uint64_t ngram : extractor) { // ... process the |ngram| ... 
// } template <size_t N, typename NGramType, NGramCaseExtraction CasePolicy, typename IsSeparator> NGramExtractor<N, NGramType, CasePolicy, IsSeparator> CreateNGramExtractor( std::string_view string, IsSeparator is_separator) { … } } // namespace url_pattern_index #endif // COMPONENTS_URL_PATTERN_INDEX_NGRAM_EXTRACTOR_H_