// Copyright 2021 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifdef UNSAFE_BUFFERS_BUILD // TODO(crbug.com/40285824): Remove this and convert code to safer constructs. #pragma allow_unsafe_buffers #endif #include "components/language_detection/core/ngram_hash.h" #include <string> #include "components/language_detection/core/ngram_hash_ops_utils.h" #include "third_party/flatbuffers/src/include/flatbuffers/flexbuffers.h" #include "third_party/flatbuffers/src/include/flatbuffers/util.h" #include "third_party/smhasher/src/MurmurHash2.h" #include "third_party/tflite/src/tensorflow/lite/kernels/kernel_util.h" #include "third_party/tflite/src/tensorflow/lite/string_util.h" namespace language_detection { namespace { GetRoot; Map; TypedVector; GetString; StringRef; constexpr int kInputMessage = …; constexpr int kOutputLabel = …; constexpr int kDefaultMaxSplits = …; // This op takes in a string, finds the character ngrams for it and then // maps each of these ngrams to an index using the specified vocabulary sizes. // Input(s): // - input: Input string. // - seeds: Seed for the random number generator. // - ngram_lengths: Lengths of each of the ngrams. For example [1, 2, 3] would // be interpreted as generating unigrams, bigrams, and trigrams. // - vocab_sizes: Size of the vocabulary for each of the ngram features // respectively. The op would generate vocab ids to be less than or equal to // the vocab size. The index 0 implies an invalid ngram. // - max_splits: Maximum number of tokens in the output. If this is unset, the // limit is `kDefaultMaxSplits`. // - lower_case_input: If this is set to true, the input string would be // lower-cased before any processing. // Output(s): // - output: A tensor of size [number of ngrams, number of tokens + 2], // where 2 tokens are reserved for the padding. If `max_splits` is set, this // length is <= max_splits, otherwise it is <= `kDefaultMaxSplits`. // Helper class used for pre-processing the input. class NGramHashParams { … }; // Convert the TypedVector into a regular std::vector. std::vector<int> GetIntVector(TypedVector typed_vec) { … } void GetNGramHashIndices(NGramHashParams* params, int32_t* data) { … } void* Init(TfLiteContext* context, const char* buffer, size_t length) { … } void Free(TfLiteContext* context, void* buffer) { … } TfLiteStatus Resize(TfLiteContext* context, TfLiteNode* node) { … } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { … } } // namespace TfLiteRegistration* Register_NGRAM_HASH() { … } } // namespace language_detection