ngram_hash.cc | Explore in Territory

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

#include "components/language_detection/core/ngram_hash.h"

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

#include "components/language_detection/core/ngram_hash_ops_utils.h"
#include "third_party/flatbuffers/src/include/flatbuffers/flexbuffers.h"
#include "third_party/flatbuffers/src/include/flatbuffers/util.h"
#include "third_party/smhasher/src/MurmurHash2.h"
#include "third_party/tflite/src/tensorflow/lite/kernels/kernel_util.h"
#include "third_party/tflite/src/tensorflow/lite/string_util.h"

namespace language_detection {

namespace {

// NOTE(review): the five names below are presumably using-declarations for
// helpers from the flexbuffers / tflite headers included above — confirm.
GetRoot;
Map;
TypedVector;
GetString;
StringRef;
// NOTE(review): presumably the indices of the op's input and output tensors —
// confirm against Resize()/Eval().
constexpr int kInputMessage = …;
constexpr int kOutputLabel = …;
// Limit on the number of output tokens that applies when `max_splits` is unset.
constexpr int kDefaultMaxSplits = …;

// This op takes in a string, finds the character ngrams for it and then
// maps each of these ngrams to an index using the specified vocabulary sizes.
// Input(s):
// - input: Input string.
// - seeds: Seed for the random number generator.
// - ngram_lengths: Lengths of each of the ngrams. For example [1, 2, 3] would
//   be interpreted as generating unigrams, bigrams, and trigrams.
// - vocab_sizes: Size of the vocabulary for each of the ngram features
//   respectively. The op generates vocab ids that are less than or equal to
//   the vocab size. The index 0 denotes an invalid ngram.
// - max_splits: Maximum number of tokens in the output. If this is unset, the
//   limit is `kDefaultMaxSplits`.
// - lower_case_input: If this is set to true, the input string is
//   lower-cased before any processing.
// Output(s):
// - output: A tensor of size [number of ngrams, number of tokens + 2],
//   where 2 tokens are reserved for the padding. If `max_splits` is set, this
//   length is <= max_splits, otherwise it is <= `kDefaultMaxSplits`.
// Helper class used for pre-processing the input.
// NOTE(review): presumably also carries the per-op options listed in the op
// description (seeds, ngram_lengths, vocab_sizes, max_splits,
// lower_case_input) — confirm against the class members.
class NGramHashParams { … };

// Convert the TypedVector into a regular std::vector.
// `typed_vec` is taken by value; the result is returned as a new
// std::vector<int>.
std::vector<int> GetIntVector(TypedVector typed_vec) { … }

// NOTE(review): presumably computes the ngram hash indices described in the op
// comment from `params` and writes them through `data` as int32 values —
// confirm. Both arguments are raw pointers; whether null is tolerated and how
// large `data` must be are not visible from this declaration.
void GetNGramHashIndices(NGramHashParams* params, int32_t* data) { … }

// Takes a `buffer` of `length` bytes and returns an opaque pointer.
// NOTE(review): presumably the TFLite `init` hook, parsing the op's serialized
// (flexbuffer) options into an object that Free() later releases — confirm.
void* Init(TfLiteContext* context, const char* buffer, size_t length) { … }

// NOTE(review): presumably the TFLite `free` hook, releasing the object that
// Init() returned and that is passed back here as `buffer` — confirm.
void Free(TfLiteContext* context, void* buffer) { … }

// Reports success or failure for `node` as a TfLiteStatus.
// NOTE(review): presumably the TFLite `prepare` hook, sizing the output tensor
// to the shape given in the op description — confirm.
TfLiteStatus Resize(TfLiteContext* context, TfLiteNode* node) { … }

// Reports success or failure for `node` as a TfLiteStatus.
// NOTE(review): presumably the TFLite `invoke` hook, reading the input string
// and filling the output tensor with ngram hash indices — confirm.
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { … }

}  // namespace

// Returns a pointer to a TfLiteRegistration. This is the only function in this
// file defined outside the anonymous namespace.
// NOTE(review): presumably the registration wires up Init/Free/Resize/Eval for
// the NGRAM_HASH op; ownership and lifetime of the returned pointer are not
// visible from this declaration — confirm.
TfLiteRegistration* Register_NGRAM_HASH() { … }

}  // namespace language_detection
chromium/components/language_detection/core/ngram_hash.cc