sequence_string_projection.cc | Explore in Territory

/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/**
 * Sequence String projection op used in PRADO.
 */
#include "tflite_ops/sequence_string_projection.h"  // seq_flow_lite

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <unordered_map>

#include "flatbuffers/flexbuffers.h"  // flatbuffer
#include "tensorflow/lite/string_util.h"
#include "tf_ops/projection_normalizer_util.h"  // seq_flow_lite
#include "tf_ops/projection_util.h"             // seq_flow_lite
#include "tflite_ops/quantization_util.h"       // seq_flow_lite

namespace seq_flow_lite {
namespace ops {
namespace custom {

namespace sequence_string_projection {
/**
 * This op, referred to as the Ternary Sequence String Projection op (TSP),
 * tokenizes input text either on space or on unicode boundaries. A fingerprint
 * for each token is computed using murmur hash, and bit features are extracted
 * from the fingerprint by mapping every 2 bits to the ternary output
 * {-1, 0, 1}. This effectively turns a text input into a ternary rank 3 tensor
 * (in 8bit/float format) of shape [1, max token length, requested number of
 * features].
 *
 * Input:
 *   tensor[0]: Input message, string[num_batch]
 *   attribute[0]: feature size
 *   attribute[1]: vocabulary, a set of allowed characters in utf8 format.
 *   attribute[2]: split_on_space, a boolean specifying the tokenization method.
 *   attribute[3]: max_splits, maximum number of splits allowed during
 *                 tokenization. When max_splits is set to -1, no limit on
 *                 number of tokens is imposed. When it is set to a positive
 *                 integer, number of tokens is truncated beyond that integer.
 *                 An end of input token is always added after tokenization,
 *                 hence the number of tokens is one more than the true number
 *                 of tokens. As a result, the number of tokens returned by this
 *                 op is not the same as absl::StrSplit.
 *   attribute[4]: word_novelty_bits, when set to a positive value less than 8,
 *                 generates a word specific novelty feature in the last feature
 *                 index.
 *   attribute[5]: doc_size_levels, when set to a positive value less than 17,
 *                 generates a feature proportional to the logarithm of the
 *                 number of tokens in the second to last feature index.
 *   attribute[6]: add_eos_tag, add an end of sequence tag to the output when
 *                 true. Defaults to true.
 *   attribute[7]: add_bos_tag, add a begin of sequence tag to the output when
 *                 true. Defaults to false.
 *   attribute[8]: add_first_cap_feature, when set to 1.0f add a feature to the
 *                 resulting projection tensor that helps discriminate if the
 *                 input token is Camel case. Otherwise leaves the projection
 *                 output unmodified.
 *   attribute[9]: add_all_caps_feature, when set to 1.0f add a feature to the
 *                 resulting projection tensor that helps discriminate if the
 *                 input token is ALLCAPS. Otherwise leaves the projection
 *                 output unmodified.
 * Output:
 *   tensor[0]: computed projections.
 *              float32[true number of tokens][feature size]
 *              The true number of tokens is the number of tokens + 1 (for the
 *              end of sequence token).
 */

namespace {

// File-local constants: two token strings and two integer tensor slots.
// NOTE(review): the initializers are elided in this view. Presumably
// kBeginToken / kEndToken are the tags emitted for the add_bos_tag /
// add_eos_tag attributes, and kInputMessage / kOutputLabel are the indices of
// the op's input and output tensors -- confirm against the full source.
constexpr char kBeginToken[] = …;
constexpr char kEndToken[] = …;
constexpr int kInputMessage = …;
constexpr int kOutputLabel = …;

// Scoped enums for the begin-of-sequence (BosTag) and end-of-sequence (EosTag)
// settings.
// NOTE(review): the enumerators are elided in this view; presumably they
// mirror the add_bos_tag / add_eos_tag attributes -- confirm.
enum class BosTag { … };
enum class EosTag { … };

// Parameter object handed to TypedEval; also the base class of
// ProjectionParamsV2.
// NOTE(review): members are elided in this view; presumably this holds the
// parsed op attributes and per-invocation tokenization state -- confirm.
class ProjectionParams { … };

// Variant of ProjectionParams, publicly derived from it.
// NOTE(review): members are elided in this view; presumably this is the
// parameter object created by InitV2 for the V2 op, which takes pre-tokenized
// input -- confirm.
class ProjectionParamsV2 : public ProjectionParams { … };

// Helper that mutates `tensor` in place and returns nothing.
// NOTE(review): body elided in this view; judging by the name it marks the
// tensor as dynamically allocated so its shape can be decided at Eval time --
// confirm.
inline void SetTensorToDynamic(TfLiteTensor* tensor) { … }

// Determines whether tensor is dynamic. Note that a tensor can be non-const and
// not dynamic. This function specifically checks for a dynamic tensor.
//
// Takes a read-only pointer to the tensor and returns true when it is dynamic.
// NOTE(review): body elided in this view; presumably this inspects the
// tensor's allocation type -- confirm.
inline bool IsDynamicTensor(const TfLiteTensor* tensor) { … }

// Init hook: receives the TFLite context plus a (buffer, length) byte range
// and returns an opaque pointer.
// NOTE(review): body elided in this view. Presumably `buffer` holds the
// flexbuffer-encoded op attributes and the returned pointer is a
// heap-allocated ProjectionParams that Free() later releases -- confirm.
void* Init(TfLiteContext* context, const char* buffer, size_t length) { … }

// Init hook with the same signature as Init: receives the TFLite context plus
// a (buffer, length) byte range and returns an opaque pointer.
// NOTE(review): body elided in this view. Presumably this is the V2 op's
// counterpart of Init and returns a ProjectionParamsV2 -- confirm.
void* InitV2(TfLiteContext* context, const char* buffer, size_t length) { … }

// Free hook: receives the TFLite context and an opaque `buffer` pointer.
// NOTE(review): body elided in this view; presumably `buffer` is the pointer
// returned by Init/InitV2 and is destroyed here -- confirm.
void Free(TfLiteContext* context, void* buffer) { … }

// Per-node hook that returns a TfLiteStatus.
// NOTE(review): body elided in this view. Presumably this is registered as
// the op's prepare step and validates/sizes the output tensor, possibly
// marking it dynamic via SetTensorToDynamic -- confirm.
TfLiteStatus Resize(TfLiteContext* context, TfLiteNode* node) { … }

// Integer constants for the hash-to-feature mapping.
// NOTE(review): the initializers are elided in this view. Per the op
// description, every 2 bits of the fingerprint map to one ternary output, so
// presumably kHashCodeBits is the width of one hash word, kMapBits the bits
// consumed per feature, kIncrement the features per hash word, and
// kMapHigh / kMapLow the bit patterns selecting the +1 / -1 outputs -- confirm.
constexpr int kHashCodeBits = …;
constexpr int kMapBits = …;
constexpr int kIncrement = …;
constexpr int kMapHigh = …;
constexpr int kMapLow = …;

// Evaluation routine templated on the element type T.
//   mapping_table: read-only array of T.
//   params:        projection parameters for this invocation.
//   data:          output buffer of T written by this routine.
// NOTE(review): body elided in this view. Presumably T is the output element
// type (float or 8-bit, per the op description), `mapping_table` holds the T
// values for the ternary outputs, and `data` receives one row of features per
// token -- confirm.
template <typename T>
void TypedEval(const T* mapping_table, ProjectionParams* params, T* data) { … }

// Per-node hook that returns a TfLiteStatus.
// NOTE(review): body elided in this view. Presumably this is the op's invoke
// step: it reads the input string tensor, tokenizes it, and dispatches to
// TypedEval<T> according to the output tensor type -- confirm.
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { … }

}  // namespace
}  // namespace sequence_string_projection

// String constant naming the sequence string projection op.
// NOTE(review): the initializer is elided in this view; presumably this is the
// custom op name used when registering the op with a resolver -- confirm.
const char kSequenceStringProjection[] = …;

// This op converts a list of strings to a sequence of features using hashing.
//
// Returns a pointer to the op's TfLiteRegistration.
// NOTE(review): body elided in this view; presumably the registration wires up
// Init, Free, Resize and Eval from the sequence_string_projection namespace
// and has static storage duration -- confirm.
TfLiteRegistration* Register_SEQUENCE_STRING_PROJECTION() { … }

// String constant naming the V2 sequence string projection op.
// NOTE(review): the initializer is elided in this view; presumably this is the
// custom op name used when registering the V2 op with a resolver -- confirm.
const char kSequenceStringProjectionV2[] = …;

// This op converts a sequence of tokens to a sequence of projected features
// using hashing.
//
// Returns a pointer to the V2 op's TfLiteRegistration.
// NOTE(review): body elided in this view; presumably the registration uses
// InitV2 in place of Init and otherwise shares the hooks from the
// sequence_string_projection namespace -- confirm.
TfLiteRegistration* Register_SEQUENCE_STRING_PROJECTION_V2() { … }

}  // namespace custom
}  // namespace ops
}  // namespace seq_flow_lite
chromium/third_party/tensorflow_models/src/research/seq_flow_lite/tflite_ops/sequence_string_projection.cc