projection_util.cc | Explore in Territory

/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_util.h"  // seq_flow_lite

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <sstream>
#include <unordered_set>

#include "icu4c/source/common/unicode/uchar.h"
#include "icu4c/source/common/unicode/utf8.h"

// File-local helpers. Everything declared in this unnamed namespace has
// internal linkage and is not visible to other translation units.
namespace {

// NOTE(review): the initializers of both constants are elided in this view;
// confirm their values and how they are used against the full file.
constexpr int kInvalid = …;
constexpr char kSpace = …;

// A HashEngine that uses MurmurHash to convert text to hashcodes.
// NOTE(review): class body is elided in this view.
class MurmurHash : public HashEngine { … };

// A HashEngine that uses a prefix and suffix preserving hash to convert text
// to hashcodes.
// NOTE(review): class body is elided in this view.
class XFixHash : public HashEngine { … };

// A HashEngine that performs a position preserving unicode level hashing to
// convert text to hashcodes.
// NOTE(review): class body is elided in this view.
class UnicodeHash : public HashEngine { … };

}  // namespace

// Takes a hash type name and returns a bool.
// NOTE(review): presumably reports whether `hash_type` names one of the
// HashEngine implementations in this file; the accepted names are not visible
// in this view -- confirm against the body.
bool Hasher::SupportedHashType(const std::string& hash_type) { … }

// Factory that takes a feature size and a hash type name and returns a raw
// Hasher pointer.
// NOTE(review): the body is elided here. Confirm (a) whether the caller owns
// the returned pointer and must delete it, and (b) what is returned for an
// unsupported `hash_type` (possibly nullptr).
Hasher* Hasher::CreateHasher(int feature_size, const std::string& hash_type) { … }

// Constructs a Hasher from a feature size and a HashEngine pointer.
// NOTE(review): the member initializer list and body are elided in this view,
// so whether the Hasher takes ownership of `hash_engine` cannot be determined
// from here -- confirm against the class declaration.
Hasher::Hasher(int feature_size, HashEngine* hash_engine)
    : … { … }

// Takes `source` as a (const char*, size_t) pair and returns a new
// std::string. Declared const, so it does not modify the handler's
// non-mutable members.
// NOTE(review): presumably `source` is a (data pointer, byte length) view of
// UTF-8 text, and `first_cap` / `all_caps` are output flags describing the
// capitalization of the input. Whether either pointer may be null, and how
// unsupported code points are treated, is not visible in this view -- confirm
// against the body.
std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
    const std::pair<const char*, size_t>& source, bool* first_cap,
    bool* all_caps) const { … }

// Non-const member taking a vocabulary string; returns nothing.
// NOTE(review): presumably populates the handler's set of supported unicode
// characters from `vocabulary`; the expected format of the string and the
// handling of invalid UTF-8 are not visible in this view -- confirm against
// the body.
void ProjectionUnicodeHandler::InitializeVocabulary(
    const std::string& vocabulary) { … }

// Starting from input_ptr[from], search for the next occurrence of ' '.
// Don't search beyond input_ptr[length] (non-inclusive); return -1 if not
// found.
// Note: the return type is size_t (unsigned), so a "-1" result reaches the
// caller as static_cast<size_t>(-1), i.e. the largest size_t value. Callers
// must compare against that value rather than test for a negative result.
// NOTE(review): body is elided in this view; the above relies on the existing
// comment's description of the not-found case.
inline size_t FindNextSpace(const char* input_ptr, size_t from, size_t length) { … }

// Templated helper that writes its results into the caller-provided `tokens`
// vector; T is the element type of that vector.
// NOTE(review): presumably shared by SplitBySpaceAsPairs
// (T = std::pair<const char*, size_t>) and SplitBySpace (T = std::string),
// splitting input_ptr[0, len) on spaces. The exact meaning of `max_input` and
// `max_tokens` (including whether 0 means "unlimited") is not visible in this
// view -- confirm against the body.
template <typename T>
void SplitBySpaceInternal(std::vector<T>* tokens, const char* input_ptr,
                          size_t len, size_t max_input, size_t max_tokens) { … }

// Returns a vector of (const char*, size_t) pairs for the given input buffer.
// Unlike SplitBySpace, this overload has no `max_input` parameter.
// NOTE(review): presumably each pair is a (start pointer, length) view into
// `input_ptr` rather than a copy, in which case the result is only valid while
// the input buffer is alive -- confirm against the body.
std::vector<std::pair<const char*, size_t>> SplitBySpaceAsPairs(
    const char* input_ptr, size_t len, size_t max_tokens) { … }

// Returns a vector of std::string (owning copies) for the given input buffer.
// NOTE(review): presumably splits input_ptr[0, len) on spaces; how
// `max_input` and `max_tokens` bound the work is not visible in this view --
// confirm against the body.
std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
                                      size_t max_input, size_t max_tokens) { … }

// Templated helper that writes its results into the caller-provided `tokens`
// vector; T is the element type of that vector.
// NOTE(review): presumably shared by SplitByCharAsPairs
// (T = std::pair<const char*, size_t>) and SplitByChar (T = std::string).
// Whether a "char" here is a byte or a full UTF-8 code point, and the meaning
// of `max_tokens`, are not visible in this view -- confirm against the body.
template <typename T>
void SplitByCharInternal(std::vector<T>* tokens, const char* input_ptr,
                         size_t len, size_t max_tokens) { … }

// Returns a vector of (const char*, size_t) pairs for the given input buffer.
// NOTE(review): presumably each pair is a (start pointer, length) view of one
// character inside `input_ptr`, so the result would only be valid while the
// input buffer is alive -- confirm against the body.
std::vector<std::pair<const char*, size_t>> SplitByCharAsPairs(
    const char* input_ptr, size_t len, size_t max_tokens) { … }

// Returns a vector of std::string (owning copies) for the given input buffer.
// NOTE(review): presumably one element per character of input_ptr[0, len),
// capped by `max_tokens`; body is elided in this view -- confirm.
std::vector<std::string> SplitByChar(const char* input_ptr, size_t len,
                                     size_t max_tokens) { … }

// Takes a vector of (const char*, size_t) pairs and returns a std::string.
// NOTE(review): presumably concatenates the pointed-to spans separated by a
// single space; body is elided in this view -- confirm.
// NOTE(review): `words` is taken by value, so passing an lvalue copies the
// whole vector. Changing this to a const reference would also require
// updating the matching declaration (presumably in projection_util.h).
std::string JoinPairsBySpace(
    std::vector<std::pair<const char*, size_t>> words) { … }

// Returns a vector of (const char*, size_t) pairs for the buffer `str` of
// length `len`.
// NOTE(review): presumably dispatches to SplitBySpaceAsPairs when `by_space`
// is true and to SplitByCharAsPairs otherwise; body is elided in this view --
// confirm.
// NOTE(review): `max_tokens` is an int here but a size_t in the Split*
// functions above; how a negative value is handled (e.g. converted to a very
// large size_t) is not visible from this view.
std::vector<std::pair<const char*, size_t>> ProjectionUnicodeHandler::Tokenize(
    const char* str, size_t len, bool by_space, int max_tokens) { … }
chromium/third_party/tensorflow_models/src/research/seq_flow_lite/tf_ops/projection_util.cc