#include "tf_ops/projection_util.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <sstream>
#include <unordered_set>
#include "icu4c/source/common/unicode/uchar.h"
#include "icu4c/source/common/unicode/utf8.h"
// File-local constants and HashEngine implementations (internal linkage via
// the anonymous namespace).
namespace {
// Integer sentinel; its value and exact use are not visible in this view.
// NOTE(review): presumably marks an invalid/unsupported code point — confirm.
constexpr int kInvalid = …;
// Character constant; presumably the ASCII space used as the token
// delimiter — confirm against the initializer.
constexpr char kSpace = …;
// HashEngine implementation. NOTE(review): the name suggests a MurmurHash
// variant; verify which one against the class body.
class MurmurHash : public HashEngine { … };
// HashEngine implementation. NOTE(review): the name suggests prefix/suffix
// ("xfix") based hashing — confirm against the class body.
class XFixHash : public HashEngine { … };
// HashEngine implementation. NOTE(review): the name suggests hashing over
// Unicode code points — confirm against the class body.
class UnicodeHash : public HashEngine { … };
}
// Takes a hash type name and returns a bool.
// NOTE(review): presumably true iff `hash_type` names one of the HashEngine
// implementations in this file — confirm the accepted strings in the body.
bool Hasher::SupportedHashType(const std::string& hash_type) { … }
// Factory taking a feature size and a hash type name; returns a raw
// `Hasher*`.
// NOTE(review): ownership of the returned pointer is not expressed in the
// type — presumably the caller owns it; confirm. Also confirm what is
// returned for an unsupported `hash_type` (possibly nullptr).
Hasher* Hasher::CreateHasher(int feature_size, const std::string& hash_type) { … }
// Constructs a Hasher from a feature size and a HashEngine pointer.
// NOTE(review): `hash_engine` is a raw pointer — whether the Hasher takes
// ownership of it is not visible here; confirm against the member
// initializer list and the class declaration.
Hasher::Hasher(int feature_size, HashEngine* hash_engine)
: … { … }
// Const member that takes a (pointer, length) view of the source bytes and
// returns a newly built std::string.
//   source    - pair of (start of bytes, byte count); not required by the
//               signature to be NUL-terminated.
//   first_cap - non-const bool pointer; presumably an output flag for "first
//               character was upper case" — confirm, including whether
//               nullptr is accepted.
//   all_caps  - non-const bool pointer; presumably an output flag for "all
//               characters were upper case" — confirm as above.
// NOTE(review): the name suggests the result is the lower-cased UTF-8 text
// restricted to the supported code points; verify how unsupported or
// malformed sequences are handled in the body.
std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
const std::pair<const char*, size_t>& source, bool* first_cap,
bool* all_caps) const { … }
// Non-const member taking a vocabulary string; returns nothing.
// NOTE(review): presumably populates the handler's set/map of supported
// code points from the UTF-8 characters in `vocabulary` — confirm the
// expected format and the behavior for an empty string.
void ProjectionUnicodeHandler::InitializeVocabulary(
const std::string& vocabulary) { … }
// Takes a byte buffer, a starting offset and the buffer length; returns a
// size_t.
// NOTE(review): presumably the index of the first space at or after `from`,
// or `length` when none is found — confirm the not-found return value.
inline size_t FindNextSpace(const char* input_ptr, size_t from, size_t length) { … }
// Shared implementation template for the space-splitting entry points;
// results are written through `tokens` (output parameter) as elements of
// type T.
//   tokens     - destination vector; must be non-null (presumably — confirm).
//   input_ptr  - start of the input bytes.
//   len        - number of input bytes.
//   max_input  - NOTE(review): presumably a cap on bytes consumed — confirm.
//   max_tokens - NOTE(review): presumably a cap on tokens emitted — confirm,
//                including whether 0 means "no limit".
template <typename T>
void SplitBySpaceInternal(std::vector<T>* tokens, const char* input_ptr,
size_t len, size_t max_input, size_t max_tokens) { … }
// Returns tokens as (pointer, length) pairs.
// NOTE(review): the pairs presumably point into the caller's `input_ptr`
// buffer, so they would be valid only while that buffer is alive — confirm.
// Unlike SplitBySpace, this overload has no `max_input` parameter.
std::vector<std::pair<const char*, size_t>> SplitBySpaceAsPairs(
const char* input_ptr, size_t len, size_t max_tokens) { … }
// Returns tokens as owning std::string copies.
// NOTE(review): presumably splits the first `len` bytes of `input_ptr` on
// spaces, bounded by `max_input` and `max_tokens` — confirm the limit
// semantics in the body.
std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
size_t max_input, size_t max_tokens) { … }
// Shared implementation template for the per-character splitting entry
// points; results are written through `tokens` (output parameter) as
// elements of type T.
// NOTE(review): "char" presumably means one UTF-8 encoded code point rather
// than one byte (the file includes ICU's utf8.h) — confirm, along with how
// malformed sequences and the `max_tokens` limit are handled.
template <typename T>
void SplitByCharInternal(std::vector<T>* tokens, const char* input_ptr,
size_t len, size_t max_tokens) { … }
// Returns per-character tokens as (pointer, length) pairs.
// NOTE(review): the pairs presumably point into the caller's `input_ptr`
// buffer, so they would be valid only while that buffer is alive — confirm.
std::vector<std::pair<const char*, size_t>> SplitByCharAsPairs(
const char* input_ptr, size_t len, size_t max_tokens) { … }
// Returns per-character tokens as owning std::string copies.
// NOTE(review): presumably one token per UTF-8 character of the first `len`
// bytes, capped by `max_tokens` — confirm in the body.
std::vector<std::string> SplitByChar(const char* input_ptr, size_t len,
size_t max_tokens) { … }
// Takes a vector of (pointer, length) pairs and returns a std::string.
// NOTE(review): presumably concatenates the referenced byte ranges with a
// single space between consecutive words — confirm, including the result
// for an empty vector.
// NOTE(review): `words` is passed by value, which copies the vector on each
// call; a const reference would avoid that but requires changing the
// declaration in the header as well.
std::string JoinPairsBySpace(
std::vector<std::pair<const char*, size_t>> words) { … }
// Returns tokens as (pointer, length) pairs for the first `len` bytes of
// `str`.
//   by_space   - NOTE(review): presumably selects space splitting when true
//                and per-character splitting when false — confirm.
//   max_tokens - signed int here, while the Split* helpers take size_t;
//                NOTE(review): check how negative values are converted or
//                interpreted before being forwarded.
// NOTE(review): returned pairs presumably alias `str`, so they would be
// valid only while that buffer is alive — confirm.
std::vector<std::pair<const char*, size_t>> ProjectionUnicodeHandler::Tokenize(
const char* str, size_t len, bool by_space, int max_tokens) { … }