#include "mediapipe/tasks/cc/text/tokenizers/regex_tokenizer.h"
#include <iostream>
#include "absl/strings/substitute.h"
#include "mediapipe/tasks/cc/text/utils/vocab_utils.h"
namespace mediapipe {
namespace tasks {
namespace text {
namespace tokenizers {
namespace {
using ::mediapipe::tasks::text::LoadVocabAndIndexFromBuffer;
using ::mediapipe::tasks::text::LoadVocabAndIndexFromFile;
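
// Names of the special vocabulary tokens used by the accessors below.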
constexpr char kStart[] = …;
constexpr char kPad[] = …;
constexpr char kUnknown[] = …;
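
// Builds the reverse index-to-token map from the token-to-index map so that
// vocabulary ids can be mapped back to tokens in constant time.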
void buildIndexTokenMap(
const absl::node_hash_map<std::string, int>& token_index_map,
absl::node_hash_map<int, absl::string_view>* index_token_map) { … }
}  // namespace
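
// Constructs a tokenizer that splits input using `regex_pattern` and loads the
// token-to-index vocabulary from the file at `path_to_vocab`.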
RegexTokenizer::RegexTokenizer(const std::string& regex_pattern,
const std::string& path_to_vocab)
: … { … }
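
// Same as above, but loads the vocabulary from an in-memory buffer instead of
// a file.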
RegexTokenizer::RegexTokenizer(const std::string& regex_pattern,
const char* vocab_buffer_data,
size_t vocab_buffer_size)
: … { … }
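
// Splits `input` into tokens. A minimal usage sketch, assuming the regex is a
// delimiter pattern (text between matches becomes a token) and that
// TokenizerResult exposes the resulting pieces in `subwords`; the pattern and
// vocabulary path below are illustrative only:
//
//   RegexTokenizer tokenizer("[^\\w\\']+", "/path/to/vocab.txt");
//   TokenizerResult result = tokenizer.Tokenize("An example sentence");
//   for (const std::string& token : result.subwords) {
//     // Use `token`.
//   }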
TokenizerResult RegexTokenizer::Tokenize(const std::string& input) { … }
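
// Looks up the vocabulary id for `key`; returns false if the token is not in
// the vocabulary.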
bool RegexTokenizer::LookupId(absl::string_view key, int* result) const { … }
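
// Reverse lookup: maps `vocab_id` back to its token; returns false for ids
// that are not in the vocabulary.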
bool RegexTokenizer::LookupWord(int vocab_id, absl::string_view* result) const { … }
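
// Convenience accessors for the special token ids; they follow the same
// bool-return convention as LookupId().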
bool RegexTokenizer::GetStartToken(int* start_token) { … }
bool RegexTokenizer::GetPadToken(int* pad_token) { … }
bool RegexTokenizer::GetUnknownToken(int* unknown_token) { … }
}  // namespace tokenizers
}  // namespace text
}  // namespace tasks
}  // namespace mediapipe