#include "sentencepiece_processor.h"
#include <map>
#include <set>
#include <utility>
#include "absl/memory/memory.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"
#include "common.h"
#include "filesystem.h"
#include "model_factory.h"
#include "model_interface.h"
#include "normalizer.h"
#include "sentencepiece.pb.h"
#include "unigram_model.h"
#include "util.h"
namespace sentencepiece {
// File-local constants and helpers (anonymous namespace, so they have
// internal linkage and are not visible to other translation units).
namespace {
// NOTE(review): initializer elided in this view; presumably the marker string
// that stands in for whitespace inside pieces — confirm.
const char kSpaceSymbol[] = …;
// NOTE(review): initializer elided in this view; presumably the default
// surface emitted for unknown pieces — confirm.
const char kDefaultUnknownSymbol[] = …;
// NOTE(review): initializer elided in this view; presumably the UTF-8
// encoding of the Unicode replacement character — confirm.
const char kReplacementCharacter[] = …;
// Takes a vector of std::string by const reference and returns a vector of
// absl::string_view by value.
// NOTE(review): body elided; presumably each view aliases the matching
// element of `v`, in which case `v` must outlive the result — confirm.
std::vector<absl::string_view> ToPieceArray(const std::vector<std::string>& v) { … }
// Receives `spt` as a mutable pointer and returns nothing.
// NOTE(review): body elided; the name suggests span offsets are rewritten
// from byte units to Unicode-character units — confirm.
void ConvertToUnicodeSpansInternal(SentencePieceText* spt) { … }
}
// Default constructor.
// NOTE(review): member-initializer list and body are elided in this view, so
// the initial state of the object cannot be stated from here.
ImmutableSentencePieceText::ImmutableSentencePieceText()
: … { … }
// Constructs from a SentencePieceText passed by const reference.
// NOTE(review): initializer list elided; it is not visible whether `spt` is
// copied or merely referenced (in which case it must outlive this object) —
// confirm.
ImmutableSentencePieceText::ImmutableSentencePieceText(
const SentencePieceText& spt)
: … { … }
// Destructor, defined out of line. Body elided in this view.
ImmutableSentencePieceText::~ImmutableSentencePieceText() { … }
// Default constructor of the per-piece wrapper type.
// NOTE(review): member-initializer list and body are elided in this view.
ImmutableSentencePieceText_ImmutableSentencePiece::
ImmutableSentencePieceText_ImmutableSentencePiece()
: … { … }
// Constructs the per-piece wrapper from a SentencePieceText_SentencePiece
// passed by const reference.
// NOTE(review): initializer list elided; whether `sp` is copied or only
// referenced (lifetime requirement on the caller) is not visible — confirm.
ImmutableSentencePieceText_ImmutableSentencePiece::
ImmutableSentencePieceText_ImmutableSentencePiece(
const SentencePieceText_SentencePiece& sp)
: … { … }
// Const accessor returning a const reference to a std::string.
// NOTE(review): body elided; presumably returns the piece string of the
// wrapped message. The referent's lifetime is not visible here — confirm.
const std::string& ImmutableSentencePieceText_ImmutableSentencePiece::piece()
const { … }
// Const accessor returning a const reference to a std::string.
// NOTE(review): body elided; presumably returns the surface string of the
// wrapped message. The referent's lifetime is not visible here — confirm.
const std::string& ImmutableSentencePieceText_ImmutableSentencePiece::surface()
const { … }
// Const accessor returning a uint32_t.
// NOTE(review): body elided; presumably the vocabulary id of this piece —
// confirm.
uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::id() const { … }
// Const accessor returning a uint32_t.
// NOTE(review): body elided; presumably the start offset of this piece in
// the original text (units not visible here) — confirm.
uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::begin() const { … }
// Const accessor returning a uint32_t.
// NOTE(review): body elided; presumably the end offset of this piece in the
// original text (units and inclusivity not visible here) — confirm.
uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::end() const { … }
// Zero-argument overload: returns a vector of per-piece wrappers by value.
// NOTE(review): body elided; presumably one wrapper per piece of the
// underlying message, in order — confirm.
std::vector<ImmutableSentencePieceText_ImmutableSentencePiece>
ImmutableSentencePieceText::pieces() const { … }
// Const accessor returning a size_t.
// NOTE(review): body elided; presumably the number of pieces — confirm.
size_t ImmutableSentencePieceText::pieces_size() const { … }
// Indexed overload: returns a single per-piece wrapper by value.
// NOTE(review): body elided; `index` is a signed int and any bounds handling
// is not visible here — confirm before passing unchecked values.
ImmutableSentencePieceText_ImmutableSentencePiece
ImmutableSentencePieceText::pieces(int index) const { … }
// Const accessor returning a const reference to a std::string.
// NOTE(review): body elided; presumably the text field of the wrapped
// message. The referent's lifetime is not visible here — confirm.
const std::string& ImmutableSentencePieceText::text() const { … }
// Const accessor returning a float.
// NOTE(review): body elided; presumably the score stored on the wrapped
// message — confirm.
float ImmutableSentencePieceText::score() const { … }
// Non-const member returning a mutable SentencePieceText pointer.
// NOTE(review): body elided; ownership of the returned pointer and whether
// the call allocates or replaces internal state are not visible — confirm.
SentencePieceText* ImmutableSentencePieceText::mutable_proto() { … }
// Non-const member taking no arguments and returning nothing.
// NOTE(review): body elided; the name suggests stored span offsets are
// converted to Unicode-character units in place — confirm.
void ImmutableSentencePieceText::ConvertToUnicodeSpans() { … }
// Const member returning a util::bytes value.
// NOTE(review): body elided; presumably the serialized form of the wrapped
// message — confirm.
util::bytes ImmutableSentencePieceText::SerializeAsString() const { … }
// Default constructor, defined out of line. Body elided in this view.
ImmutableNBestSentencePieceText::ImmutableNBestSentencePieceText() { … }
// Destructor, defined out of line. Body elided in this view.
ImmutableNBestSentencePieceText::~ImmutableNBestSentencePieceText() { … }
// Const accessor returning a size_t.
// NOTE(review): body elided; presumably the number of n-best entries —
// confirm.
size_t ImmutableNBestSentencePieceText::nbests_size() const { … }
// Indexed overload: returns one ImmutableSentencePieceText by value.
// NOTE(review): body elided; `index` is a signed int and any bounds handling
// is not visible here — confirm before passing unchecked values.
ImmutableSentencePieceText ImmutableNBestSentencePieceText::nbests(
int index) const { … }
// Zero-argument overload: returns a vector of ImmutableSentencePieceText by
// value.
// NOTE(review): body elided; presumably one element per n-best entry, in
// order — confirm.
std::vector<ImmutableSentencePieceText>
ImmutableNBestSentencePieceText::nbests() const { … }
// Non-const member returning a mutable NBestSentencePieceText pointer.
// NOTE(review): body elided; ownership of the returned pointer and whether
// the call allocates or replaces internal state are not visible — confirm.
NBestSentencePieceText* ImmutableNBestSentencePieceText::mutable_proto() { … }
// Non-const member taking no arguments and returning nothing.
// NOTE(review): body elided; the name suggests span offsets of every n-best
// entry are converted to Unicode-character units in place — confirm.
void ImmutableNBestSentencePieceText::ConvertToUnicodeSpans() { … }
// Const member returning a util::bytes value.
// NOTE(review): body elided; presumably the serialized form of the wrapped
// n-best message — confirm.
util::bytes ImmutableNBestSentencePieceText::SerializeAsString() const { … }
// Default constructor, defined out of line. Body elided in this view.
SentencePieceProcessor::SentencePieceProcessor() { … }
// Destructor, defined out of line. Body elided in this view.
SentencePieceProcessor::~SentencePieceProcessor() { … }
// Load overload taking a file name as absl::string_view; reports the outcome
// as util::Status.
// NOTE(review): body elided; presumably reads a model from `filename` and
// installs it on this processor — confirm.
util::Status SentencePieceProcessor::Load(absl::string_view filename) { … }
// Takes a file name as absl::string_view and returns nothing, so no status is
// reported to the caller.
// NOTE(review): body elided; the name suggests the process is terminated
// when loading fails — confirm.
void SentencePieceProcessor::LoadOrDie(absl::string_view filename) { … }
// Load overload taking a ModelProto by const reference; reports the outcome
// as util::Status.
// NOTE(review): body elided; presumably copies `model_proto` before
// installing it, since the argument cannot be moved from — confirm.
util::Status SentencePieceProcessor::Load(const ModelProto& model_proto) { … }
// Takes serialized bytes as absl::string_view; reports the outcome as
// util::Status.
// NOTE(review): body elided; presumably parses `serialized` as a ModelProto
// and installs it — confirm, including how a parse failure is reported.
util::Status SentencePieceProcessor::LoadFromSerializedProto(
absl::string_view serialized) { … }
// Load overload taking a std::unique_ptr<ModelProto> by value, so the caller
// hands over ownership of the proto; reports the outcome as util::Status.
// NOTE(review): body elided; handling of a null `model_proto` is not visible
// here — confirm.
util::Status SentencePieceProcessor::Load(
std::unique_ptr<ModelProto> model_proto) { … }
// Non-const member taking an option string as absl::string_view; reports the
// outcome as util::Status.
// NOTE(review): body elided; the accepted option syntax and values are not
// visible here — confirm against the header documentation.
util::Status SentencePieceProcessor::SetEncodeExtraOptions(
absl::string_view extra_options) { … }
// Non-const member taking an option string as absl::string_view; reports the
// outcome as util::Status.
// NOTE(review): body elided; the accepted option syntax and values are not
// visible here — confirm against the header documentation.
util::Status SentencePieceProcessor::SetDecodeExtraOptions(
absl::string_view extra_options) { … }
// Const member returning a util::Status by value.
// NOTE(review): body elided; presumably reports whether the processor is in
// a usable (loaded) state — confirm.
util::Status SentencePieceProcessor::status() const { … }
// Non-const member taking a vector of absl::string_view by const reference;
// reports the outcome as util::Status.
// NOTE(review): body elided; presumably restricts the usable pieces to those
// listed in `valid_vocab` — confirm.
util::Status SentencePieceProcessor::SetVocabulary(
const std::vector<absl::string_view>& valid_vocab) { … }
// Non-const member taking no arguments; reports the outcome as util::Status.
// NOTE(review): body elided; presumably undoes a previous vocabulary
// restriction — confirm.
util::Status SentencePieceProcessor::ResetVocabulary() { … }
// Non-const member taking a file name and an integer threshold; reports the
// outcome as util::Status.
// NOTE(review): body elided; the file format and the exact meaning of
// `threshold` (e.g. a minimum frequency) are not visible here — confirm.
util::Status SentencePieceProcessor::LoadVocabulary(absl::string_view filename,
int threshold) { … }
// Function-like macro with one parameter, `container`. Expansion elided in
// this view.
// NOTE(review): the name suggests it validates state and an STL output
// container and returns a status from the enclosing function on failure —
// confirm against the expansion before relying on it.
#define CHECK_OR_RETURN_STATUS_STL(container) …
// Function-like macro with one parameter, `proto`. Expansion elided in this
// view.
// NOTE(review): the name suggests it validates state and a protobuf output
// argument and returns a status from the enclosing function on failure —
// confirm against the expansion before relying on it.
#define CHECK_OR_RETURN_STATUS_PROTO(proto) …
// Encode overload with a string-vector out-parameter; const member that
// reports the outcome as util::Status.
// NOTE(review): body elided; `pieces` is a non-const pointer and is
// presumably the output (piece strings for `input`). Whether it is cleared
// first and how a null pointer is handled are not visible — confirm.
util::Status SentencePieceProcessor::Encode(
absl::string_view input,
std::vector<std::string>* pieces) const { … }
// Encode overload with an int-vector out-parameter; const member that reports
// the outcome as util::Status.
// NOTE(review): body elided; `ids` is a non-const pointer and is presumably
// the output (vocabulary ids for `input`). Null handling is not visible —
// confirm.
util::Status SentencePieceProcessor::Encode(absl::string_view input,
std::vector<int>* ids) const { … }
// Decode overload taking pieces as std::string values and a std::string
// out-parameter; const member that reports the outcome as util::Status.
// NOTE(review): body elided; `detokenized` is presumably the output text.
// Null handling is not visible — confirm.
util::Status SentencePieceProcessor::Decode(
const std::vector<std::string>& pieces,
std::string* detokenized) const { … }
// Decode overload taking pieces as absl::string_view values and a std::string
// out-parameter; const member that reports the outcome as util::Status.
// NOTE(review): body elided; `detokenized` is presumably the output text.
// Null handling is not visible — confirm.
util::Status SentencePieceProcessor::Decode(
const std::vector<absl::string_view>& pieces,
std::string* detokenized) const { … }
// Decode overload taking integer ids and a std::string out-parameter; const
// member that reports the outcome as util::Status.
// NOTE(review): body elided; `detokenized` is presumably the output text.
// Handling of out-of-range ids is not visible — confirm.
util::Status SentencePieceProcessor::Decode(const std::vector<int>& ids,
std::string* detokenized) const { … }
// NBestEncode overload whose out-parameter is a vector of string vectors;
// const member that reports the outcome as util::Status.
// NOTE(review): body elided; presumably writes up to `nbest_size` candidate
// segmentations of `input` into `pieces`. The treatment of non-positive
// `nbest_size` is not visible — confirm.
util::Status SentencePieceProcessor::NBestEncode(
absl::string_view input,
int nbest_size,
std::vector<std::vector<std::string>>* pieces) const { … }
// NBestEncode overload whose out-parameter is a vector of int vectors; const
// member that reports the outcome as util::Status.
// NOTE(review): body elided; presumably writes up to `nbest_size` candidate
// id sequences for `input` into `ids`. The treatment of non-positive
// `nbest_size` is not visible — confirm.
util::Status SentencePieceProcessor::NBestEncode(
absl::string_view input,
int nbest_size,
std::vector<std::vector<int>>* ids) const { … }
// SampleEncode overload with a string-vector out-parameter; const member that
// reports the outcome as util::Status.
// NOTE(review): body elided; presumably writes one sampled segmentation of
// `input` into `pieces`. The roles of `nbest_size` and `alpha` (valid ranges,
// special values) are not visible here — confirm.
util::Status SentencePieceProcessor::SampleEncode(
absl::string_view input,
int nbest_size,
float alpha,
std::vector<std::string>* pieces) const { … }
// SampleEncode overload with an int-vector out-parameter; const member that
// reports the outcome as util::Status.
// NOTE(review): body elided; presumably writes one sampled id sequence for
// `input` into `ids`. The roles of `nbest_size` and `alpha` (valid ranges,
// special values) are not visible here — confirm.
util::Status SentencePieceProcessor::SampleEncode(absl::string_view input,
int nbest_size,
float alpha,
std::vector<int>* ids) const { … }
// SampleEncodeAndScore overload whose out-parameter is a vector of
// (string vector, float) pairs; const member that reports the outcome as
// util::Status.
// NOTE(review): body elided; presumably each pair is a sampled segmentation
// and its score. `wor` presumably means "without replacement" and
// `include_best` presumably forces the best segmentation into the result —
// confirm all three against the implementation.
util::Status SentencePieceProcessor::SampleEncodeAndScore(
absl::string_view input,
int num_samples,
float alpha,
bool wor,
bool include_best,
std::vector<std::pair<std::vector<std::string>, float>>* pieces) const { … }
// SampleEncodeAndScore overload whose out-parameter is a vector of
// (int vector, float) pairs; const member that reports the outcome as
// util::Status.
// NOTE(review): body elided; presumably each pair is a sampled id sequence
// and its score. `wor` presumably means "without replacement" and
// `include_best` presumably forces the best segmentation into the result —
// confirm all three against the implementation.
util::Status SentencePieceProcessor::SampleEncodeAndScore(
absl::string_view input,
int num_samples,
float alpha,
bool wor,
bool include_best,
std::vector<std::pair<std::vector<int>, float>>* ids) const { … }
// Const member taking the original input, a normalized string, an index
// vector, an EncodeResult, and a mutable SentencePieceText pointer; reports
// the outcome as util::Status.
// NOTE(review): body elided; presumably fills `spt` from `result`, using
// `norm_to_orig` to map positions in `normalized` back to positions in
// `input`. The exact indexing convention of `norm_to_orig` is not visible
// here — confirm.
util::Status SentencePieceProcessor::PopulateSentencePieceText(
absl::string_view input,
absl::string_view normalized,
const std::vector<size_t>& norm_to_orig,
const EncodeResult& result,
SentencePieceText* spt) const { … }
// Encode overload with a SentencePieceText out-parameter; const member that
// reports the outcome as util::Status.
// NOTE(review): body elided; `spt` is presumably populated with the encoding
// of `input`. Null handling is not visible — confirm.
util::Status SentencePieceProcessor::Encode(absl::string_view input,
SentencePieceText* spt) const { … }
// NBestEncode overload with an NBestSentencePieceText out-parameter; const
// member that reports the outcome as util::Status.
// NOTE(review): body elided; `nbest_spt` is presumably populated with up to
// `nbest_size` candidate encodings of `input` — confirm.
util::Status SentencePieceProcessor::NBestEncode(
absl::string_view input,
int nbest_size,
NBestSentencePieceText* nbest_spt) const { … }
// SampleEncode overload with a SentencePieceText out-parameter; const member
// that reports the outcome as util::Status.
// NOTE(review): body elided; `spt` is presumably populated with one sampled
// encoding of `input`. The roles of `nbest_size` and `alpha` are not visible
// here — confirm.
util::Status SentencePieceProcessor::SampleEncode(
absl::string_view input,
int nbest_size,
float alpha,
SentencePieceText* spt) const { … }
// SampleEncodeAndScore overload with an NBestSentencePieceText out-parameter;
// const member that reports the outcome as util::Status.
// Note that the sample count is named `samples` here, whereas the
// vector-returning overloads name it `num_samples`.
// NOTE(review): body elided; `samples_spt` is presumably populated with the
// sampled encodings and their scores. `wor` presumably means "without
// replacement" — confirm.
util::Status SentencePieceProcessor::SampleEncodeAndScore(
absl::string_view input,
int samples,
float alpha,
bool wor,
bool include_best,
NBestSentencePieceText* samples_spt) const { … }
// Const member taking an input string, a float `alpha`, and a float
// out-parameter; reports the outcome as util::Status.
// NOTE(review): body elided; `entropy` is presumably the output value. Which
// distribution the entropy is computed over and how `alpha` enters are not
// visible here — confirm.
util::Status SentencePieceProcessor::CalculateEntropy(absl::string_view input,
float alpha,
float* entropy) const { … }
// Decode overload taking pieces as std::string values and a SentencePieceText
// out-parameter; const member that reports the outcome as util::Status.
// NOTE(review): body elided; `spt` is presumably populated with the decoded
// text and per-piece information — confirm.
util::Status SentencePieceProcessor::Decode(
const std::vector<std::string>& pieces,
SentencePieceText* spt) const { … }
// Decode overload taking pieces as absl::string_view values and a
// SentencePieceText out-parameter; const member that reports the outcome as
// util::Status.
// NOTE(review): body elided; `spt` is presumably populated with the decoded
// text and per-piece information — confirm.
util::Status SentencePieceProcessor::Decode(
const std::vector<absl::string_view>& pieces,
SentencePieceText* spt) const { … }
// Decode overload taking integer ids and a SentencePieceText out-parameter;
// const member that reports the outcome as util::Status.
// NOTE(review): body elided; `spt` is presumably populated with the decoded
// text and per-piece information. Handling of out-of-range ids is not
// visible — confirm.
util::Status SentencePieceProcessor::Decode(const std::vector<int>& ids,
SentencePieceText* spt) const { … }
// Function-like macro with one parameter, `value`. Expansion elided in this
// view.
// NOTE(review): the name suggests it returns `value` from the enclosing
// function when the processor status is not OK — confirm against the
// expansion before relying on it.
#define CHECK_STATUS_OR_RETURN_DEFAULT(value) …
// Const member returning an int.
// NOTE(review): body elided; presumably the vocabulary size, with some
// default returned when no model is loaded — confirm.
int SentencePieceProcessor::GetPieceSize() const { … }
// Const member mapping an absl::string_view to an int.
// NOTE(review): body elided; presumably the vocabulary id of `piece`. The
// value returned for a piece that is not in the vocabulary is not visible
// here — confirm.
int SentencePieceProcessor::PieceToId(absl::string_view piece) const { … }
// Const member mapping an int to a const std::string reference.
// NOTE(review): body elided; because a reference is returned, the referent
// must outlive the call. What is returned for an invalid `id` or an unloaded
// model is not visible here — confirm.
const std::string &SentencePieceProcessor::IdToPiece(int id) const { … }
// Const member mapping an int id to a float.
// NOTE(review): body elided; presumably the model score of the piece with
// this id. Handling of an invalid `id` is not visible — confirm.
float SentencePieceProcessor::GetScore(int id) const { … }
// Const predicate on an int id.
// NOTE(review): body elided; presumably true when the piece is a control
// symbol. The result for an invalid `id` is not visible — confirm.
bool SentencePieceProcessor::IsControl(int id) const { … }
// Const predicate on an int id.
// NOTE(review): body elided; presumably true when the piece is the unknown
// symbol. The result for an invalid `id` is not visible — confirm.
bool SentencePieceProcessor::IsUnknown(int id) const { … }
// Const predicate on an int id.
// NOTE(review): body elided; presumably true when the piece is marked
// unused. The result for an invalid `id` is not visible — confirm.
bool SentencePieceProcessor::IsUnused(int id) const { … }
// Const predicate on an int id.
// NOTE(review): body elided; presumably true when the piece is a
// byte-fallback piece. The result for an invalid `id` is not visible —
// confirm.
bool SentencePieceProcessor::IsByte(int id) const { … }
// Const member returning an int.
// NOTE(review): body elided; presumably the id of the unknown symbol. The
// value used when it is undefined is not visible — confirm.
int SentencePieceProcessor::unk_id() const { … }
// Const member returning an int.
// NOTE(review): body elided; presumably the id of the beginning-of-sentence
// symbol. The value used when it is undefined is not visible — confirm.
int SentencePieceProcessor::bos_id() const { … }
// Const member returning an int.
// NOTE(review): body elided; presumably the id of the end-of-sentence
// symbol. The value used when it is undefined is not visible — confirm.
int SentencePieceProcessor::eos_id() const { … }
// Const member returning an int.
// NOTE(review): body elided; presumably the id of the padding symbol. The
// value used when it is undefined is not visible — confirm.
int SentencePieceProcessor::pad_id() const { … }
// Const member taking a list of ExtraOption values and a mutable
// SentencePieceText pointer; reports the outcome as util::Status.
// NOTE(review): body elided; presumably applies each option to `spt` in
// list order — confirm, including whether order matters.
util::Status SentencePieceProcessor::ApplyExtraOptions(
const std::vector<ExtraOption>& extra_options,
SentencePieceText* spt) const { … }
// Const member taking an option string and an ExtraOption-vector
// out-parameter; reports the outcome as util::Status.
// NOTE(review): body elided; presumably parses `_extra_option` into
// `extra_options`. The separator, the accepted option names, and whether the
// output is cleared first are not visible here — confirm.
util::Status SentencePieceProcessor::ParseExtraOptions(
absl::string_view _extra_option,
std::vector<SentencePieceProcessor::ExtraOption>* extra_options) const { … }
// Takes a std::unique_ptr<ModelInterface> by rvalue reference, so callers
// must pass a temporary or std::move an existing pointer.
// NOTE(review): body elided; presumably takes ownership and replaces the
// processor's current model — confirm.
void SentencePieceProcessor::SetModel(std::unique_ptr<ModelInterface> &&model) { … }
// Takes a std::unique_ptr<normalizer::Normalizer> by rvalue reference, so
// callers must pass a temporary or std::move an existing pointer.
// NOTE(review): body elided; presumably takes ownership and replaces the
// processor's current normalizer — confirm.
void SentencePieceProcessor::SetNormalizer(
std::unique_ptr<normalizer::Normalizer> &&normalizer) { … }
// Const accessor returning a const ModelProto reference.
// NOTE(review): body elided; because a reference is returned, the referent
// must outlive the call. What is returned before any model is loaded is not
// visible here — confirm.
const ModelProto &SentencePieceProcessor::model_proto() const { … }
// Const member returning a std::string by value.
// NOTE(review): body elided; presumably the serialized bytes of the loaded
// ModelProto. The result when no model is loaded is not visible — confirm.
std::string SentencePieceProcessor::serialized_model_proto() const { … }
// Declaration only: no definition of this function appears in this view of
// the file.
// NOTE(review): presumably defined in another translation unit; the name
// suggests it seeds the random generator used by the sampling functions —
// confirm where it is defined and whether this redeclaration is needed.
void SetRandomGeneratorSeed(unsigned int seed);
// Free functions for reading and writing a ModelProto by file name.
namespace io {
// Takes a file name and a mutable ModelProto pointer; reports the outcome as
// util::Status.
// NOTE(review): body elided; presumably reads `filename` and parses it into
// `model_proto`. Handling of an empty name or a null pointer is not visible
// here — confirm.
util::Status LoadModelProto(absl::string_view filename,
ModelProto* model_proto) { … }
// Takes a file name and a ModelProto by const reference; reports the outcome
// as util::Status.
// NOTE(review): body elided; presumably serializes `model_proto` and writes
// it to `filename`. Whether an existing file is overwritten is not visible
// here — confirm.
util::Status SaveModelProto(absl::string_view filename,
const ModelProto& model_proto) { … }
}
}