chromium/third_party/sentencepiece/src/src/sentencepiece_processor.cc

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "sentencepiece_processor.h"

#include <map>
#include <set>
#include <utility>

#include "absl/memory/memory.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"
#include "common.h"
#include "filesystem.h"
#include "model_factory.h"
#include "model_interface.h"
#include "normalizer.h"
#include "sentencepiece.pb.h"
#include "unigram_model.h"
#include "util.h"

namespace sentencepiece {
namespace {

// Replaces white space with U+2581 (LOWER ONE EIGHT BLOCK).
// U+2581 is encoded in UTF-8 as the three bytes 0xE2 0x96 0x81.
const char kSpaceSymbol[] =;

// Encodes <unk> as U+2047 (DOUBLE QUESTION MARK), since this character is
// useful to both users and developers: it makes it easy to see that <unk>
// was emitted.
// U+2047 is encoded in UTF-8 as the three bytes 0xE2 0x81 0x87.
const char kDefaultUnknownSymbol[] =;

// REPLACEMENT CHARACTER (U+FFFD) in UTF-8, i.e. the three bytes
// 0xEF 0xBF 0xBD.
const char kReplacementCharacter[] =;

// Maps a vector of std::string to a vector of absl::string_view.
// NOTE(review): presumably each returned view aliases the matching element
// of `v`, in which case `v` must outlive the result -- confirm.
std::vector<absl::string_view> ToPieceArray(const std::vector<std::string>& v) {}

// File-local helper that receives a mutable SentencePieceText.
// NOTE(review): judging by the name, it rewrites the span offsets stored in
// `*spt` into Unicode-character units -- confirm.
void ConvertToUnicodeSpansInternal(SentencePieceText* spt) {}

}  // namespace

// Default constructor.
// NOTE(review): confirm what state the member-initializer list establishes
// for a wrapper that has no backing proto yet.
ImmutableSentencePieceText::ImmutableSentencePieceText()
    :{}

// Constructs the wrapper from an existing SentencePieceText, taken by const
// reference.
// NOTE(review): whether `spt` is copied or merely referenced is decided by
// the member-initializer list -- confirm the lifetime requirement on `spt`.
ImmutableSentencePieceText::ImmutableSentencePieceText(
    const SentencePieceText& spt)
    :{}

// Destructor; no explicit cleanup is performed in the body.
ImmutableSentencePieceText::~ImmutableSentencePieceText() {}

// Default constructor for the per-piece wrapper.
// NOTE(review): confirm what state the member-initializer list establishes
// when no SentencePieceText_SentencePiece is supplied.
ImmutableSentencePieceText_ImmutableSentencePiece::
    ImmutableSentencePieceText_ImmutableSentencePiece()
    :{}

// Constructs the per-piece wrapper from a SentencePieceText_SentencePiece,
// taken by const reference.
// NOTE(review): confirm whether `sp` is copied or only referenced, i.e.
// whether it must outlive this object.
ImmutableSentencePieceText_ImmutableSentencePiece::
    ImmutableSentencePieceText_ImmutableSentencePiece(
        const SentencePieceText_SentencePiece& sp)
    :{}

// Const accessor returning a reference to a std::string.
// NOTE(review): presumably the `piece` field of the wrapped proto -- confirm.
const std::string& ImmutableSentencePieceText_ImmutableSentencePiece::piece()
    const {}

// Const accessor returning a reference to a std::string.
// NOTE(review): presumably the `surface` field of the wrapped proto --
// confirm.
const std::string& ImmutableSentencePieceText_ImmutableSentencePiece::surface()
    const {}

// Const accessor returning a uint32_t.
// NOTE(review): presumably the vocabulary id of this piece -- confirm.
uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::id() const {}

// Const accessor returning a uint32_t.
// NOTE(review): presumably the start offset of this piece's span in the
// original text; the unit (bytes vs. characters) needs confirming.
uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::begin() const {}

// Const accessor returning a uint32_t.
// NOTE(review): presumably the end offset of this piece's span in the
// original text; whether it is inclusive or exclusive needs confirming.
uint32_t ImmutableSentencePieceText_ImmutableSentencePiece::end() const {}

// Returns, by value, a vector of per-piece wrappers.
// NOTE(review): presumably one wrapper per piece in the underlying proto,
// in order -- confirm.
std::vector<ImmutableSentencePieceText_ImmutableSentencePiece>
ImmutableSentencePieceText::pieces() const {}

// Const accessor returning a size_t.
// NOTE(review): presumably the number of pieces in the underlying proto --
// confirm.
size_t ImmutableSentencePieceText::pieces_size() const {}

// Returns, by value, the per-piece wrapper selected by `index`.
// NOTE(review): `index` is a signed int; confirm how negative or
// out-of-range values are handled.
ImmutableSentencePieceText_ImmutableSentencePiece
ImmutableSentencePieceText::pieces(int index) const {}

// Const accessor returning a reference to a std::string.
// NOTE(review): presumably the `text` field of the underlying proto --
// confirm.
const std::string& ImmutableSentencePieceText::text() const {}

// Const accessor returning a float.
// NOTE(review): presumably the `score` field of the underlying proto --
// confirm.
float ImmutableSentencePieceText::score() const {}

// Non-const member returning a raw pointer to a SentencePieceText.
// NOTE(review): presumably exposes the wrapped proto for in-place mutation;
// confirm ownership and whether it can return nullptr.
SentencePieceText* ImmutableSentencePieceText::mutable_proto() {}

// Non-const member taking no arguments and returning nothing.
// NOTE(review): judging by the name, converts the stored span offsets to
// Unicode-character units -- confirm.
void ImmutableSentencePieceText::ConvertToUnicodeSpans() {}

// Const member returning util::bytes by value.
// NOTE(review): presumably the serialized form of the underlying proto --
// confirm.
util::bytes ImmutableSentencePieceText::SerializeAsString() const {}

// Default constructor; the body performs no explicit initialization.
ImmutableNBestSentencePieceText::ImmutableNBestSentencePieceText() {}
// Destructor; no explicit cleanup is performed in the body.
ImmutableNBestSentencePieceText::~ImmutableNBestSentencePieceText() {}

// Const accessor returning a size_t.
// NOTE(review): presumably the number of n-best entries held -- confirm.
size_t ImmutableNBestSentencePieceText::nbests_size() const {}

// Returns, by value, the ImmutableSentencePieceText selected by `index`.
// NOTE(review): `index` is a signed int; confirm how negative or
// out-of-range values are handled.
ImmutableSentencePieceText ImmutableNBestSentencePieceText::nbests(
    int index) const {}

// Returns, by value, a vector of ImmutableSentencePieceText.
// NOTE(review): presumably one element per n-best entry, in order --
// confirm.
std::vector<ImmutableSentencePieceText>
ImmutableNBestSentencePieceText::nbests() const {}

// Non-const member returning a raw pointer to an NBestSentencePieceText.
// NOTE(review): presumably exposes the wrapped proto for in-place mutation;
// confirm ownership and whether it can return nullptr.
NBestSentencePieceText* ImmutableNBestSentencePieceText::mutable_proto() {}

// Non-const member taking no arguments and returning nothing.
// NOTE(review): judging by the name, converts the span offsets of every
// n-best entry to Unicode-character units -- confirm.
void ImmutableNBestSentencePieceText::ConvertToUnicodeSpans() {}

// Const member returning util::bytes by value.
// NOTE(review): presumably the serialized form of the underlying n-best
// proto -- confirm.
util::bytes ImmutableNBestSentencePieceText::SerializeAsString() const {}

// Default constructor; the body performs no explicit initialization.
SentencePieceProcessor::SentencePieceProcessor() {}
// Destructor; no explicit cleanup is performed in the body.
SentencePieceProcessor::~SentencePieceProcessor() {}

// Load overload taking a file name; reports the outcome as a util::Status.
// NOTE(review): presumably reads a serialized model from `filename` --
// confirm.
util::Status SentencePieceProcessor::Load(absl::string_view filename) {}

// Variant of Load(filename) that returns void instead of a util::Status.
// NOTE(review): the name suggests the process is aborted when loading
// fails -- confirm.
void SentencePieceProcessor::LoadOrDie(absl::string_view filename) {}

// Load overload taking an in-memory ModelProto by const reference.
// NOTE(review): presumably copies `model_proto`, since the caller keeps
// ownership -- confirm.
util::Status SentencePieceProcessor::Load(const ModelProto& model_proto) {}

// Takes a byte view and reports the outcome as a util::Status.
// NOTE(review): presumably parses `serialized` as a ModelProto and loads
// it -- confirm.
util::Status SentencePieceProcessor::LoadFromSerializedProto(
    absl::string_view serialized) {}

// Load overload that takes ownership of the ModelProto: the unique_ptr is
// passed by value, so the caller's pointer is moved from.
util::Status SentencePieceProcessor::Load(
    std::unique_ptr<ModelProto> model_proto) {}

// Takes an option string and reports the outcome as a util::Status.
// NOTE(review): presumably parses `extra_options` and stores the result for
// later Encode calls; the accepted syntax is not visible here -- confirm.
util::Status SentencePieceProcessor::SetEncodeExtraOptions(
    absl::string_view extra_options) {}

// Takes an option string and reports the outcome as a util::Status.
// NOTE(review): presumably parses `extra_options` and stores the result for
// later Decode calls; the accepted syntax is not visible here -- confirm.
util::Status SentencePieceProcessor::SetDecodeExtraOptions(
    absl::string_view extra_options) {}

// Const member returning a util::Status by value.
// NOTE(review): presumably reports whether the processor is in a usable
// (loaded) state -- confirm.
util::Status SentencePieceProcessor::status() const {}

// Takes a list of pieces as string views and reports the outcome as a
// util::Status.
// NOTE(review): presumably restricts the usable vocabulary to
// `valid_vocab` -- confirm.
util::Status SentencePieceProcessor::SetVocabulary(
    const std::vector<absl::string_view>& valid_vocab) {}

// Takes no arguments and reports the outcome as a util::Status.
// NOTE(review): presumably undoes a previous SetVocabulary/LoadVocabulary
// restriction -- confirm.
util::Status SentencePieceProcessor::ResetVocabulary() {}

// Takes a file name and an integer threshold; reports the outcome as a
// util::Status.
// NOTE(review): the file format and the meaning of `threshold` (likely a
// minimum frequency) are not visible here -- confirm.
util::Status SentencePieceProcessor::LoadVocabulary(absl::string_view filename,
                                                    int threshold) {}

// Function-like macro taking one argument, `container`.
// NOTE(review): presumably a precondition guard for the STL-container
// overloads below (early return on bad status / null output) -- confirm.
#define CHECK_OR_RETURN_STATUS_STL(container)

// Function-like macro taking one argument, `proto`.
// NOTE(review): presumably a precondition guard for the proto-output
// overloads below (early return on bad status / null output) -- confirm.
#define CHECK_OR_RETURN_STATUS_PROTO(proto)

//////////////////////////////////////////////////////////////
// Simple API.
// Encode overload whose result is delivered as piece strings through the
// `pieces` out-parameter; the return value carries only the status.
// NOTE(review): confirm whether `pieces` is cleared first and whether
// nullptr is tolerated.
util::Status SentencePieceProcessor::Encode(
    absl::string_view input,
    std::vector<std::string>* pieces) const {}

// Encode overload whose result is delivered as integer ids through the
// `ids` out-parameter; the return value carries only the status.
// NOTE(review): confirm whether `ids` is cleared first and whether nullptr
// is tolerated.
util::Status SentencePieceProcessor::Encode(absl::string_view input,
                                            std::vector<int>* ids) const {}

// Decode overload taking pieces as std::string and writing the result text
// to the `detokenized` out-parameter.
// NOTE(review): confirm whether nullptr is tolerated for `detokenized`.
util::Status SentencePieceProcessor::Decode(
    const std::vector<std::string>& pieces,
    std::string* detokenized) const {}

// Decode overload taking pieces as absl::string_view and writing the result
// text to the `detokenized` out-parameter.
// NOTE(review): confirm whether nullptr is tolerated for `detokenized`.
util::Status SentencePieceProcessor::Decode(
    const std::vector<absl::string_view>& pieces,
    std::string* detokenized) const {}

// Decode overload taking integer ids and writing the result text to the
// `detokenized` out-parameter.
// NOTE(review): confirm how ids outside the vocabulary are handled.
util::Status SentencePieceProcessor::Decode(const std::vector<int>& ids,
                                            std::string* detokenized) const {}

// NBestEncode overload delivering a list of segmentations, each a list of
// piece strings, through the `pieces` out-parameter.
// NOTE(review): presumably yields up to `nbest_size` candidates; ordering
// and handling of non-positive sizes need confirming.
util::Status SentencePieceProcessor::NBestEncode(
    absl::string_view input,
    int nbest_size,
    std::vector<std::vector<std::string>>* pieces) const {}

// NBestEncode overload delivering a list of segmentations, each a list of
// integer ids, through the `ids` out-parameter.
// NOTE(review): presumably yields up to `nbest_size` candidates; ordering
// and handling of non-positive sizes need confirming.
util::Status SentencePieceProcessor::NBestEncode(
    absl::string_view input,
    int nbest_size,
    std::vector<std::vector<int>>* ids) const {}

// SampleEncode overload delivering one segmentation as piece strings
// through the `pieces` out-parameter.
// NOTE(review): the exact roles of `nbest_size` and `alpha` in sampling are
// not visible here -- confirm against the header.
util::Status SentencePieceProcessor::SampleEncode(
    absl::string_view input,
    int nbest_size,
    float alpha,
    std::vector<std::string>* pieces) const {}

// SampleEncode overload delivering one segmentation as integer ids through
// the `ids` out-parameter.
// NOTE(review): the exact roles of `nbest_size` and `alpha` in sampling are
// not visible here -- confirm against the header.
util::Status SentencePieceProcessor::SampleEncode(absl::string_view input,
                                                  int nbest_size,
                                                  float alpha,
                                                  std::vector<int>* ids) const {}

// SampleEncodeAndScore overload delivering (piece strings, float) pairs
// through the `pieces` out-parameter.
// NOTE(review): `wor` presumably means "without replacement" and
// `include_best` presumably forces the best segmentation into the result;
// the meaning of the paired float (a score) also needs confirming.
util::Status SentencePieceProcessor::SampleEncodeAndScore(
    absl::string_view input,
    int num_samples,
    float alpha,
    bool wor,
    bool include_best,
    std::vector<std::pair<std::vector<std::string>, float>>* pieces) const {}

// SampleEncodeAndScore overload delivering (integer ids, float) pairs
// through the `ids` out-parameter.
// NOTE(review): `wor` presumably means "without replacement" and
// `include_best` presumably forces the best segmentation into the result;
// the meaning of the paired float (a score) also needs confirming.
util::Status SentencePieceProcessor::SampleEncodeAndScore(
    absl::string_view input,
    int num_samples,
    float alpha,
    bool wor,
    bool include_best,
    std::vector<std::pair<std::vector<int>, float>>* ids) const {}

// Takes the original input, its normalized form, an index vector
// `norm_to_orig`, and an EncodeResult, and writes into `*spt`.
// NOTE(review): presumably `norm_to_orig` maps positions in `normalized`
// back to positions in `input` so that piece spans can be expressed against
// the original text -- confirm.
util::Status SentencePieceProcessor::PopulateSentencePieceText(
    absl::string_view input,
    absl::string_view normalized,
    const std::vector<size_t>& norm_to_orig,
    const EncodeResult& result,
    SentencePieceText* spt) const {}

// Encode overload that delivers its result as a SentencePieceText through
// the `spt` out-parameter.
// NOTE(review): confirm whether `*spt` is cleared before being filled.
util::Status SentencePieceProcessor::Encode(absl::string_view input,
                                            SentencePieceText* spt) const {}

// NBestEncode overload that delivers its result as an
// NBestSentencePieceText through the `nbest_spt` out-parameter.
// NOTE(review): presumably yields up to `nbest_size` candidates -- confirm.
util::Status SentencePieceProcessor::NBestEncode(
    absl::string_view input,
    int nbest_size,
    NBestSentencePieceText* nbest_spt) const {}

// SampleEncode overload that delivers its result as a SentencePieceText
// through the `spt` out-parameter.
// NOTE(review): the exact roles of `nbest_size` and `alpha` in sampling are
// not visible here -- confirm against the header.
util::Status SentencePieceProcessor::SampleEncode(
    absl::string_view input,
    int nbest_size,
    float alpha,
    SentencePieceText* spt) const {}

// SampleEncodeAndScore overload that delivers its result as an
// NBestSentencePieceText through the `samples_spt` out-parameter.
// Note that the count parameter is named `samples` here, whereas the
// STL-container overloads call it `num_samples`.
// NOTE(review): `wor` presumably means "without replacement" -- confirm.
util::Status SentencePieceProcessor::SampleEncodeAndScore(
    absl::string_view input,
    int samples,
    float alpha,
    bool wor,
    bool include_best,
    NBestSentencePieceText* samples_spt) const {}

// Takes an input string and a float `alpha`; writes a float result through
// the `entropy` out-parameter and reports the outcome as a util::Status.
// NOTE(review): which distribution the entropy is taken over, and how
// `alpha` shapes it, is not visible here -- confirm.
util::Status SentencePieceProcessor::CalculateEntropy(absl::string_view input,
                                                      float alpha,
                                                      float* entropy) const {}

// Decode overload taking pieces as std::string and delivering the result as
// a SentencePieceText through the `spt` out-parameter.
util::Status SentencePieceProcessor::Decode(
    const std::vector<std::string>& pieces,
    SentencePieceText* spt) const {}

// Decode overload taking pieces as absl::string_view and delivering the
// result as a SentencePieceText through the `spt` out-parameter.
util::Status SentencePieceProcessor::Decode(
    const std::vector<absl::string_view>& pieces,
    SentencePieceText* spt) const {}

// Decode overload taking integer ids and delivering the result as a
// SentencePieceText through the `spt` out-parameter.
// NOTE(review): confirm how ids outside the vocabulary are handled.
util::Status SentencePieceProcessor::Decode(const std::vector<int>& ids,
                                            SentencePieceText* spt) const {}

// Function-like macro taking one argument, `value`.
// NOTE(review): presumably makes the accessors below return `value` when
// the processor status is not OK -- confirm.
#define CHECK_STATUS_OR_RETURN_DEFAULT(value)

// Const accessor returning an int.
// NOTE(review): presumably the number of pieces in the loaded vocabulary --
// confirm, including the value returned when no model is loaded.
int SentencePieceProcessor::GetPieceSize() const {}

// Maps a piece string to an int.
// NOTE(review): presumably the vocabulary id of `piece`; confirm what is
// returned for a piece that is not in the vocabulary.
int SentencePieceProcessor::PieceToId(absl::string_view piece) const {}

// Maps an int id to a const reference to a std::string.
// NOTE(review): because a reference is returned, the referenced string must
// outlive the call; confirm what is returned for an out-of-range `id`.
const std::string &SentencePieceProcessor::IdToPiece(int id) const {}

// Maps an int id to a float.
// NOTE(review): presumably the model score of the piece with this id --
// confirm.
float SentencePieceProcessor::GetScore(int id) const {}

// Predicate on an int id.
// NOTE(review): presumably true when the piece is a control symbol --
// confirm.
bool SentencePieceProcessor::IsControl(int id) const {}

// Predicate on an int id.
// NOTE(review): presumably true when the piece is the unknown symbol --
// confirm.
bool SentencePieceProcessor::IsUnknown(int id) const {}

// Predicate on an int id.
// NOTE(review): presumably true when the piece is marked unused -- confirm.
bool SentencePieceProcessor::IsUnused(int id) const {}

// Predicate on an int id.
// NOTE(review): presumably true when the piece is a byte-fallback piece --
// confirm.
bool SentencePieceProcessor::IsByte(int id) const {}

// Const accessor returning an int.
// NOTE(review): presumably the id of the unknown symbol -- confirm.
int SentencePieceProcessor::unk_id() const {}

// Const accessor returning an int.
// NOTE(review): presumably the id of the beginning-of-sentence symbol;
// confirm the value used when that symbol is disabled.
int SentencePieceProcessor::bos_id() const {}

// Const accessor returning an int.
// NOTE(review): presumably the id of the end-of-sentence symbol; confirm
// the value used when that symbol is disabled.
int SentencePieceProcessor::eos_id() const {}

// Const accessor returning an int.
// NOTE(review): presumably the id of the padding symbol; confirm the value
// used when that symbol is disabled.
int SentencePieceProcessor::pad_id() const {}

// Const member function (the const qualifier rules out `static`, despite
// what an earlier comment here claimed).
// Takes a list of ExtraOption values and a mutable SentencePieceText.
// NOTE(review): presumably applies each option to `*spt` in order --
// confirm.
util::Status SentencePieceProcessor::ApplyExtraOptions(
    const std::vector<ExtraOption>& extra_options,
    SentencePieceText* spt) const {}

// Const member function (the const qualifier rules out `static`, despite
// what an earlier comment here claimed).
// Takes an option string and fills the `extra_options` out-parameter.
// NOTE(review): the accepted syntax of `_extra_option` is not visible
// here -- confirm against the header.
util::Status SentencePieceProcessor::ParseExtraOptions(
    absl::string_view _extra_option,
    std::vector<SentencePieceProcessor::ExtraOption>* extra_options) const {}

// Takes a ModelInterface by rvalue reference to unique_ptr, so callers must
// pass a temporary or std::move their pointer.
// NOTE(review): presumably the processor takes ownership of `model` --
// confirm.
void SentencePieceProcessor::SetModel(std::unique_ptr<ModelInterface> &&model) {}

// Takes a normalizer::Normalizer by rvalue reference to unique_ptr, so
// callers must pass a temporary or std::move their pointer.
// NOTE(review): presumably the processor takes ownership of `normalizer` --
// confirm.
void SentencePieceProcessor::SetNormalizer(
    std::unique_ptr<normalizer::Normalizer> &&normalizer) {}

// Const accessor returning a const reference to a ModelProto.
// NOTE(review): because a reference is returned, confirm what it refers to
// when no model has been loaded.
const ModelProto &SentencePieceProcessor::model_proto() const {}

// Const member returning a std::string by value.
// NOTE(review): presumably the serialized bytes of the loaded ModelProto --
// confirm.
std::string SentencePieceProcessor::serialized_model_proto() const {}

// Sets the seed value of the random generator.
// Do not pass static_cast<unsigned int>(-1), as that seed value is reserved
// for initializing from std::random_device.
// This is a declaration only; the definition lives elsewhere.
void SetRandomGeneratorSeed(unsigned int seed);

namespace io {
// Takes a file name and a ModelProto out-parameter; reports the outcome as
// a util::Status.
// NOTE(review): presumably reads `filename` and parses it into
// `*model_proto` -- confirm.
util::Status LoadModelProto(absl::string_view filename,
                            ModelProto* model_proto) {}

// Takes a file name and a ModelProto by const reference; reports the
// outcome as a util::Status.
// NOTE(review): presumably serializes `model_proto` and writes it to
// `filename` -- confirm.
util::Status SaveModelProto(absl::string_view filename,
                            const ModelProto& model_proto) {}
}  // namespace io
}  // namespace sentencepiece