// chromium/third_party/sentencepiece/src/src/unigram_model.cc

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "unigram_model.h"

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <complex>
#include <map>
#include <queue>
#include <random>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "util.h"

namespace sentencepiece {
namespace unigram {
namespace {

// Size of nodes pre-allocated in Lattice.
// NOTE(review): the initializer was stripped ("=;") in this copy; 1024 matches
// the upstream sentencepiece source — TODO confirm against upstream.
constexpr size_t kPreallocateLatticeNodeSize = 1024;

// kUnkPenalty is subtracted (in log-prob space) when emitting the unknown
// piece so UNK is only chosen as a last resort; kEpsilon guards logs/divisions
// against zero. NOTE(review): both initializers were stripped ("=;") in this
// copy; restored to the upstream sentencepiece values — TODO confirm.
constexpr float kUnkPenalty = 10.0;
constexpr float kEpsilon = 1e-7;

// Returns log(exp(x) + exp(y)) without overflowing for large |x|, |y|.
// If init_mode is true, returns log(exp(y)) == y (x is ignored), so
// log(\sum_i exp(a[i])) can be computed as
//   for (int i = 0; i < a.size(); ++i)
//     x = LogSumExp(x, a[i], i == 0);
// NOTE(review): the body was stripped in this copy (non-void, no return =>
// UB); restored with the standard max-factoring identity
// log(e^x + e^y) = vmax + log(1 + e^(vmin - vmax)).
inline float LogSumExp(float x, float y, bool init_mode) {
  if (init_mode) {
    return y;
  }
  const float vmin = std::min(x, y);
  const float vmax = std::max(x, y);
  // Beyond ~50 nats the smaller term is below float precision; skip the exp.
  constexpr float kMinusLogEpsilon = 50;
  if (vmax > vmin + kMinusLogEpsilon) {
    return vmax;
  }
  return vmax + static_cast<float>(
                    std::log(std::exp(static_cast<double>(vmin - vmax)) + 1.0));
}

// Returns a sample from a standard Gumbel distribution.
// If U ~ U[0, 1], then -log(-log U) ~ G(0, 1).
// NOTE(review): the body was stripped in this copy (non-void, no return =>
// UB). Restored with a thread_local standard engine; upstream sentencepiece
// draws U from a process-wide generator (util.h) — swap that back in if
// reproducibility must match the rest of the library.
inline float Gumbel() {
  thread_local std::mt19937 mt(std::random_device{}());
  std::uniform_real_distribution<float> dis(0.0, 1.0);
  // The small offset keeps the inner log away from log(0) when U draws 0.
  const float kEpsilon = 1e-7;
  return -std::log(-std::log(dis(mt) + kEpsilon));
}
}  // namespace

// Pre-allocates the node free-list so typical sentences avoid reallocation.
// NOTE(review): the member-initializer list was garbled ("() :{}") in this
// copy — that is a syntax error. Restored per the upstream source; TODO
// confirm the member name against unigram_model.h.
Lattice::Lattice() : node_allocator_(kPreallocateLatticeNodeSize) {}
// Empty destructor: members release their own storage via their destructors.
Lattice::~Lattice() {}

// Presumably returns the nodes whose pieces begin at position `pos`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
const std::vector<Lattice::Node *> &Lattice::begin_nodes(int pos) const {}

// Presumably returns the nodes whose pieces end at position `pos`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
const std::vector<Lattice::Node *> &Lattice::end_nodes(int pos) const {}

// Presumably returns the sentence length in Unicode characters.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
int Lattice::size() const {}

// Presumably returns the sentence length in UTF-8 bytes.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
int Lattice::utf8_size() const {}

// Presumably returns a pointer to the start of the current sentence.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
const char *Lattice::sentence() const {}

// Presumably returns a pointer into the sentence at character position `pos`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
const char *Lattice::surface(int pos) const {}

// Presumably returns the beginning-of-sentence sentinel node.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
Lattice::Node *Lattice::bos_node() const {}

// Presumably returns the end-of-sentence sentinel node.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
Lattice::Node *Lattice::eos_node() const {}

// Presumably allocates a fresh node from the lattice's free-list allocator.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
Lattice::Node *Lattice::NewNode() {}

// Presumably resets the lattice so it can be reused for a new sentence.
// NOTE(review): body stripped in this copy — a no-op as written; restore from upstream.
void Lattice::Clear() {}

// Presumably installs `sentence` and builds the per-position node tables.
// NOTE(review): body stripped in this copy — a no-op as written; restore from upstream.
void Lattice::SetSentence(absl::string_view sentence) {}

// Presumably inserts a node covering `length` characters starting at `pos`
// and returns it.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
Lattice::Node *Lattice::Insert(int pos, int length) {}

// Presumably runs Viterbi search over the lattice, returning the best-scoring
// path and its score.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
Lattice::LatticePathWithScore Lattice::Viterbi() {}

// Presumably computes forward (alpha) log-probabilities per node at inverse
// temperature `inv_theta`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
std::vector<float> Lattice::ForwardAlgorithm(float inv_theta) const {}

// Presumably computes backward (beta) log-probabilities per node at inverse
// temperature `inv_theta`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
std::vector<float> Lattice::BackwardAlgorithm(float inv_theta) const {}

// Presumably accumulates each piece's expected frequency (scaled by `freq`)
// into `expected` and returns the sentence's marginal log-likelihood.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
float Lattice::PopulateMarginal(float freq,
                                std::vector<float>* expected) const {}

// Presumably returns the entropy of the segmentation distribution at inverse
// temperature `inv_theta`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
float Lattice::CalculateEntropy(float inv_theta) const {}

namespace {

// The node structure to support A* algorithm in Lattice::NBest()
// NOTE(review): the members were stripped in this copy; restore from upstream.
struct Hypothesis {};

// Helper function for cloning a Hypothesis and the ones on their next paths.
// The graph structure is preserved.
//
//   to_clone:  the Hypothesis to clone.
//   clone_map: mapping between the old pointers and the new pointers.
//   allocator: allocate and own the cloned Hypothesis.
//
// Returns the cloned Hypothesis*. All Hypothesis on its "next" chain are also
// guaranteed to have been allocated via "allocator", and "clone_map" is updated
// with all new mappings.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
Hypothesis* CloneHypAndDependents(
    const Hypothesis* to_clone,
    absl::flat_hash_map<const Hypothesis*, Hypothesis*>* clone_map,
    model::FreeList<Hypothesis>* allocator) {}

}  // namespace

// Presumably returns up to `nbest_size` segmentations ordered by score,
// optionally sampling (with `inv_theta`) instead of exact A* enumeration.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
std::vector<Lattice::LatticePathWithScore> Lattice::NBest(size_t nbest_size,
                                                          bool sample,
                                                          float inv_theta) {}

// Presumably samples one path from the lattice's segmentation distribution at
// inverse temperature `inv_theta`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
std::vector<Lattice::Node*> Lattice::Sample(float inv_theta) {}

// Model::Model() {}
// Model::~Model() {}

// Presumably inserts every vocabulary piece matching the sentence into
// `lattice`.
// NOTE(review): body stripped in this copy — a no-op as written; restore from upstream.
void Model::PopulateNodes(Lattice *lattice) const {}

// Presumably maps a piece string to its vocabulary id.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
int Model::PieceToId(absl::string_view piece) const {}

// Presumably builds the piece-matching trie from (piece, id) pairs.
// NOTE(review): body stripped in this copy — a no-op as written; restore from upstream.
void Model::BuildTrie(std::vector<std::pair<absl::string_view, int>> *pieces) {}

// Presumably initializes the model from the proto (vocabulary, trie, scores).
// NOTE(review): body stripped in this copy — a no-op as written; restore from upstream.
Model::Model(const ModelProto &model_proto) {}

// Empty destructor: members release their own storage via their destructors.
Model::~Model() {}

// Presumably returns the best (Viterbi) segmentation of `normalized`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
EncodeResult Model::Encode(absl::string_view normalized) const {}

// Presumably returns the `nbest_size` best segmentations with their scores.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
NBestEncodeResult Model::NBestEncode(absl::string_view normalized,
                                     int nbest_size) const {}

// Presumably returns one segmentation sampled at inverse temperature
// `inv_theta`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
EncodeResult Model::SampleEncode(absl::string_view normalized,
                                 float inv_theta) const {}

// Presumably draws `samples` segmentations (without replacement when `wor`,
// optionally including the Viterbi best) together with their scores.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
NBestEncodeResult Model::SampleEncodeAndScore(absl::string_view normalized,
                                              float inv_theta,
                                              int samples,
                                              bool wor,
                                              bool include_best) const {}

// Presumably returns the entropy of the segmentation distribution for
// `normalized` at inverse temperature `inv_theta`.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
float Model::CalculateEntropy(absl::string_view normalized,
                              float inv_theta) const {}

// Presumably checks that two serialized encode outputs are equivalent.
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
bool Model::VerifyOutputsEquivalent(absl::string_view expected,
                                    absl::string_view actual) const {}

// Presumably a faster encoding path equivalent to Encode().
// NOTE(review): body stripped (non-void, no return => UB); restore from upstream.
EncodeResult Model::EncodeOptimized(absl::string_view normalized) const {}
}  // namespace unigram
}  // namespace sentencepiece