chromium/third_party/sentencepiece/src/src/sentencepiece_processor.h

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!

#ifndef SENTENCEPIECE_PROCESSOR_H_
#define SENTENCEPIECE_PROCESSOR_H_

#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "absl/status/status.h"

#ifndef SWIG
namespace absl {
string_view;
}  // namespace absl
#endif  // SWIG

namespace sentencepiece {
namespace util {

StatusCode;
Status;
}  // namespace util

// SentencePieceProcessor:
// Simple and language independent tokenizer and de-tokenizer for
// Neural Network Machine Translation.
//
// SentencePieceProcessor provides Encode() and Decode() methods,
// which correspond to tokenization and de-tokenization respectively.
//
// - Encode:
//   Given a raw source sentence, encode it into a sequence
//   of pieces or vocabulary ids.
//
// - Decode:
//   Given a sequence of pieces or vocabulary ids, decode it
//   into a de-tokenized raw sentence.
//
// SentencePieceProcessor provides a lossless data conversion
// that allows the original raw sentence to be perfectly reconstructed
// from the encoded data, i.e., Decode(Encode(input)) == input.
// This characteristics is useful, as we can make the de-tokenization
// completely language independent.
//
// Usage:
//   SentencePieceProcessor sp;
//   sp.Load("//path/to/model");
//
//   vector<string> sps;
//   sp.Encode("hello world.", &sps).IgnoreError();
//
//   vector<int> ids;
//   sp.Encode("hello world.", &ids).IgnoreError();
//
//   string detok;
//   sp.Decode(sps, &detok);
//   CHECK_EQ("hello world.", detok).IgnoreError();
//
//   sp.Decode(ids, &detok);
//   CHECK_EQ("hello world.", detok).IgnoreError();
//
//  We can also use SentencePieceText which manages the byte-offsets
//  between user input (output) and internal sentence pieces.
//
//   SentencePieceText spt;
//   sp.Encode("hello world.", &spt);
//   // Emits the byte range of each piece.
//   for (const auto &piece : spt.pieces()) {
//      LOG(INFO) << piece.begin() << " " << piece.end();
//   }
//
//   sp.Decode({0, 1, 2, 3..}, &spt);
//   for (const auto &piece : spt.pieces()) {
//      LOG(INFO) << piece.begin() << " " << piece.end();
//   }
//

class NBestSentencePieceText;
class ModelInterface;
class SentencePieceText;
class ModelProto;

namespace normalizer {
class Normalizer;
}  // namespace normalizer

#ifndef SWIGGO
namespace util {
// Redefine std::string for serialized_proto interface as Python's string is
// a Unicode string. We can enforce the return value to be raw byte sequence
// with SWIG's typemap.
bytes;
}  // namespace util
#endif  // SWIGGO

class NBestSentencePieceText;
class ModelInterface;
class SentencePieceText;
class SentencePieceText_SentencePiece;

// Wrapper class of SentencePieceText
// This wrapper only allows an immutable access to the proto and
// hides the actual implementation of protobuf.
// See sentencepiece.proto for the details of this class.
class ImmutableSentencePieceText_ImmutableSentencePiece {};

class ImmutableSentencePieceText {};

// Wrapper class of SentencePieceText
// This wrapper only allows an immutable access to the proto and
// hides the actual implementation of protobuf.
// See sentencepiece.proto for the details of this class.
class ImmutableNBestSentencePieceText {};

class SentencePieceProcessor {};

// Set seed value of random generator.
// Do not set static_cast<unique_int>(-1),
// as this seed is reserved for initializing from
// std::random_device.
void SetRandomGeneratorSeed(unsigned int seed);

// IO related functions to absorb model formats.
namespace io {
// Loads `model_proto` from `filename`.
// We can instantiate SentencePieceProcessor as follows:
//
//  auto model_proto = absl::make_unique<ModelProto>();
//  io::LoadModelProto("//path/spm.model", model_proto.get());
//  SentencePieceProcessor sp;
//  CHECK_OK(sp.Load(std::move(model_proto)));
util::Status LoadModelProto(absl::string_view, ModelProto* model_proto);

// Saves `model_proto` as `filename`.
util::Status SaveModelProto(absl::string_view, const ModelProto& model_proto);
}  // namespace io
}  // namespace sentencepiece
#endif  // SENTENCEPIECE_PROCESSOR_H_