// Copyright 2016 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef SENTENCEPIECE_PROCESSOR_H_ #define SENTENCEPIECE_PROCESSOR_H_ #include <cstring> #include <memory> #include <string> #include <string_view> #include <utility> #include <vector> #include "absl/status/status.h" #ifndef SWIG namespace absl { string_view; } // namespace absl #endif // SWIG namespace sentencepiece { namespace util { StatusCode; Status; } // namespace util // SentencePieceProcessor: // Simple and language independent tokenizer and de-tokenizer for // Neural Network Machine Translation. // // SentencePieceProcessor provides Encode() and Decode() methods, // which correspond to tokenization and de-tokenization respectively. // // - Encode: // Given a raw source sentence, encode it into a sequence // of pieces or vocabulary ids. // // - Decode: // Given a sequence of pieces or vocabulary ids, decode it // into a de-tokenized raw sentence. // // SentencePieceProcessor provides a lossless data conversion // that allows the original raw sentence to be perfectly reconstructed // from the encoded data, i.e., Decode(Encode(input)) == input. // This characteristic is useful, as we can make the de-tokenization // completely language independent. 
// // Usage: // SentencePieceProcessor sp; // sp.Load("//path/to/model"); // // vector<string> sps; // sp.Encode("hello world.", &sps).IgnoreError(); // // vector<int> ids; // sp.Encode("hello world.", &ids).IgnoreError(); // // string detok; // sp.Decode(sps, &detok).IgnoreError(); // CHECK_EQ("hello world.", detok); // // sp.Decode(ids, &detok).IgnoreError(); // CHECK_EQ("hello world.", detok); // // We can also use SentencePieceText which manages the byte-offsets // between user input (output) and internal sentence pieces. // // SentencePieceText spt; // sp.Encode("hello world.", &spt); // // Emits the byte range of each piece. // for (const auto &piece : spt.pieces()) { // LOG(INFO) << piece.begin() << " " << piece.end(); // } // // sp.Decode({0, 1, 2, 3..}, &spt); // for (const auto &piece : spt.pieces()) { // LOG(INFO) << piece.begin() << " " << piece.end(); // } // class NBestSentencePieceText; class ModelInterface; class SentencePieceText; class ModelProto; namespace normalizer { class Normalizer; } // namespace normalizer #ifndef SWIGGO namespace util { // Redefine std::string for serialized_proto interface as Python's string is // a Unicode string. We can enforce the return value to be raw byte sequence // with SWIG's typemap. bytes; } // namespace util #endif // SWIGGO class NBestSentencePieceText; class ModelInterface; class SentencePieceText; class SentencePieceText_SentencePiece; // Wrapper class of SentencePieceText // This wrapper only allows immutable access to the proto and // hides the actual implementation of protobuf. // See sentencepiece.proto for the details of this class. class ImmutableSentencePieceText_ImmutableSentencePiece { … }; class ImmutableSentencePieceText { … }; // Wrapper class of NBestSentencePieceText // This wrapper only allows immutable access to the proto and // hides the actual implementation of protobuf. // See sentencepiece.proto for the details of this class. 
class ImmutableNBestSentencePieceText { … }; class SentencePieceProcessor { … }; // Sets the seed value of the random generator. // Do not set static_cast<unsigned int>(-1), // as this seed is reserved for initializing from // std::random_device. void SetRandomGeneratorSeed(unsigned int seed); // IO related functions to absorb model formats. namespace io { // Loads `model_proto` from `filename`. // We can instantiate SentencePieceProcessor as follows: // // auto model_proto = absl::make_unique<ModelProto>(); // io::LoadModelProto("//path/spm.model", model_proto.get()); // SentencePieceProcessor sp; // CHECK_OK(sp.Load(std::move(model_proto))); util::Status LoadModelProto(absl::string_view, ModelProto* model_proto); // Saves `model_proto` as `filename`. util::Status SaveModelProto(absl::string_view, const ModelProto& model_proto); } // namespace io } // namespace sentencepiece #endif // SENTENCEPIECE_PROCESSOR_H_