chromium/third_party/sentencepiece/src/src/builder.cc

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "builder.h"

#include <algorithm>
#include <functional>
#include <utility>

#include "absl/strings/str_join.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "filesystem.h"

#ifdef ENABLE_NFKC_COMPILE
#include <unicode/errorcode.h>
#include <unicode/locid.h>
#include <unicode/normlzr.h>
#include <unicode/numfmt.h>
#include <unicode/rbnf.h>
#include <unicode/utypes.h>
#endif  // ENABLE_NFKC_COMPILE

#include <set>

#include "normalization_rule.h"
#include "normalizer.h"
#include "third_party/darts_clone/darts.h"
#include "util.h"

namespace sentencepiece {
namespace normalizer {
namespace {

// NOTE(review): presumably the largest Unicode code point considered when
// building normalization maps -- confirm against the initializer.
constexpr int kMaxUnicode =;

// NOTE(review): presumably the name of the normalization rule used by
// default -- confirm against the initializer and its users.
static constexpr char kDefaultNormalizerName[] =;

#ifndef ENABLE_NFKC_COMPILE
// Only defined when ENABLE_NFKC_COMPILE is not set.
// NOTE(review): presumably the error message reported when NFKC compilation
// support is unavailable -- confirm.
static constexpr char kCompileError[] =;
#endif

#ifdef ENABLE_NFKC_COMPILE
// Normalizes `input` with ICU's normalizer using normalization `mode` and
// returns the result as code points.
//
// `input` is converted to UTF-8, handed to icu::Normalizer::normalize(), and
// the normalized text is converted back. Dies (CHECK) if `input` encodes to
// an empty UTF-8 string or if ICU reports a failure status.
Builder::Chars UnicodeNormalize(UNormalizationMode mode,
                                const Builder::Chars &input) {
  const std::string utf8 = string_util::UnicodeTextToUTF8(input);
  CHECK(!utf8.empty());

  // Pass an explicit length: building the StringPiece from `utf8.c_str()`
  // would stop at the first NUL byte and silently drop the rest of the input.
  const icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(
      icu::StringPiece(utf8.data(), static_cast<int32_t>(utf8.size())));

  UErrorCode status = U_ZERO_ERROR;
  icu::UnicodeString dst;
  icu::Normalizer::normalize(ustr, mode, 0, dst, status);
  CHECK(U_SUCCESS(status));

  std::string normalized;
  // `dst.length()` counts UTF-16 code units; each one expands to at most
  // three UTF-8 bytes.
  normalized.reserve(dst.length() * 3);
  dst.toUTF8String(normalized);
  return string_util::UTF8ToUnicodeText(normalized);
}

// Returns `input` normalized by UnicodeNormalize() in UNORM_NFKD mode.
Builder::Chars ToNFKD(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFKD;
  return UnicodeNormalize(kMode, input);
}

// Returns `input` normalized by UnicodeNormalize() in UNORM_NFKC mode.
Builder::Chars ToNFKC(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFKC;
  return UnicodeNormalize(kMode, input);
}

// Returns `input` normalized by UnicodeNormalize() in UNORM_NFC mode.
Builder::Chars ToNFC(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFC;
  return UnicodeNormalize(kMode, input);
}

// Returns `input` normalized by UnicodeNormalize() in UNORM_NFD mode.
Builder::Chars ToNFD(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFD;
  return UnicodeNormalize(kMode, input);
}

// Given an NFKD-normalized string `nfkd`, returns all strings which are
// normalized into the same `nfkd`. `norm2orig` is the normalized to
// un-normalized character mapping.
//
// The result is built position by position: every partial string produced so
// far is extended with each un-normalized candidate of the next character, so
// every returned string has the same length as `nfkd`.
//
// `nfkd` must be non-empty (CHECK). Each character of `nfkd` is looked up in
// `norm2orig` with port::FindOrDie, and the function dies if the expansion
// ends up with no result at all.
std::vector<Builder::Chars> ExpandUnnormalized(
    const Builder::Chars &nfkd,
    const std::map<char32, std::set<char32>> &norm2orig) {
  CHECK(!nfkd.empty());
  std::vector<Builder::Chars> results;
  // Seed with one single-character string per candidate of the first char.
  for (const auto c : port::FindOrDie(norm2orig, nfkd[0])) {
    results.push_back({c});
  }
  for (size_t i = 1; i < nfkd.size(); ++i) {
    const auto &orig = port::FindOrDie(norm2orig, nfkd[i]);
    std::vector<Builder::Chars> new_results;
    // The size of the next generation is known up front.
    new_results.reserve(results.size() * orig.size());
    for (const auto &r : results) {
      for (const auto c : orig) {
        new_results.emplace_back(r);
        new_results.back().push_back(c);
      }
    }
    results = std::move(new_results);
  }
  // An empty candidate set in `norm2orig` would leave `results` empty; fail
  // loudly instead of indexing into an empty vector below.
  CHECK(!results.empty());
  CHECK_EQ(nfkd.size(), results[0].size());
  return results;
}
#endif

// Normalizes `src` with `chars_map` and returns the normalized Chars.
// `max_len` specifies the maximum length of a key in `chars_map`.
// NOTE(review): how `max_len` bounds the lookup into `chars_map` is not
// documented here -- confirm against the implementation.
Builder::Chars Normalize(const Builder::CharsMap &chars_map,
                         const Builder::Chars &src, int max_len) {}
}  // namespace

// static
// Takes `chars_map` by const reference and a `std::string*` `output`, and
// returns a util::Status.
// NOTE(review): presumably serializes `chars_map` into a compiled binary form
// stored in `*output` -- confirm against the implementation.
util::Status Builder::CompileCharsMap(const CharsMap& chars_map,
                                      std::string* output) {}

// static
// Takes a string view `blob` and a `CharsMap*` `chars_map`, and returns a
// util::Status.
// NOTE(review): presumably the inverse of CompileCharsMap(), decoding `blob`
// into `*chars_map` -- confirm against the implementation.
util::Status Builder::DecompileCharsMap(absl::string_view blob,
                                        Builder::CharsMap* chars_map) {}

// static
// Takes a rule `name` and a `std::string*` `output`, and returns a
// util::Status.
// NOTE(review): presumably looks up the precompiled chars map registered
// under `name` and stores it in `*output` -- confirm against the
// implementation.
util::Status Builder::GetPrecompiledCharsMap(absl::string_view name,
                                             std::string* output) {}

// static
// Takes a `CharsMap*` `chars_map` and returns a util::Status.
// NOTE(review): presumably fills `*chars_map` with NFKC normalization rules,
// likely relying on the ICU helpers guarded by ENABLE_NFKC_COMPILE -- confirm
// against the implementation.
util::Status Builder::BuildNFKCMap(CharsMap* chars_map) {}

// Takes a `CharsMap*` `chars_map` and returns a util::Status.
// NOTE(review): presumably builds an NFKC-based map with additional
// NMT-specific rules into `*chars_map` -- confirm against the implementation.
util::Status Builder::BuildNmtNFKCMap(CharsMap* chars_map) {}

// static
// Takes a `CharsMap*` `chars_map` and returns a util::Status.
// NOTE(review): presumably merges Unicode case-folding rules into the
// existing contents of `*chars_map` -- confirm against the implementation.
util::Status Builder::MergeUnicodeCaseFoldMap(Builder::CharsMap* chars_map) {}

// static
// Takes a `CharsMap*` `chars_map` and returns a util::Status.
// NOTE(review): presumably builds the NFKC map combined with Unicode case
// folding into `*chars_map` -- confirm against the implementation.
util::Status Builder::BuildNFKC_CFMap(CharsMap* chars_map) {}

// static
// Takes a `CharsMap*` `chars_map` and returns a util::Status.
// NOTE(review): presumably builds the NMT NFKC map combined with Unicode case
// folding into `*chars_map` -- confirm against the implementation.
util::Status Builder::BuildNmtNFKC_CFMap(CharsMap* chars_map) {}

// static
// Takes a `CharsMap*` `chars_map` and returns a util::Status.
// NOTE(review): presumably fills `*chars_map` with NFKD normalization rules
// -- confirm against the implementation.
util::Status Builder::BuildNFKDMap(CharsMap* chars_map) {}

// static
// Takes a `filename` and a `CharsMap*` `chars_map`, and returns a
// util::Status.
// NOTE(review): presumably reads a chars map from the file at `filename`
// into `*chars_map`; the file format is not visible here -- confirm against
// the implementation.
util::Status Builder::LoadCharsMap(absl::string_view filename,
                                   CharsMap* chars_map) {}

// static
// Takes a `filename` and a const reference to `chars_map`, and returns a
// util::Status.
// NOTE(review): presumably writes `chars_map` to the file at `filename` in
// the format LoadCharsMap() reads -- confirm against the implementation.
util::Status Builder::SaveCharsMap(absl::string_view filename,
                                   const Builder::CharsMap& chars_map) {}

// static
// Takes a `CharsMap*` `chars_map` and returns a util::Status.
// NOTE(review): presumably removes entries from `*chars_map` that are
// redundant; what counts as redundant is not visible here -- confirm against
// the implementation.
util::Status Builder::RemoveRedundantMap(CharsMap* chars_map) {}
}  // namespace normalizer
}  // namespace sentencepiece