#include "builder.h"
#include <algorithm>
#include <functional>
#include <utility>
#include "absl/strings/str_join.h"
#include "absl/strings/str_replace.h"
#include "absl/strings/str_split.h"
#include "absl/strings/strip.h"
#include "filesystem.h"
#ifdef ENABLE_NFKC_COMPILE
#include <unicode/errorcode.h>
#include <unicode/locid.h>
#include <unicode/normlzr.h>
#include <unicode/numfmt.h>
#include <unicode/rbnf.h>
#include <unicode/utypes.h>
#endif
#include <set>
#include "normalization_rule.h"
#include "normalizer.h"
#include "third_party/darts_clone/darts.h"
#include "util.h"
namespace sentencepiece {
namespace normalizer {
namespace {
constexpr int kMaxUnicode = …;
static constexpr char kDefaultNormalizerName[] = …;
#ifndef ENABLE_NFKC_COMPILE
static constexpr char kCompileError[] = …;
#endif
#ifdef ENABLE_NFKC_COMPILE
// Applies the ICU normalization form `mode` to `input` and returns the
// normalized sequence of code points.
//
// The input is round-tripped through UTF-8 because ICU operates on
// icu::UnicodeString. Dies via CHECK when `input` encodes to an empty UTF-8
// string or when ICU reports a failure status.
Builder::Chars UnicodeNormalize(UNormalizationMode mode,
                                const Builder::Chars &input) {
  const std::string utf8 = string_util::UnicodeTextToUTF8(input);
  CHECK(!utf8.empty());
  // Pass the byte length explicitly. Constructing the StringPiece from
  // c_str() measures the buffer with strlen(), which would silently truncate
  // the text at an embedded NUL byte.
  const icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(
      icu::StringPiece(utf8.data(), static_cast<int32_t>(utf8.size())));
  UErrorCode status = U_ZERO_ERROR;
  icu::UnicodeString dst;
  icu::Normalizer::normalize(ustr, mode, 0, dst, status);
  CHECK(U_SUCCESS(status));
  std::string normalized;
  // A single UTF-16 code unit never needs more than three UTF-8 bytes.
  normalized.reserve(dst.length() * 3);
  dst.toUTF8String(normalized);
  return string_util::UTF8ToUnicodeText(normalized);
}
// Returns `input` normalized with ICU's UNORM_NFKD mode.
Builder::Chars ToNFKD(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFKD;
  return UnicodeNormalize(kMode, input);
}
// Returns `input` normalized with ICU's UNORM_NFKC mode.
Builder::Chars ToNFKC(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFKC;
  return UnicodeNormalize(kMode, input);
}
// Returns `input` normalized with ICU's UNORM_NFC mode.
Builder::Chars ToNFC(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFC;
  return UnicodeNormalize(kMode, input);
}
// Returns `input` normalized with ICU's UNORM_NFD mode.
Builder::Chars ToNFD(const Builder::Chars &input) {
  constexpr UNormalizationMode kMode = UNORM_NFD;
  return UnicodeNormalize(kMode, input);
}
// Enumerates every sequence obtained by replacing each element of `nfkd`
// with one of the code points that `norm2orig` associates with it.
//
// The result is the Cartesian product of the per-position candidate sets,
// built left to right, so every returned sequence has nfkd.size() elements
// and the last position varies fastest.
//
// Dies via CHECK when `nfkd` is empty or when the expansion yields no
// sequence at all (i.e. some candidate set is empty). Lookups go through
// port::FindOrDie; NOTE(review): presumably that aborts on a missing key --
// confirm against its definition.
std::vector<Builder::Chars> ExpandUnnormalized(
    const Builder::Chars &nfkd,
    const std::map<char32, std::set<char32>> &norm2orig) {
  CHECK(!nfkd.empty());

  // Seed with one single-element sequence per candidate of the first char.
  const auto &first = port::FindOrDie(norm2orig, nfkd[0]);
  std::vector<Builder::Chars> results;
  results.reserve(first.size());
  for (const auto c : first) {
    results.push_back({c});
  }

  // Extend every partial sequence with every candidate of the next char.
  for (size_t i = 1; i < nfkd.size(); ++i) {
    const auto &orig = port::FindOrDie(norm2orig, nfkd[i]);
    std::vector<Builder::Chars> new_results;
    new_results.reserve(results.size() * orig.size());
    for (const auto &r : results) {
      for (const auto c : orig) {
        new_results.emplace_back(r);
        new_results.back().push_back(c);
      }
    }
    results = std::move(new_results);
  }

  // An empty candidate set leaves `results` empty; fail loudly instead of
  // indexing past the end of the vector below.
  CHECK(!results.empty());
  CHECK_EQ(nfkd.size(), results[0].size());
  return results;
}
#endif
Builder::Chars Normalize(const Builder::CharsMap &chars_map,
const Builder::Chars &src, int max_len) { … }
}
util::Status Builder::CompileCharsMap(const CharsMap& chars_map,
std::string* output) { … }
util::Status Builder::DecompileCharsMap(absl::string_view blob,
Builder::CharsMap* chars_map) { … }
util::Status Builder::GetPrecompiledCharsMap(absl::string_view name,
std::string* output) { … }
util::Status Builder::BuildNFKCMap(CharsMap* chars_map) { … }
util::Status Builder::BuildNmtNFKCMap(CharsMap* chars_map) { … }
util::Status Builder::MergeUnicodeCaseFoldMap(Builder::CharsMap* chars_map) { … }
util::Status Builder::BuildNFKC_CFMap(CharsMap* chars_map) { … }
util::Status Builder::BuildNmtNFKC_CFMap(CharsMap* chars_map) { … }
util::Status Builder::BuildNFKDMap(CharsMap* chars_map) { … }
util::Status Builder::LoadCharsMap(absl::string_view filename,
CharsMap* chars_map) { … }
util::Status Builder::SaveCharsMap(absl::string_view filename,
const Builder::CharsMap& chars_map) { … }
util::Status Builder::RemoveRedundantMap(CharsMap* chars_map) { … }
}
}