// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!
#include <vector>
#include "builder.h"
#include "normalizer.h"
#include "sentencepiece_trainer.h"
#include "testharness.h"
#include "util.h"
namespace sentencepiece {
namespace normalizer {
namespace {
// Space symbol
#define WS "\xe2\x96\x81"
// Replacement char
#define RC "\xEF\xBF\xBD"
NormalizerSpec MakeDefaultSpec() {
return SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
}
} // namespace
TEST(NormalizerTest, NormalizeTest) {
auto spec = MakeDefaultSpec();
const Normalizer normalizer(spec);
// Empty strings.
EXPECT_EQ("", normalizer.Normalize(""));
EXPECT_EQ("", normalizer.Normalize(" "));
EXPECT_EQ("", normalizer.Normalize(" "));
// Sentence with heading/tailing/redundant spaces.
EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC"));
EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC "));
EXPECT_EQ(WS "A" WS "B" WS "C", normalizer.Normalize(" A B C "));
EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC "));
EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC "));
EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC"));
EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC "));
// NFKC char to char normalization.
EXPECT_EQ(WS "123", normalizer.Normalize("①②③"));
// NFKC char to multi-char normalization.
EXPECT_EQ(WS "株式会社", normalizer.Normalize("㍿"));
// Half width katakana, character composition happens.
EXPECT_EQ(WS "グーグル", normalizer.Normalize(" グーグル "));
EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl",
normalizer.Normalize(" I saw a girl "));
// Remove control chars.
EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(0x7F)));
EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(0x8F)));
EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(0x9F)));
EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(0x0B)));
for (char32 c = 0x10; c <= 0x1F; ++c) {
EXPECT_EQ("", normalizer.Normalize(string_util::UnicodeCharToUTF8(c)));
}
}
TEST(NormalizerTest, NormalizeWithoutDummyPrefixTest) {
auto spec = MakeDefaultSpec();
spec.set_add_dummy_prefix(false);
const Normalizer normalizer(spec);
// Empty strings.
EXPECT_EQ("", normalizer.Normalize(""));
EXPECT_EQ("", normalizer.Normalize(" "));
EXPECT_EQ("", normalizer.Normalize(" "));
// Sentence with heading/tailing/redundant spaces.
EXPECT_EQ("ABC", normalizer.Normalize("ABC"));
EXPECT_EQ("ABC", normalizer.Normalize(" ABC "));
EXPECT_EQ("A" WS "B" WS "C", normalizer.Normalize(" A B C "));
EXPECT_EQ("ABC", normalizer.Normalize(" ABC "));
EXPECT_EQ("ABC", normalizer.Normalize(" ABC "));
EXPECT_EQ("ABC", normalizer.Normalize(" ABC"));
EXPECT_EQ("ABC", normalizer.Normalize(" ABC "));
}
TEST(NormalizerTest, NormalizeTreatWSAsSuffixTest) {
auto spec = MakeDefaultSpec();
TrainerSpec trainer_spec;
trainer_spec.set_treat_whitespace_as_suffix(true);
const Normalizer normalizer(spec, trainer_spec);
EXPECT_EQ("", normalizer.Normalize(""));
EXPECT_EQ("", normalizer.Normalize(" "));
EXPECT_EQ("", normalizer.Normalize(" "));
EXPECT_EQ("ABC" WS, normalizer.Normalize("ABC"));
EXPECT_EQ("ABC" WS, normalizer.Normalize(" ABC "));
EXPECT_EQ("A" WS "B" WS "C" WS, normalizer.Normalize(" A B C "));
EXPECT_EQ("ABC" WS, normalizer.Normalize(" ABC "));
}
TEST(NormalizerTest, NormalizeWithoutRemoveExtraWhitespacesTest) {
auto spec = MakeDefaultSpec();
spec.set_remove_extra_whitespaces(false);
const Normalizer normalizer(spec);
// Empty strings.
EXPECT_EQ("", normalizer.Normalize(""));
EXPECT_EQ(WS WS WS WS WS WS WS, normalizer.Normalize(" "));
EXPECT_EQ(WS WS, normalizer.Normalize(" "));
// Sentence with heading/tailing/redundant spaces.
EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC"));
EXPECT_EQ(WS WS "ABC" WS, normalizer.Normalize(" ABC "));
EXPECT_EQ(WS WS WS "A" WS WS "B" WS WS "C" WS WS,
normalizer.Normalize(" A B C "));
}
TEST(NormalizerTest, NormalizeWithoutEscapeWhitespacesTest) {
auto spec = MakeDefaultSpec();
spec.set_add_dummy_prefix(false);
spec.set_remove_extra_whitespaces(true);
spec.set_escape_whitespaces(false);
const Normalizer normalizer(spec);
// Empty strings.
EXPECT_EQ("", normalizer.Normalize(""));
EXPECT_EQ("", normalizer.Normalize(" "));
EXPECT_EQ("", normalizer.Normalize(" "));
// Sentence with heading/tailing/redundant spaces.
EXPECT_EQ("ABC", normalizer.Normalize("ABC"));
EXPECT_EQ("ABC", normalizer.Normalize(" ABC "));
EXPECT_EQ("A B C", normalizer.Normalize(" A B C "));
EXPECT_EQ("A B C", normalizer.Normalize("A B C"));
}
TEST(NormalizeTest, NomalizeWithSpaceContainedRules) {
Builder::CharsMap charsmap;
auto AddRule = [&](const std::string &src, const std::string &trg) {
Builder::Chars src_chars, trg_chars;
for (const char32 c : string_util::UTF8ToUnicodeText(src)) {
src_chars.push_back(c);
}
for (const char32 c : string_util::UTF8ToUnicodeText(trg)) {
trg_chars.push_back(c);
}
charsmap[src_chars] = trg_chars;
};
// Adds rules containing whitespaes.
AddRule("a", " A");
AddRule("b", "B");
AddRule("c", "D E");
AddRule("d", " F G ");
NormalizerSpec spec;
EXPECT_TRUE(
Builder::CompileCharsMap(charsmap, spec.mutable_precompiled_charsmap())
.ok());
// Test default behavior
{
const Normalizer normalizer(spec);
EXPECT_EQ(WS "A", normalizer.Normalize("a"));
EXPECT_EQ(WS "B" WS "A", normalizer.Normalize("ba"));
EXPECT_EQ(WS "D" WS "E", normalizer.Normalize("c"));
EXPECT_EQ(WS "F" WS "G" WS "A", normalizer.Normalize("da"));
EXPECT_EQ(WS "A" WS "F" WS "G", normalizer.Normalize("ad"));
EXPECT_EQ(WS "A" WS "F" WS "G" WS "B", normalizer.Normalize("adb"));
}
spec.set_escape_whitespaces(false);
{
spec.set_add_dummy_prefix(false);
spec.set_remove_extra_whitespaces(true);
const Normalizer normalizer(spec);
EXPECT_EQ("A", normalizer.Normalize("a"));
EXPECT_EQ("B A", normalizer.Normalize("ba"));
EXPECT_EQ("D E", normalizer.Normalize("c"));
EXPECT_EQ("F G A", normalizer.Normalize("da"));
EXPECT_EQ("A F G", normalizer.Normalize("ad"));
EXPECT_EQ("A F G B", normalizer.Normalize("adb"));
}
{
spec.set_add_dummy_prefix(false);
spec.set_remove_extra_whitespaces(false);
const Normalizer normalizer(spec);
EXPECT_EQ(" A", normalizer.Normalize("a"));
EXPECT_EQ("B A", normalizer.Normalize("ba"));
EXPECT_EQ("D E", normalizer.Normalize("c"));
EXPECT_EQ(" F G A", normalizer.Normalize("da"));
EXPECT_EQ(" A F G ", normalizer.Normalize("ad"));
EXPECT_EQ(" A F G B", normalizer.Normalize("adb"));
}
{
spec.set_add_dummy_prefix(true);
spec.set_remove_extra_whitespaces(true);
const Normalizer normalizer(spec);
EXPECT_EQ(" A", normalizer.Normalize("a"));
EXPECT_EQ(" B A", normalizer.Normalize("ba"));
EXPECT_EQ(" D E", normalizer.Normalize("c"));
EXPECT_EQ(" F G A", normalizer.Normalize("da"));
EXPECT_EQ(" A F G", normalizer.Normalize("ad"));
EXPECT_EQ(" A F G B", normalizer.Normalize("adb"));
}
{
spec.set_add_dummy_prefix(true);
spec.set_remove_extra_whitespaces(false);
const Normalizer normalizer(spec);
EXPECT_EQ(" A", normalizer.Normalize("a"));
EXPECT_EQ(" B A", normalizer.Normalize("ba"));
EXPECT_EQ(" D E", normalizer.Normalize("c"));
EXPECT_EQ(" F G A", normalizer.Normalize("da"));
EXPECT_EQ(" A F G ", normalizer.Normalize("ad"));
EXPECT_EQ(" A F G B", normalizer.Normalize("adb"));
}
// Added several corner cases around spaces.
struct SpacePattern {
bool add_dummy_prefix;
bool remove_extra_whitespaces;
bool escape_whitespaces;
const char *input;
const char *expected;
};
constexpr SpacePattern kSpacePatternData[] = {
{false, false, false, WS, WS}, {false, false, true, WS, WS},
{false, true, false, WS, WS}, {false, true, true, WS, ""},
{true, false, false, WS, " " WS}, {true, false, true, WS, WS WS},
{true, true, false, WS, " " WS}, {true, true, true, WS, ""},
{false, false, false, " ", " "}, {false, false, true, " ", WS},
{false, true, false, " ", ""}, {false, true, true, " ", ""},
{true, false, false, " ", " "}, {true, false, true, " ", WS WS},
{true, true, false, " ", ""}, {true, true, true, " ", ""}};
for (const auto &c : kSpacePatternData) {
spec.set_add_dummy_prefix(c.add_dummy_prefix);
spec.set_remove_extra_whitespaces(c.remove_extra_whitespaces);
spec.set_escape_whitespaces(c.escape_whitespaces);
const Normalizer normalizer(spec);
EXPECT_EQ(c.expected, normalizer.Normalize(c.input));
}
}
TEST(NormalizerTest, NormalizeReplacementChar) {
auto spec = MakeDefaultSpec();
spec.set_add_dummy_prefix(false);
const Normalizer normalizer(spec);
EXPECT_EQ("abc" RC "xy", normalizer.Normalize("abc\x80xy"));
EXPECT_EQ("abc" RC, normalizer.Normalize("abc\xc3"));
EXPECT_EQ("ab" RC RC "xy", normalizer.Normalize("ab\xe3\x81xy"));
EXPECT_EQ("a" RC RC RC "xy", normalizer.Normalize("a\xf3\x81\x81xy"));
EXPECT_EQ("ab" RC RC "xy", normalizer.Normalize("ab\xc0\x82xy"));
}
TEST(NormalizerTest, NormalizeFullTest) {
std::vector<size_t> n2i;
std::string output;
auto spec = MakeDefaultSpec();
const Normalizer normalizer(spec);
{
const std::string input = "I saw a girl";
EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl", output);
const std::vector<size_t> expected = {0, 0, 0, // WS (3byte)
0, // I
1, 1, 1, // WS
2, 3, 4, // saw
5, 5, 5, // WS
6, // a
7, 7, 7, // WS
8, 9, 10, 11, // girl
12};
EXPECT_EQ(expected, n2i);
}
{
const std::string input = " I saw a girl ";
EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
LOG(INFO) << output;
EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl", output);
const std::vector<size_t> expected = {1, 1, 1, // WS (3byte)
1, // I
2, 2, 2, // WS
5, 6, 7, // saw
8, 8, 8, // WS
9, // a
10, 10, 10, // WS
17, 18, 19, 20, // girl
21};
EXPECT_EQ(expected, n2i);
}
{
const std::string input = " グーグル "; // halfwidth katakana
EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
EXPECT_EQ(WS "グーグル", output);
const std::vector<size_t> expected = {1, 1, 1, // WS (3byte)
1, 1, 1, // グ
7, 7, 7, // ー
10, 10, 10, // グ
16, 16, 16, // ル
19};
EXPECT_EQ(expected, n2i);
}
{
const std::string input = "①②③";
EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
EXPECT_EQ(WS "123", output);
const std::vector<size_t> expected = {0, 0, 0, // WS (3byte)
0, // 1
3, // 2
6, // 3
9};
EXPECT_EQ(expected, n2i);
}
{
const std::string input = "㍿";
EXPECT_TRUE(normalizer.Normalize(input, &output, &n2i).ok());
EXPECT_EQ(WS "株式会社", output);
const std::vector<size_t> expected = {0, 0, 0, // WS (3byte)
0, 0, 0, // 株
0, 0, 0, // 式
0, 0, 0, // 会
0, 0, 0, // 社
3};
// When "株式" is one piece, this has no alignment to the input.
// Sentencepieces which includes the last character ("会社" or "社")
// have the alignment to the input.
EXPECT_EQ(expected, n2i);
}
}
TEST(NormalizerTest, EncodeDecodePrecompiledCharsMapTest) {
const std::string blob = Normalizer::EncodePrecompiledCharsMap("foo", "bar");
std::string buf;
absl::string_view trie_blob, normalized_blob;
EXPECT_TRUE(Normalizer::DecodePrecompiledCharsMap(blob, &trie_blob,
&normalized_blob, &buf)
.ok());
EXPECT_EQ("foo", trie_blob);
EXPECT_EQ("bar", normalized_blob);
EXPECT_FALSE(Normalizer::DecodePrecompiledCharsMap("", &trie_blob,
&normalized_blob, &buf)
.ok());
}
TEST(NormalizerTest, StatusTest) {
NormalizerSpec spec;
{
const Normalizer normalizer(spec);
EXPECT_TRUE(normalizer.status().ok()); // fallback to identity.
}
{
spec.set_precompiled_charsmap("x");
const Normalizer normalizer(spec);
EXPECT_FALSE(normalizer.status().ok());
}
spec = MakeDefaultSpec();
{
const Normalizer normalizer(spec);
EXPECT_TRUE(normalizer.status().ok());
}
}
TEST(NormalizerTest, PrefixMatcherTest) {
const PrefixMatcher matcher({"abc", "ab", "xy", "京都"});
bool found;
EXPECT_EQ(1, matcher.PrefixMatch("test", &found));
EXPECT_FALSE(found);
EXPECT_EQ(3, matcher.PrefixMatch("abcd", &found));
EXPECT_TRUE(found);
EXPECT_EQ(2, matcher.PrefixMatch("abxy", &found));
EXPECT_TRUE(found);
EXPECT_EQ(1, matcher.PrefixMatch("x", &found));
EXPECT_FALSE(found);
EXPECT_EQ(2, matcher.PrefixMatch("xyz", &found));
EXPECT_TRUE(found);
EXPECT_EQ(6, matcher.PrefixMatch("京都大学", &found));
EXPECT_TRUE(found);
EXPECT_EQ(3, matcher.PrefixMatch("東京大学", &found));
EXPECT_FALSE(found);
EXPECT_EQ("", matcher.GlobalReplace("", ""));
EXPECT_EQ("", matcher.GlobalReplace("abc", ""));
EXPECT_EQ("--de-pqr", matcher.GlobalReplace("xyabcdeabpqr", "-"));
}
TEST(NormalizerTest, PrefixMatcherWithEmptyTest) {
const PrefixMatcher matcher({});
bool found;
EXPECT_EQ(1, matcher.PrefixMatch("test", &found));
EXPECT_FALSE(found);
EXPECT_EQ(1, matcher.PrefixMatch("abcd", &found));
EXPECT_FALSE(found);
EXPECT_EQ(1, matcher.PrefixMatch("abxy", &found));
EXPECT_FALSE(found);
EXPECT_EQ(1, matcher.PrefixMatch("x", &found));
EXPECT_FALSE(found);
EXPECT_EQ(1, matcher.PrefixMatch("xyz", &found));
EXPECT_FALSE(found);
EXPECT_EQ(3, matcher.PrefixMatch("京都大学", &found));
EXPECT_FALSE(found);
EXPECT_EQ(3, matcher.PrefixMatch("東京大学", &found));
EXPECT_FALSE(found);
EXPECT_EQ("", matcher.GlobalReplace("", ""));
EXPECT_EQ("abc", matcher.GlobalReplace("abc", ""));
}
} // namespace normalizer
} // namespace sentencepiece