chromium/third_party/sentencepiece/src/src/builder_test.cc

// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "builder.h"
#include "absl/strings/str_cat.h"
#include "common.h"
#include "filesystem.h"
#include "normalizer.h"
#include "sentencepiece_trainer.h"
#include "testharness.h"
#include "util.h"

namespace sentencepiece {
namespace normalizer {

// Space symbol: the UTF-8 encoding of U+2581 (LOWER ONE EIGHTH BLOCK).
// Defined as a macro so it can be concatenated with adjacent string
// literals in the expectations below, e.g. WS "ABC".
#define WS "\xe2\x96\x81"

// Feeds four rules to Builder::RemoveRedundantMap() and expects exactly the
// two-character rule "ab => AB" to be dropped while the other three survive.
TEST(BuilderTest, RemoveRedundantMapTest) {
  Builder::CharsMap rules;

  // Single-character rules: a => A, b => B.
  rules[{0x0061}] = {0x0041};
  rules[{0x0062}] = {0x0042};
  // Multi-character rules: ab => AB, abc => CBA.
  rules[{0x0061, 0x0062}] = {0x0041, 0x0042};
  rules[{0x0061, 0x0062, 0x0063}] = {0x0043, 0x0042, 0x0041};

  EXPECT_TRUE(Builder::RemoveRedundantMap(&rules).ok());

  // Only "ab" is expected to be gone afterwards.
  EXPECT_EQ(3, rules.size());
  EXPECT_EQ(0, rules.count({0x0061, 0x0062}));
  EXPECT_EQ(1, rules.count({0x0061}));
  EXPECT_EQ(1, rules.count({0x0062}));
  EXPECT_EQ(1, rules.count({0x0061, 0x0062, 0x0063}));
}

// Builder::GetPrecompiledCharsMap() is expected to return a non-OK status for
// an empty name and for a name that matches no precompiled map.
TEST(BuilderTest, GetPrecompiledCharsMapWithInvalidNameTest) {
  const char* const kInvalidNames[] = {"", "__UNKNOWN__"};

  // The same output buffer is shared across all lookups.
  std::string blob;
  for (const char* name : kInvalidNames) {
    EXPECT_FALSE(Builder::GetPrecompiledCharsMap(name, &blob).ok());
  }
}

// Builder::BuildNFKCMap() is expected to succeed whether or not
// ENABLE_NFKC_COMPILE is defined; the resulting map is only required to be
// non-empty when it is defined.
TEST(BuilderTest, BuildNFKCMapTest) {
  Builder::CharsMap nfkc_map;
  EXPECT_TRUE(Builder::BuildNFKCMap(&nfkc_map).ok());
#ifdef ENABLE_NFKC_COMPILE
  EXPECT_FALSE(nfkc_map.empty());
#endif
}

// Builds a Normalizer from each predefined normalization rule name returned
// by SentencePieceTrainer::GetNormalizerSpec() and checks representative
// inputs against the expected normalized output.
//
// Besides plain ASCII / standard-width inputs, each section also feeds
// compatibility variants (fullwidth Latin, halfwidth katakana) so that the
// NFKC folding itself is exercised rather than only the pass-through path.
TEST(BuilderTest, GetPrecompiledCharsMapTest) {
  // Compatibility-variant inputs are spelled as UTF-8 byte escapes so the
  // exact code points are unambiguous in the source text.
  //
  // U+FF21 U+FF22 U+FF23: fullwidth Latin capital letters A, B, C.
  static constexpr char kFullwidthAbc[] =
      "\xef\xbc\xa1\xef\xbc\xa2\xef\xbc\xa3";
  // U+FF78 U+FF9E U+FF70 U+FF78 U+FF9E U+FF99: halfwidth katakana spelling of
  // the word written "グーグル" in standard-width katakana.
  static constexpr char kHalfwidthKatakana[] =
      "\xef\xbd\xb8\xef\xbe\x9e\xef\xbd\xb0"
      "\xef\xbd\xb8\xef\xbe\x9e\xef\xbe\x99";

  {
    // NFKC without case folding: width variants are folded, case is kept.
    const NormalizerSpec spec =
        SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc");
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC"));
    EXPECT_EQ(WS "ABC", normalizer.Normalize(kFullwidthAbc));
    EXPECT_EQ(WS "(株)", normalizer.Normalize("㈱"));
    EXPECT_EQ(WS "グーグル", normalizer.Normalize("グーグル"));
    EXPECT_EQ(WS "グーグル", normalizer.Normalize(kHalfwidthKatakana));
  }

  {
    // NFKC with case folding: both ASCII and fullwidth capitals become
    // lowercase ASCII.
    const NormalizerSpec spec =
        SentencePieceTrainer::GetNormalizerSpec("nfkc_cf");
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
    EXPECT_EQ(WS "abc", normalizer.Normalize(kFullwidthAbc));
  }

  {
    const NormalizerSpec spec =
        SentencePieceTrainer::GetNormalizerSpec("nmt_nfkc_cf");
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "abc", normalizer.Normalize("ABC"));
    EXPECT_EQ(WS "abc", normalizer.Normalize(kFullwidthAbc));
  }

  {
    // The identity rule carries no precompiled map, so every input is
    // expected to come back unchanged apart from the WS prefix.
    const NormalizerSpec spec =
        SentencePieceTrainer::GetNormalizerSpec("identity");
    EXPECT_TRUE(spec.precompiled_charsmap().empty());
    const Normalizer normalizer(spec);
    EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC"));
    EXPECT_EQ(absl::StrCat(WS, kFullwidthAbc),
              normalizer.Normalize(kFullwidthAbc));
    EXPECT_EQ(WS "㈱", normalizer.Normalize("㈱"));
    EXPECT_EQ(WS "グーグル", normalizer.Normalize("グーグル"));
    EXPECT_EQ(absl::StrCat(WS, kHalfwidthKatakana),
              normalizer.Normalize(kHalfwidthKatakana));
  }
}

// Compiles a hand-built rule set, checks that decompiling the blob yields the
// same rules, and then checks a Normalizer built from the compiled blob (with
// the dummy prefix disabled) against a set of inputs.
TEST(BuilderTest, CompileCharsMap) {
  Builder::CharsMap rules;

  // Lowercase => Uppercase, for every letter 'a'..'z'.
  for (char32 offset = 0; offset < 26; ++offset) {
    const char32 lower = static_cast<char32>('a') + offset;
    const char32 upper = static_cast<char32>('A') + offset;
    rules[{lower}] = {upper};
  }

  // あいう => abc
  rules[{0x3042, 0x3044, 0x3046}] = {0x0061, 0x0062, 0x0063};

  // えお => remove (empty replacement)
  rules[{0x3048, 0x304A}] = {};

  // Compile into the spec's precompiled_charsmap field.
  NormalizerSpec spec;
  EXPECT_TRUE(
      Builder::CompileCharsMap(rules, spec.mutable_precompiled_charsmap())
          .ok());

  // Round-trip: decompiling must give back the original rules.
  Builder::CharsMap round_tripped;
  EXPECT_TRUE(
      Builder::DecompileCharsMap(spec.precompiled_charsmap(), &round_tripped)
          .ok());
  EXPECT_EQ(rules, round_tripped);

  // No dummy prefix, so expectations below carry no leading WS.
  spec.set_add_dummy_prefix(false);
  const Normalizer normalizer(spec);

  // ASCII letters.
  EXPECT_EQ("ABC", normalizer.Normalize("abc"));
  EXPECT_EQ("ABC", normalizer.Normalize("ABC"));
  EXPECT_EQ("XY" WS "Z", normalizer.Normalize("xy z"));

  // Multi-character rules.
  EXPECT_EQ("あ", normalizer.Normalize("あ"));
  EXPECT_EQ("abc", normalizer.Normalize("あいう"));
  EXPECT_EQ("abcえ", normalizer.Normalize("あいうえ"));
  EXPECT_EQ("ABCabcD", normalizer.Normalize("abcあいうd"));
  EXPECT_EQ("abcか", normalizer.Normalize("あいうえおか"));
}

// File name of the chars-map TSV that LoadCharsMapTest loads; it is joined
// onto the directory given by FLAGS_test_srcdir.
static constexpr char kTestInputData[] = "nfkc.tsv";

// Loads the chars map from kTestInputData, then checks two round trips:
// compile -> decompile, and save-to-TSV -> load-from-TSV. Each must reproduce
// the originally loaded map. Every step is an ASSERT, so the test stops at
// the first failing stage.
TEST(BuilderTest, LoadCharsMapTest) {
  const std::string input_path =
      util::JoinPath(absl::GetFlag(FLAGS_test_srcdir), kTestInputData);
  const std::string output_path =
      util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "output.tsv");

  Builder::CharsMap loaded;
  ASSERT_TRUE(Builder::LoadCharsMap(input_path, &loaded).ok());

  std::string precompiled;
  ASSERT_TRUE(Builder::CompileCharsMap(loaded, &precompiled).ok());

  // Round-trip through the compiled representation.
  Builder::CharsMap decompiled;
  ASSERT_TRUE(Builder::DecompileCharsMap(precompiled, &decompiled).ok());
  EXPECT_EQ(loaded, decompiled);

  // Round-trip through a TSV file in the temp directory.
  ASSERT_TRUE(Builder::SaveCharsMap(output_path, loaded).ok());

  Builder::CharsMap reloaded;
  ASSERT_TRUE(Builder::LoadCharsMap(output_path, &reloaded).ok());
  EXPECT_EQ(loaded, reloaded);

#ifdef ENABLE_NFKC_COMPILE
  // When NFKC compilation is enabled, also make sure the freshly built NFKC
  // map can be compiled.
  std::string expected;
  Builder::CharsMap nfkc_map;
  ASSERT_TRUE(Builder::BuildNFKCMap(&nfkc_map).ok());
  ASSERT_TRUE(Builder::CompileCharsMap(nfkc_map, &expected).ok());
#endif
}

// Writes a TSV with one ordinary rule and two rules whose target column is
// missing or empty, then checks that loading yields empty replacements for
// the latter and that the loaded map survives a save/load round trip.
TEST(BuilderTest, LoadCharsMapWithEmptyeTest) {
  const std::string input_path =
      util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "test.tsv");
  const std::string resaved_path =
      util::JoinPath(absl::GetFlag(FLAGS_test_tmpdir), "test_out.tsv");

  {
    // Scoped so the writer is destroyed before the file is read back.
    auto writer = filesystem::NewWritableFile(input_path);
    writer->WriteLine("0061\t0041");
    writer->WriteLine("0062");
    writer->WriteLine("0063\t\t#foo=>bar");
  }

  Builder::CharsMap loaded;
  EXPECT_TRUE(Builder::LoadCharsMap(input_path, &loaded).ok());

  // All three source sequences are present; the last two map to nothing.
  EXPECT_EQ(3, loaded.size());
  EXPECT_EQ(std::vector<char32>({0x0041}), loaded[{0x0061}]);
  EXPECT_EQ(std::vector<char32>({}), loaded[{0x0062}]);
  EXPECT_EQ(std::vector<char32>({}), loaded[{0x0063}]);

  EXPECT_TRUE(Builder::SaveCharsMap(resaved_path, loaded).ok());

  Builder::CharsMap reloaded;
  EXPECT_TRUE(Builder::LoadCharsMap(resaved_path, &reloaded).ok());
  EXPECT_EQ(loaded, reloaded);
}

// Registers 100 keys "a", "aa", "aaa", ... (each a prefix of the next), all
// mapping to "b", and expects Builder::CompileCharsMap() to reject the map.
TEST(BuilderTest, ContainsTooManySharedPrefixTest) {
  constexpr size_t kNumKeys = 100;

  Builder::CharsMap rules;
  std::vector<char32> key;
  // Grow the key by one 'a' per step and register each length.
  while (key.size() < kNumKeys) {
    key.push_back('a');
    rules[key] = {'b'};
  }

  std::string compiled;
  EXPECT_FALSE(Builder::CompileCharsMap(rules, &compiled).ok());
}

}  // namespace normalizer
}  // namespace sentencepiece