chromium/chromeos/ash/components/local_search_service/inverted_index_unittest.cc

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chromeos/ash/components/local_search_service/inverted_index.h"

#include <cmath>
#include <string>
#include <unordered_map>
#include <vector>

#include "base/functional/bind.h"
#include "base/strings/utf_string_conversions.h"
#include "base/test/task_environment.h"
#include "chromeos/ash/components/local_search_service/shared_structs.h"
#include "chromeos/ash/components/local_search_service/test_utils.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace ash::local_search_service {

namespace {

std::vector<float> GetScoresFromTfidfResult(
    const std::vector<TfidfResult>& results) {
  std::vector<float> scores;
  for (const auto& item : results) {
    scores.push_back(std::roundf(std::get<2>(item) * 100) / 100.0);
  }
  return scores;
}

constexpr double kDefaultWeight = 1.0;

}  // namespace

class InvertedIndexTest : public ::testing::Test {
 public:
  void SetUp() override {
    // All content weights are initialized to |kDefaultWeight|.
    index_.doc_length_ =
        std::unordered_map<std::string, uint32_t>({{"doc1", 8}, {"doc2", 6}});

    index_.dictionary_[u"A"] = PostingList(
        {{"doc1",
          Posting({WeightedPosition(kDefaultWeight, Position("header", 1, 1)),
                   WeightedPosition(kDefaultWeight, Position("header", 3, 1)),
                   WeightedPosition(kDefaultWeight, Position("body", 5, 1)),
                   WeightedPosition(kDefaultWeight, Position("body", 7, 1))})},
         {"doc2",
          Posting(
              {WeightedPosition(kDefaultWeight, Position("header", 2, 1)),
               WeightedPosition(kDefaultWeight, Position("header", 4, 1))})}});

    index_.dictionary_[u"B"] = PostingList(
        {{"doc1",
          Posting(
              {WeightedPosition(kDefaultWeight, Position("header", 2, 1)),
               WeightedPosition(kDefaultWeight, Position("body", 4, 1)),
               WeightedPosition(kDefaultWeight, Position("header", 6, 1)),
               WeightedPosition(kDefaultWeight, Position("body", 8, 1))})}});

    index_.dictionary_[u"C"] = PostingList(
        {{"doc2",
          Posting(
              {WeightedPosition(kDefaultWeight, Position("header", 1, 1)),
               WeightedPosition(kDefaultWeight, Position("body", 3, 1)),
               WeightedPosition(kDefaultWeight, Position("header", 5, 1)),
               WeightedPosition(kDefaultWeight, Position("body", 7, 1))})}});
    index_.terms_to_be_updated_.insert(u"A");
    index_.terms_to_be_updated_.insert(u"B");
    index_.terms_to_be_updated_.insert(u"C");

    // Manually set |is_index_built_| below because the docs above were not
    // added to the index using the AddOrUpdate method.
    index_.is_index_built_ = false;
  }

  PostingList FindTerm(const std::u16string& term) {
    return index_.FindTerm(term);
  }

  std::vector<Result> FindMatchingDocumentsApproximately(
      const std::unordered_set<std::u16string>& terms,
      double prefix_threshold,
      double block_threshold) {
    return index_.FindMatchingDocumentsApproximately(terms, prefix_threshold,
                                                     block_threshold);
  }

  void AddDocumentsAndCheck(const DocumentToUpdate& documents) {
    bool callback_done = false;
    index_.AddDocuments(
        documents,
        base::BindOnce([](bool* callback_done) { *callback_done = true; },
                       &callback_done));
    Wait();
    ASSERT_TRUE(callback_done);
  }

  void RemoveDocumentsAndCheck(const std::vector<std::string>& doc_ids,
                               uint32_t expect_num_deleted) {
    bool callback_done = false;
    uint32_t num_deleted = 0u;
    index_.RemoveDocuments(doc_ids,
                           base::BindOnce(
                               [](bool* callback_done, uint32_t* num_deleted,
                                  uint32_t num_deleted_callback) {
                                 *callback_done = true;
                                 *num_deleted = num_deleted_callback;
                               },
                               &callback_done, &num_deleted));
    Wait();
    ASSERT_TRUE(callback_done);
    EXPECT_EQ(num_deleted, expect_num_deleted);
  }

  void UpdateDocumentsAndCheck(const DocumentToUpdate& documents,
                               uint32_t expect_num_deleted) {
    bool callback_done = false;
    uint32_t num_deleted = 0u;
    index_.UpdateDocuments(documents,
                           base::BindOnce(
                               [](bool* callback_done, uint32_t* num_deleted,
                                  uint32_t num_deleted_callback) {
                                 *callback_done = true;
                                 *num_deleted = num_deleted_callback;
                               },
                               &callback_done, &num_deleted));
    Wait();
    ASSERT_TRUE(callback_done);
    EXPECT_EQ(num_deleted, expect_num_deleted);
  }

  void ClearInvertedIndexAndCheck() {
    bool callback_done = false;
    index_.ClearInvertedIndex(base::BindOnce(
        [](bool* callback_done) { *callback_done = true; }, &callback_done));
    Wait();
    ASSERT_TRUE(callback_done);
  }

  std::vector<TfidfResult> GetTfidf(const std::u16string& term) {
    return index_.GetTfidf(term);
  }

  bool GetTfidfForDocId(const std::u16string& term,
                        const std::string& docid,
                        float* tfidf,
                        size_t* number_positions) {
    const auto& posting = GetTfidf(term);
    if (posting.empty()) {
      return false;
    }

    for (const auto& tfidf_result : posting) {
      if (std::get<0>(tfidf_result) == docid) {
        *number_positions = std::get<1>(tfidf_result).size();
        *tfidf = std::get<2>(tfidf_result);
        return true;
      }
    }
    return false;
  }

  void BuildInvertedIndexAndCheck() {
    bool callback_done = false;
    index_.BuildInvertedIndex(base::BindOnce(
        [](bool* callback_done) { *callback_done = true; }, &callback_done));
    Wait();
    ASSERT_TRUE(callback_done);
  }

  bool IsInvertedIndexBuilt() { return index_.IsInvertedIndexBuilt(); }

  std::unordered_map<std::u16string, PostingList> GetDictionary() {
    return index_.dictionary_;
  }

  std::unordered_map<std::string, uint32_t> GetDocLength() {
    return index_.doc_length_;
  }

  std::unordered_map<std::u16string, std::vector<TfidfResult>> GetTfidfCache() {
    return index_.tfidf_cache_;
  }

  DocumentToUpdate GetDocumentsToUpdate() {
    return index_.documents_to_update_;
  }

  TermSet GetTermToBeUpdated() { return index_.terms_to_be_updated_; }

  void Wait() { task_environment_.RunUntilIdle(); }

 protected:
  base::test::TaskEnvironment task_environment_{
      base::test::TaskEnvironment::MainThreadType::DEFAULT,
      base::test::TaskEnvironment::ThreadPoolExecutionMode::QUEUED};

 private:
  InvertedIndex index_;
};

TEST_F(InvertedIndexTest, FindTermTest) {
  PostingList result = FindTerm(u"A");
  ASSERT_EQ(result.size(), 2u);
  EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc1"][0].position.start, 1u);
  EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight);
  EXPECT_EQ(result["doc1"][1].position.start, 3u);
  EXPECT_EQ(result["doc1"][2].weight, kDefaultWeight);
  EXPECT_EQ(result["doc1"][2].position.start, 5u);
  EXPECT_EQ(result["doc1"][3].weight, kDefaultWeight);
  EXPECT_EQ(result["doc1"][3].position.start, 7u);

  EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][0].position.start, 2u);
  EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][1].position.start, 4u);
}

TEST_F(InvertedIndexTest, AddNewDocumentTest) {
  const std::u16string a_utf16(u"A");
  const std::u16string d_utf16(u"D");

  AddDocumentsAndCheck({{"doc3",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});

  EXPECT_EQ(GetDocLength()["doc3"], 5u);

  // Find "A"
  PostingList result = FindTerm(a_utf16);
  ASSERT_EQ(result.size(), 3u);
  EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][0].position.start, 1u);
  EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc3"][1].position.start, 2u);
  EXPECT_EQ(result["doc3"][2].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][2].position.start, 4u);

  // Find "D"
  result = FindTerm(d_utf16);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_EQ(result["doc3"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc3"][0].position.start, 3u);
  EXPECT_EQ(result["doc3"][1].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc3"][1].position.start, 5u);

  // Add multiple documents
  AddDocumentsAndCheck({{"doc4",
                         {{u"E",
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {u"F",
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}},
                        {"doc5",
                         {{u"E",
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {u"G",
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});

  // Find "E"
  result = FindTerm(u"E");
  ASSERT_EQ(result.size(), 2u);

  // Find "F"
  result = FindTerm(u"F");
  ASSERT_EQ(result.size(), 1u);
}

TEST_F(InvertedIndexTest, ReplaceDocumentTest) {
  const std::u16string a_utf16(u"A");
  const std::u16string d_utf16(u"D");

  AddDocumentsAndCheck({{"doc1",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 4, {"body", 2, 1}},
                            {kDefaultWeight / 2, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight / 3, {"header", 3, 1}},
                            {kDefaultWeight / 5, {"body", 5, 1}}}}}}});

  EXPECT_EQ(GetDocLength()["doc1"], 5u);
  EXPECT_EQ(GetDocLength()["doc2"], 6u);

  // Find "A"
  PostingList result = FindTerm(a_utf16);
  ASSERT_EQ(result.size(), 2u);
  EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc1"][0].position.start, 1u);
  EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 4);
  EXPECT_EQ(result["doc1"][1].position.start, 2u);
  EXPECT_EQ(result["doc1"][2].weight, kDefaultWeight / 2);
  EXPECT_EQ(result["doc1"][2].position.start, 4u);

  // Find "B"
  result = FindTerm(u"B");
  ASSERT_EQ(result.size(), 0u);

  // Find "D"
  result = FindTerm(d_utf16);
  ASSERT_EQ(result.size(), 1u);
  EXPECT_EQ(result["doc1"][0].weight, kDefaultWeight / 3);
  EXPECT_EQ(result["doc1"][0].position.start, 3u);
  EXPECT_EQ(result["doc1"][1].weight, kDefaultWeight / 5);
  EXPECT_EQ(result["doc1"][1].position.start, 5u);
}

TEST_F(InvertedIndexTest, RemoveDocumentTest) {
  EXPECT_EQ(GetDictionary().size(), 3u);
  EXPECT_EQ(GetDocLength().size(), 2u);

  RemoveDocumentsAndCheck({"doc1"}, 1u);

  EXPECT_EQ(GetDictionary().size(), 2u);
  EXPECT_EQ(GetDocLength().size(), 1u);
  EXPECT_EQ(GetDocLength()["doc2"], 6u);

  // Find "A"
  PostingList result = FindTerm(u"A");
  ASSERT_EQ(result.size(), 1u);
  EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][0].position.start, 2u);
  EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][1].position.start, 4u);

  // Find "B"
  result = FindTerm(u"B");
  ASSERT_EQ(result.size(), 0u);

  // Find "C"
  result = FindTerm(u"C");
  ASSERT_EQ(result.size(), 1u);
  EXPECT_EQ(result["doc2"][0].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][0].position.start, 1u);
  EXPECT_EQ(result["doc2"][1].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][1].position.start, 3u);
  EXPECT_EQ(result["doc2"][2].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][2].position.start, 5u);
  EXPECT_EQ(result["doc2"][3].weight, kDefaultWeight);
  EXPECT_EQ(result["doc2"][3].position.start, 7u);

  // Removes multiple documents, but only "doc2" is actually removed since
  // "doc1" and "doc3" don't exist.
  RemoveDocumentsAndCheck({"doc1", "doc2", "doc3"}, 1u);
  EXPECT_EQ(GetDictionary().size(), 0u);
  EXPECT_EQ(GetDocLength().size(), 0u);
}

TEST_F(InvertedIndexTest, UpdateIndexTest) {
  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Replaces "doc1"
  AddDocumentsAndCheck({{"doc1",
                         {{u"A",
                           {{kDefaultWeight / 2, {"header", 1, 1}},
                            {kDefaultWeight / 4, {"body", 2, 1}},
                            {kDefaultWeight / 2, {"header", 4, 1}}}},
                          {u"D",
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight, {"body", 5, 1}}}}}}});

  EXPECT_FALSE(IsInvertedIndexBuilt());
  BuildInvertedIndexAndCheck();

  EXPECT_EQ(GetTfidfCache().size(), 3u);

  std::vector<TfidfResult> results = GetTfidf(u"A");
  const double expected_tfidf_A_doc1 =
      std::roundf(
          TfIdfScore(
              /*num_docs=*/2,
              /*num_docs_with_term=*/2,
              /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight / 2 +
                  kDefaultWeight / 4 + kDefaultWeight / 2,
              /*doc_length=*/5) *
          100) /
      100;
  const double expected_tfidf_A_doc2 =
      std::roundf(
          TfIdfScore(/*num_docs=*/2,
                     /*num_docs_with_term=*/2,
                     /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
                     /*doc_length=*/6) *
          100) /
      100;
  EXPECT_THAT(GetScoresFromTfidfResult(results),
              testing::UnorderedElementsAre(expected_tfidf_A_doc1,
                                            expected_tfidf_A_doc2));

  results = GetTfidf(u"B");
  EXPECT_THAT(GetScoresFromTfidfResult(results),
              testing::UnorderedElementsAre());

  results = GetTfidf(u"C");
  const double expected_tfidf_C_doc2 =
      std::roundf(
          TfIdfScore(/*num_docs=*/2,
                     /*num_docs_with_term=*/1,
                     /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 4,
                     /*doc_length=*/6) *
          100) /
      100;
  EXPECT_THAT(GetScoresFromTfidfResult(results),
              testing::UnorderedElementsAre(expected_tfidf_C_doc2));

  results = GetTfidf(u"D");
  const double expected_tfidf_D_doc1 =
      std::roundf(
          TfIdfScore(/*num_docs=*/2,
                     /*num_docs_with_term=*/1,
                     /*weighted_num_term_occurrence_in_doc=*/kDefaultWeight * 2,
                     /*doc_length=*/5) *
          100) /
      100;
  EXPECT_THAT(GetScoresFromTfidfResult(results),
              testing::UnorderedElementsAre(expected_tfidf_D_doc1));
}

TEST_F(InvertedIndexTest, UpdateDocumentsTest) {
  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Replaces "doc1" and remove "doc2"
  UpdateDocumentsAndCheck({{"doc1",
                            {{u"A",
                              {{kDefaultWeight / 2, {"header", 1, 1}},
                               {kDefaultWeight / 4, {"body", 2, 1}},
                               {kDefaultWeight / 2, {"header", 4, 1}}}},
                             {u"D",
                              {{kDefaultWeight, {"header", 3, 1}},
                               {kDefaultWeight, {"body", 5, 1}}}}}},
                           {"doc2", {}}},
                          1u);
  BuildInvertedIndexAndCheck();

  EXPECT_EQ(GetTfidfCache().size(), 2u);

  std::vector<TfidfResult> results = GetTfidf(u"C");
  EXPECT_EQ(results.size(), 0u);
}

TEST_F(InvertedIndexTest, ClearInvertedIndexTest) {
  EXPECT_EQ(GetTfidfCache().size(), 0u);
  BuildInvertedIndexAndCheck();
  EXPECT_EQ(GetTfidfCache().size(), 3u);

  // Add a document and clear the index simultaneously.
  const std::u16string a_utf16(u"A");
  const std::u16string d_utf16(u"D");
  AddDocumentsAndCheck({{"doc3",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight / 2, {"body", 2, 1}},
                            {kDefaultWeight, {"header", 4, 1}}}},
                          {d_utf16,
                           {{kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}}}}}}});
  ClearInvertedIndexAndCheck();

  EXPECT_EQ(GetTfidfCache().size(), 0u);
  EXPECT_EQ(GetTermToBeUpdated().size(), 0u);
  EXPECT_EQ(GetDocLength().size(), 0u);
  EXPECT_EQ(GetDictionary().size(), 0u);
  EXPECT_EQ(GetDocumentsToUpdate().size(), 0u);
}

TEST_F(InvertedIndexTest, FindMatchingDocumentsApproximatelyTest) {
  const double prefix_threshold = 1.0;
  const double block_threshold = 1.0;
  const std::u16string a_utf16(u"A");
  const std::u16string b_utf16(u"B");
  const std::u16string c_utf16(u"C");
  const std::u16string d_utf16(u"D");

  // Replace doc1, same occurrences, just different weights.
  AddDocumentsAndCheck({{"doc1",
                         {{a_utf16,
                           {{kDefaultWeight, {"header", 1, 1}},
                            {kDefaultWeight, {"header", 3, 1}},
                            {kDefaultWeight / 2, {"body", 5, 1}},
                            {kDefaultWeight / 2, {"body", 7, 1}}}},
                          {b_utf16,
                           {{kDefaultWeight / 2, {"header", 2, 1}},
                            {kDefaultWeight / 2, {"header", 6, 1}},
                            {kDefaultWeight / 3, {"body", 4, 1}},
                            {kDefaultWeight / 3, {"body", 5, 1}}}}}}});

  {
    // "A" exists in "doc1" and "doc2". The score of each document is simply A's
    // TF-IDF score.
    const std::vector<Result> matching_docs =
        FindMatchingDocumentsApproximately({a_utf16}, prefix_threshold,
                                           block_threshold);

    EXPECT_EQ(matching_docs.size(), 2u);
    std::vector<std::string> expected_docids = {"doc1", "doc2"};
    // TODO(jiameng): for testing, we should  use another independent method to
    // verify scores.
    std::vector<float> actual_scores;
    for (size_t i = 0; i < 2; ++i) {
      const auto& docid = expected_docids[i];
      float expected_score = 0;
      size_t expected_num_pos = 0u;
      EXPECT_TRUE(
          GetTfidfForDocId(a_utf16, docid, &expected_score, &expected_num_pos));
      CheckResult(matching_docs[i], docid, expected_score, expected_num_pos);
      actual_scores.push_back(expected_score);
    }

    // Check scores are non-increasing.
    EXPECT_GE(actual_scores[0], actual_scores[1]);
  }

  {
    // "D" does not exist in the index.
    const std::vector<Result> matching_docs =
        FindMatchingDocumentsApproximately({d_utf16}, prefix_threshold,
                                           block_threshold);

    EXPECT_TRUE(matching_docs.empty());
  }

  {
    // Query is {"A", "B", "C"}, which matches all documents.
    const std::vector<Result> matching_docs =
        FindMatchingDocumentsApproximately({a_utf16, b_utf16, c_utf16},
                                           prefix_threshold, block_threshold);
    EXPECT_EQ(matching_docs.size(), 2u);

    // Ranked documents are {"doc2", "doc1"}.
    // "doc2" score comes from sum of TF-IDF of "A" and "C"
    float expected_score2_A = 0;
    size_t expected_num_pos2_A = 0u;
    EXPECT_TRUE(GetTfidfForDocId(a_utf16, "doc2", &expected_score2_A,
                                 &expected_num_pos2_A));
    float expected_score2_C = 0;
    size_t expected_num_pos2_C = 0u;
    EXPECT_TRUE(GetTfidfForDocId(c_utf16, "doc2", &expected_score2_C,
                                 &expected_num_pos2_C));
    const float expected_score2 = expected_score2_A + expected_score2_C;
    const size_t expected_num_pos2 = expected_num_pos2_A + expected_num_pos2_C;
    CheckResult(matching_docs[0], "doc2", expected_score2, expected_num_pos2);

    // "doc1" score comes from sum of TF-IDF of "A" and "B".
    float expected_score1_A = 0;
    size_t expected_num_pos1_A = 0u;
    EXPECT_TRUE(GetTfidfForDocId(a_utf16, "doc1", &expected_score1_A,
                                 &expected_num_pos1_A));
    float expected_score1_B = 0;
    size_t expected_num_pos1_B = 0u;
    EXPECT_TRUE(GetTfidfForDocId(b_utf16, "doc1", &expected_score1_B,
                                 &expected_num_pos1_B));
    const float expected_score1 = expected_score1_A + expected_score1_B;
    const size_t expected_num_pos1 = expected_num_pos1_B + expected_num_pos1_B;
    CheckResult(matching_docs[1], "doc1", expected_score1, expected_num_pos1);

    EXPECT_GE(expected_score2, expected_score1);
  }
}

}  // namespace ash::local_search_service