chromium/chromeos/ash/components/string_matching/sequence_matcher.cc

// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chromeos/ash/components/string_matching/sequence_matcher.h"

#include <algorithm>
#include <cmath>
#include <queue>

#include "base/check_op.h"

namespace ash::string_matching {

namespace {

using Match = SequenceMatcher::Match;
using Matches = std::vector<Match>;

// The maximum recognized text size if text-length agnosticism is applied.
constexpr int kTextAgnosticismSize = 9;

bool CompareMatches(const Match& m1, const Match& m2) {
  return m1.pos_first_string < m2.pos_first_string;
}

}  // namespace

SequenceMatcher::Match::Match() = default;
SequenceMatcher::Match::Match(int pos_first, int pos_second, int len)
    : pos_first_string(pos_first), pos_second_string(pos_second), length(len) {
  DCHECK_GE(pos_first_string, 0);
  DCHECK_GE(pos_second_string, 0);
  DCHECK_GE(length, 0);
}

SequenceMatcher::SequenceMatcher(const std::u16string& first_string,
                                 const std::u16string& second_string,
                                 double num_matching_blocks_penalty)
    : first_string_(first_string),
      second_string_(second_string),
      num_matching_blocks_penalty_(num_matching_blocks_penalty),
      dp_common_string_(second_string.size() + 1, 0) {
  if (first_string_.empty() && second_string_.empty()) {
    block_matching_ratio_ = 0;
  }

  for (size_t i = 0; i < second_string_.size(); i++) {
    char_to_positions_[second_string_[i]].emplace_back(i);
  }
}

SequenceMatcher::~SequenceMatcher() = default;

// Compute the longest common substring, with optimisations for:
//
// 1) Time: By pre-computing some letter positions (stored in
// `char_to_positions_`.
//
// 2) Memory: Store only the latest row of the DP table (in
// `dp_common_string_`).
//
// 3) Time: Fast-update `dp_common_string_`.
Match SequenceMatcher::FindLongestMatch(int first_start,
                                        int first_end,
                                        int second_start,
                                        int second_end) {
  Match match(first_start, second_start, 0);

  // These two vectors are used for fast updating of `dp_common_string_`.
  // Only erase or update values which are known to have been changed.
  //
  //   `dp_values_to_erase` contains the values which should be erased from
  //     `dp_common_string_`.
  //   `dp_values_to_affect` contains the values which should be updated in
  //     `dp_common_string_`.
  std::vector<std::pair<int, int>> dp_values_to_erase;
  std::vector<std::pair<int, int>> dp_values_to_affect;

  // Outer loop: Iterate through the characters of `first_string`.
  // Keep up-to-date `dp_common_string_` (the latest row of the DP table).
  for (int i = first_start; i < first_end; i++) {
    dp_values_to_affect.clear();

    // Inner loop: Iterate through characters of `second_string`, where those
    // characters are equal to first_string_[i], and within range.
    for (auto j : char_to_positions_[first_string_[i]]) {
      if (j < second_start) {
        continue;
      }
      if (j >= second_end) {
        break;
      }
      // dp_common_string_[j + 1] is the length of the longest common substring
      // ending at first_string_[i] and second_string_[j].
      const int length = dp_common_string_[j] + 1;
      dp_values_to_affect.emplace_back(j + 1, length);

      // Store newly-found longer matches.
      if (length > match.length) {
        match.pos_first_string = i - length + 1;
        match.pos_second_string = j - length + 1;
        match.length = length;
      }
    }
    // Update `dp_common_string_`.
    for (auto const& element : dp_values_to_erase) {
      dp_common_string_[element.first] = 0;
    }
    for (auto const& element : dp_values_to_affect) {
      dp_common_string_[element.first] = element.second;
    }
    std::swap(dp_values_to_erase, dp_values_to_affect);
  }
  // Erase temporary values in preparation for future calls.
  std::fill(dp_common_string_.begin(), dp_common_string_.end(), 0);

  return match;
}

Matches SequenceMatcher::GetMatchingBlocks() {
  if (!matching_blocks_.empty()) {
    return matching_blocks_;
  }

  // This queue contains a tuple of 4 integers that represent 2 substrings to
  // find the longest match in the following order: first_start, first_end,
  // second_start, second_end.
  std::queue<std::tuple<int, int, int, int>> queue_block;
  queue_block.emplace(0, first_string_.size(), 0, second_string_.size());

  // Find all matching blocks recursively. Prioritize longer blocks: Find the
  // longest matching block first, then recurse to the left and right into the
  // remaining as-yet unmatched sections of the two strings.
  while (!queue_block.empty()) {
    int first_start, first_end, second_start, second_end;
    std::tie(first_start, first_end, second_start, second_end) =
        queue_block.front();
    queue_block.pop();

    const Match match =
        FindLongestMatch(first_start, first_end, second_start, second_end);

    if (match.length > 0) {
      // Exclude matching blocks with whitespace only.
      const bool whitespace_only =
          match.length == 1 && first_string_[match.pos_first_string] == u' ';
      if (!whitespace_only) {
        matching_blocks_.push_back(match);
      }

      // Recurse left.
      if (first_start < match.pos_first_string &&
          second_start < match.pos_second_string) {
        queue_block.emplace(first_start, match.pos_first_string, second_start,
                            match.pos_second_string);
      }
      // Recurse right.
      if (match.pos_first_string + match.length < first_end &&
          match.pos_second_string + match.length < second_end) {
        queue_block.emplace(match.pos_first_string + match.length, first_end,
                            match.pos_second_string + match.length, second_end);
      }
    }
  }

  // Always store a final matching block. In case no matching blocks
  // were discovered above, this final matching block serves
  // the purpose of indicating that block matching has taken place.
  matching_blocks_.push_back(
      Match(first_string_.size(), second_string_.size(), 0));
  sort(matching_blocks_.begin(), matching_blocks_.end(), CompareMatches);
  return matching_blocks_;
}

double SequenceMatcher::Ratio(bool text_length_agnostic) {
  // Uses block matching to calculate ratio.
  if (block_matching_ratio_ < 0) {
    int sum_match = 0;
    const int query_size = first_string_.size();
    const int text_size = second_string_.size();

    int sum_length = query_size;
    // Text-length agnosticism is applied for long texts if
    // `text_length_agnostic` is true, but we still keep it not shorter
    // than the query length. Text length agnosticism is ignored if we have a
    // longer query than the text.
    if (text_length_agnostic && query_size < text_size) {
      int max_recognized_text_length =
          std::max(kTextAgnosticismSize, query_size);
      sum_length += std::min(text_size, max_recognized_text_length);
    } else {
      sum_length += text_size;
    }

    DCHECK_NE(sum_length, 0);
    const int num_blocks = GetMatchingBlocks().size();
    for (const auto& match : GetMatchingBlocks()) {
      sum_match += match.length;
    }
    // The last block is always a placeholder "empty" block, so subtract one.
    // And, allow for one "penalty-free" block, so subtract one again. Hence,
    // apply a penalty by using |num_blocks - 2|. Example:
    //
    // If num_blocks = 5, the actual number of matching blocks is 4. This
    // means there are 3 blocks in excess of 1.
    block_matching_ratio_ =
        2.0 * sum_match / sum_length *
        exp(-(num_blocks - 2) * num_matching_blocks_penalty_);
  }
  return block_matching_ratio_;
}

}  //  namespace ash::string_matching