chromium/chromeos/ash/components/string_matching/sequence_matcher.h

// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
#define CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_

#include <string>
#include <unordered_map>
#include <vector>

namespace ash::string_matching {

namespace {

constexpr double kNumMatchingBlocksPenalty = 0.1;
constexpr bool kUseTextLengthAgnosticism = false;

}  // namespace

// Performs the calculation of similarity level between 2 strings. This class's
// functionality is inspired by python's difflib.SequenceMatcher library.
// (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher)
//
// TODO(crbug.com/1336160): The unused edit distance pathway has been removed.
// Consider coming up with a new edit-distance method having the awareness of
// string structures. (e.g., awareness of block matching and token break
// locations).
class SequenceMatcher {
 public:
  // Representing a common substring between `first_string_` and
  // `second_string_`.
  struct Match {
    Match();
    Match(int pos_first, int pos_second, int len);
    // Starting position of the common substring in `first_string_`.
    int pos_first_string;
    // Starting position of the common substring in `second_string_`.
    int pos_second_string;
    // Length of the common substring.
    int length;
  };

  // `num_matching_blocks_penalty` is used to penalize too many small matching
  // blocks. For the same number of matching characters, we prefer fewer
  // matching blocks. Value equal to 0 means no penalty. Values greater than 0
  // means heavier penalty will be applied to larger number of blocks.
  SequenceMatcher(
      const std::u16string& first_string,
      const std::u16string& second_string,
      double num_matching_blocks_penalty = kNumMatchingBlocksPenalty);

  SequenceMatcher(const SequenceMatcher&) = delete;
  SequenceMatcher& operator=(const SequenceMatcher&) = delete;

  ~SequenceMatcher();

  // Calculates similarity ratio of `first_string_` and `second_string_`.
  //
  // In the actual string matching in launcher searches, we will input with the
  // query as `first_string_` and the text as `second_string_`. As the query is
  // likely to be shorter than the text in most cases, we would like to
  // ignore/lower the influence of the amounts of any remaining unmatched
  // portions of the `second_string_` onto the ratio (i.e., "text-length
  // agnosticism").
  //
  // Thus, We will trim the text length if it is too long and
  // `text_length_agnostic` is true, and it only works for the block
  // matching algorithm.
  double Ratio(bool text_length_agnostic = kUseTextLengthAgnosticism);
  // Finds the longest common substring between
  // `first_string_[first_start:first_end]` and
  // `second_string_[second_start:second_end]`. Used by
  // GetMatchingBlocks().
  Match FindLongestMatch(int first_start,
                         int first_end,
                         int second_start,
                         int second_end);
  // Get all matching blocks of `first_string_` and `second_string_`.
  // All blocks will be sorted by their starting position within
  // `first_string_`.
  //
  // The last matching block will always be:
  //
  //   Match(first_string_.size(), second_string_.size(), 0).
  //
  // This is to cover the case where two strings have no matching blocks,
  // so that we have something to store in `matching_blocks_` to indicate
  // that matching has taken place.
  std::vector<Match> GetMatchingBlocks();

 private:
  std::u16string first_string_;
  std::u16string second_string_;
  double num_matching_blocks_penalty_;
  double block_matching_ratio_ = -1.0;
  std::vector<Match> matching_blocks_;

  // For each character `c` in `second_string_`, this variable
  // `char_to_positions_` stores all positions where `c` occurs in
  // `second_string_`.
  std::unordered_map<char, std::vector<int>> char_to_positions_;
  // Memory for dynamic programming algorithm used in FindLongestMatch().
  std::vector<int> dp_common_string_;
};

}  // namespace ash::string_matching

#endif  // CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_