// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
#define CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
#include <string>
#include <unordered_map>
#include <vector>
namespace ash::string_matching {
namespace {
constexpr double kNumMatchingBlocksPenalty = 0.1;
constexpr bool kUseTextLengthAgnosticism = false;
} // namespace
// Performs the calculation of similarity level between 2 strings. This class's
// functionality is inspired by python's difflib.SequenceMatcher library.
// (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher)
//
// TODO(crbug.com/1336160): The unused edit distance pathway has been removed.
// Consider coming up with a new edit-distance method having the awareness of
// string structures. (e.g., awareness of block matching and token break
// locations).
class SequenceMatcher {
public:
// Representing a common substring between `first_string_` and
// `second_string_`.
struct Match {
Match();
Match(int pos_first, int pos_second, int len);
// Starting position of the common substring in `first_string_`.
int pos_first_string;
// Starting position of the common substring in `second_string_`.
int pos_second_string;
// Length of the common substring.
int length;
};
// `num_matching_blocks_penalty` is used to penalize too many small matching
// blocks. For the same number of matching characters, we prefer fewer
// matching blocks. Value equal to 0 means no penalty. Values greater than 0
// means heavier penalty will be applied to larger number of blocks.
SequenceMatcher(
const std::u16string& first_string,
const std::u16string& second_string,
double num_matching_blocks_penalty = kNumMatchingBlocksPenalty);
SequenceMatcher(const SequenceMatcher&) = delete;
SequenceMatcher& operator=(const SequenceMatcher&) = delete;
~SequenceMatcher();
// Calculates similarity ratio of `first_string_` and `second_string_`.
//
// In the actual string matching in launcher searches, we will input with the
// query as `first_string_` and the text as `second_string_`. As the query is
// likely to be shorter than the text in most cases, we would like to
// ignore/lower the influence of the amounts of any remaining unmatched
// portions of the `second_string_` onto the ratio (i.e., "text-length
// agnosticism").
//
// Thus, We will trim the text length if it is too long and
// `text_length_agnostic` is true, and it only works for the block
// matching algorithm.
double Ratio(bool text_length_agnostic = kUseTextLengthAgnosticism);
// Finds the longest common substring between
// `first_string_[first_start:first_end]` and
// `second_string_[second_start:second_end]`. Used by
// GetMatchingBlocks().
Match FindLongestMatch(int first_start,
int first_end,
int second_start,
int second_end);
// Get all matching blocks of `first_string_` and `second_string_`.
// All blocks will be sorted by their starting position within
// `first_string_`.
//
// The last matching block will always be:
//
// Match(first_string_.size(), second_string_.size(), 0).
//
// This is to cover the case where two strings have no matching blocks,
// so that we have something to store in `matching_blocks_` to indicate
// that matching has taken place.
std::vector<Match> GetMatchingBlocks();
private:
std::u16string first_string_;
std::u16string second_string_;
double num_matching_blocks_penalty_;
double block_matching_ratio_ = -1.0;
std::vector<Match> matching_blocks_;
// For each character `c` in `second_string_`, this variable
// `char_to_positions_` stores all positions where `c` occurs in
// `second_string_`.
std::unordered_map<char, std::vector<int>> char_to_positions_;
// Memory for dynamic programming algorithm used in FindLongestMatch().
std::vector<int> dp_common_string_;
};
} // namespace ash::string_matching
#endif // CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_