// in_memory_url_index_types.h

// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_
#define COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_

#include <stddef.h>

#include <map>
#include <string>
#include <unordered_map>
#include <vector>

#include "base/containers/flat_set.h"
#include "components/history/core/browser/history_types.h"
#include "url/gurl.h"

// Convenience Types -----------------------------------------------------------

String16Vector;
String16Set;
Char16Set;
Char16Vector;

// A vector that contains the offsets at which each word starts within a string.
WordStarts;

// Matches within URL and Title Strings ----------------------------------------

// Specifies where an omnibox term occurs within a string. Used for specifying
// highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
// scoring a result.
struct TermMatch { … };
TermMatches;

// Returns the joined `TermMatches` of each term in `terms`, i.e. the matches
// found for every individual term combined into a single result. See
// `MatchTermInString()` for how a single term is matched.
// NOTE(review): presumably each term is matched against `cleaned_string` and
// tagged with its index in `terms` as the `term_num` -- confirm against the
// implementation.
TermMatches MatchTermsInString(const String16Vector& terms,
                               const std::u16string& cleaned_string);

// Returns a `TermMatches` which has an entry for each occurrence of the string
// `term` found in the string `cleaned_string`. Use `CleanUpUrlForMatching()` or
// `CleanUpTitleForMatching()` to produce `cleaned_string` before passing it to
// this function. The function marks each match with `term_num` so that the
// resulting `TermMatches` can be merged with the `TermMatches` of other terms.
// Note that only the first 2,048 characters of `cleaned_string` are considered
// during the match operation.
TermMatches MatchTermInString(const std::u16string& term,
                              const std::u16string& cleaned_string,
                              int term_num);

// Sorts `matches` by offset and returns the result. `matches` is taken by
// const reference; the sorted sequence is handed back as the return value.
TermMatches SortMatches(const TermMatches& matches);

// Removes overlapping substring matches from `sorted_matches` and returns the
// cleaned up matches. Assumes `sorted_matches` is already sorted.
// NOTE(review): presumably "sorted" means ordered by offset, as produced by
// `SortMatches()` -- confirm against the implementation.
TermMatches DeoverlapMatches(const TermMatches& sorted_matches);

// Extracts and returns the offsets from `matches`. This includes both the
// offsets corresponding to the beginning of a match and the offsets
// corresponding to the end of a match (i.e., offset + length for that match).
std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches);

// Replaces the offsets and lengths in `matches` with those given in `offsets`.
// `offsets` gives the beginning and ending offsets for each match; this
// function translates each (beginning, ending) offset pair into (beginning
// offset, length of match). It deletes any matches for which an endpoint is
// npos and returns the updated list of matches.
// NOTE(review): `offsets` is presumably laid out the way
// `OffsetsFromTermMatches()` produces it -- confirm against the
// implementation.
TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
                                        const std::vector<size_t>& offsets);

// Utility Functions -----------------------------------------------------------

// Breaks the string `cleaned_uni_string` down into individual words and
// returns them as a `String16Set`. Use `CleanUpUrlForMatching()` or
// `CleanUpTitleForMatching()` before passing `cleaned_uni_string` to this
// function. If `word_starts` is not null, it is cleared and then filled with
// the offsets within `cleaned_uni_string` at which each word starts. These
// offsets are collected only up to the first `kMaxSignificantChars` of
// `cleaned_uni_string`.
String16Set String16SetFromString16(const std::u16string& cleaned_uni_string,
                                    WordStarts* word_starts);

// Breaks the `cleaned_uni_string` string down into individual words and returns
// a vector with the individual words in their original order. Use
// `CleanUpUrlForMatching()` or `CleanUpTitleForMatching()` before passing
// `cleaned_uni_string` to this function. The string is broken using
// `BreakIterator`'s `BREAK_WORD` detection logic, augmented so that it
// additionally breaks words at underscores. The resulting list will contain
// only words containing alpha-numeric characters. If `word_starts` is not
// null, it is cleared and then filled with the word starts.
//
// Example:
//   Given: `cleaned_uni_string`: "http://www.google.com/ harry the_rabbit."
//   Returns: "http", "www", "google", "com", "harry", "the", "rabbit"
String16Vector String16VectorFromString16(
    const std::u16string& cleaned_uni_string,
    WordStarts* word_starts);

// Breaks the `uni_word` string down into its individual characters and returns
// them as a `Char16Set`.
// Note that, for now, this is intended to work on a single word, but it
// _will_ work on a string of words, perhaps with unexpected results.
// TODO(mrossetti): Lots of optimizations possible here for not restarting
// a search if the user is just typing along. Also, change this to uniString
// and properly handle substring matches, scoring and sorting the results
// by score. Also, provide the metrics for where the matches occur so that
// the UI can highlight the matched sections.
Char16Set Char16SetFromString16(const std::u16string& uni_word);

// Support for InMemoryURLIndex Private Data -----------------------------------

// An index into a list of all of the words we have indexed.
WordID;

// A map allowing a WordID to be determined given a word.
WordMap;

// A map from character to the word_ids of words containing that character.
WordIDSet;  // An index into the WordList.
CharWordIDMap;

// A map from word (by word_id) to history items containing that word.
HistoryID;
HistoryIDSet;
HistoryIDVector;
WordIDHistoryMap;
HistoryIDWordMap;

// Information used in scoring a particular URL.
VisitInfoVector;
struct HistoryInfoMapValue { … };

// A map from history_id to the history's URL and title.
HistoryInfoMap;

// A map from history_id to URL and page title word start metrics.
struct RowWordStarts { … };
WordStartsMap;

#endif  // COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_
// chromium/components/omnibox/browser/in_memory_url_index_types.h