// html_based_username_detector.cc

// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/content/renderer/html_based_username_detector.h"

#include <algorithm>
#include <array>
#include <string>
#include <utility>
#include <vector>

#include "base/containers/contains.h"
#include "base/containers/flat_set.h"
#include "base/containers/span.h"
#include "base/i18n/case_conversion.h"
#include "base/strings/string_split.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/content/renderer/form_autofill_util.h"
#include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
#include "components/autofill/core/common/form_data.h"
#include "third_party/blink/public/web/web_form_element.h"

WebFormControlElement;
WebFormElement;
WebInputElement;

namespace autofill {

namespace {

// Separator characters that can appear in HTML attribute values.
constexpr char16_t kDelimiters[] = …;

// Minimum length a word must have in order not to be considered a short word.
// Short words are not searched for inside attribute values (especially after
// delimiters have been removed), because a short word may merely be a part of
// another word. A short word must be enclosed between delimiters; otherwise an
// occurrence does not count.
constexpr int kMinimumWordLength = …;

// Per-field data used by the detector. For each input element that can be a
// username, a developer group value and a user group value are computed:
//  - The user group value includes what a user sees: label, placeholder,
//    aria-label (all are stored in FormFieldData.label).
//  - The developer group value consists of the name and id attribute values.
// For each group, the set of short tokens (tokens shorter than
// |kMinimumWordLength|) is computed as well.
struct UsernameFieldData { … };

// The words that the algorithm looks for are split into multiple categories
// based on feature reliability.
// A category may contain a latin dictionary and a non-latin dictionary. The
// latin dictionary is mandatory; the non-latin one may be missing.
// "Latin" translations are the translations of the words for which the
// original translation is similar to the romanized translation (the
// translation of the word using only the ISO basic Latin alphabet).
// "Non-latin" translations are the translations of the words that have custom,
// country-specific characters.
struct CategoryOfWords { … };

// Processes one raw attribute value in two steps:
// 1. Removes delimiters from |raw_value| and appends the remainder to
//    |*field_data_value|. If |*field_data_value| is not empty, a sentinel
//    symbol is added first.
// 2. Tokenizes |raw_value| and appends its short tokens (those shorter than
//    |kMinimumWordLength|), if any, to |*field_data_short_tokens|.
void AppendValueAndShortTokens(
    const std::u16string& raw_value,
    std::u16string* field_data_value,
    base::flat_set<std::u16string>* field_data_short_tokens) { … }

// For the given |field|, computes the developer and user values, along with
// their sets of short tokens, and returns them as a UsernameFieldData.
UsernameFieldData ComputeUsernameFieldData(const FormFieldData& field) { … }

// NOTE(review): presumably fills |*possible_usernames_data| with the
// UsernameFieldData of every field in |form_data| that can be a username;
// the body is not visible here, so confirm before relying on this.
void InferUsernameFieldData(
    const FormData& form_data,
    std::vector<UsernameFieldData>* possible_usernames_data) { … }

// Checks whether any word from |dictionary| is encountered in the computed
// field information (i.e. |value| and |short_tokens|).
bool CheckFieldWithDictionary(
    const std::u16string& value,
    const base::flat_set<std::u16string>& short_tokens,
    base::span<const std::u16string_view> dictionary) { … }

// Checks whether any word from |category| is encountered in the computed
// field information (|possible_username|).
bool ContainsWordFromCategory(const UsernameFieldData& possible_username,
                              const CategoryOfWords& category) { … }

// Removes from |*possible_usernames_data| the elements that definitely cannot
// be usernames, because their computed values contain at least one negative
// word.
void RemoveFieldsWithNegativeWords(
    std::vector<UsernameFieldData>* possible_usernames_data) { … }

// Checks whether any word from the given category (|category|) appears in the
// fields of the form (|possible_usernames_data|). If the category words appear
// in more than 2 fields, does nothing, because the match may just be a prefix.
// If the words appear in 1 or 2 fields, the first such field is added to
// |*username_predictions|.
void FindWordsFromCategoryInForm(
    const std::vector<UsernameFieldData>& possible_usernames_data,
    const CategoryOfWords& category,
    std::vector<FieldRendererId>* username_predictions) { … }

// Finds username elements in |form_data| and adds them to
// |*username_predictions| in order of decreasing reliability.
// NOTE(review): this function takes no cache argument; presumably it is only
// invoked when there is no cached result for the given form — confirm against
// the caller.
void FindUsernameFieldInternal(
    const FormData& form_data,
    std::vector<FieldRendererId>* username_predictions) { … }

}  // namespace

// Public entry point of the detector: returns the username field predictions
// for |form_data| as a list of FieldRendererId.
// NOTE(review): the result is returned by const reference, so it presumably
// refers to an entry owned by |*username_detector_cache| rather than to a
// local; the body is not visible here — confirm the lifetime before holding
// on to the reference.
const std::vector<FieldRendererId>& GetPredictionsFieldBasedOnHtmlAttributes(
    const FormData& form_data,
    UsernameDetectorCache* username_detector_cache) { … }

}  // namespace autofill
// chromium/components/autofill/content/renderer/html_based_username_detector.cc