// Copyright 2017 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/autofill/content/renderer/html_based_username_detector.h" #include <algorithm> #include <array> #include <string> #include <utility> #include <vector> #include "base/containers/contains.h" #include "base/containers/flat_set.h" #include "base/containers/span.h" #include "base/i18n/case_conversion.h" #include "base/strings/string_split.h" #include "base/strings/utf_string_conversions.h" #include "components/autofill/content/renderer/form_autofill_util.h" #include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h" #include "components/autofill/core/common/form_data.h" #include "third_party/blink/public/web/web_form_element.h" WebFormControlElement; WebFormElement; WebInputElement; namespace autofill { namespace { // List of separators that can appear in HTML attribute values. constexpr char16_t kDelimiters[] = …; // Minimum length of a word for it not to be considered a short word. Short // words will not be searched for in attribute values (especially after // delimiter removal), because a short word may be a part of another word. A // short word should be enclosed between delimiters; otherwise an occurrence // doesn't count. constexpr int kMinimumWordLength = …; // For each input element that can be a username, developer and user group // values are computed. The user group value includes what a user sees: label, // placeholder, aria-label (all are stored in FormFieldData.label). The // developer group value consists of name and id attribute values. // For each group the set of short tokens (tokens shorter than // |kMinimumWordLength|) is computed as well. struct UsernameFieldData { … }; // Words that the algorithm looks for are split into multiple categories based // on feature reliability. // A category may contain a latin dictionary and a non-latin dictionary. 
It is // mandatory that it has a latin one, but a non-latin might be missing. // "Latin" translations are the translations of the words for which the // original translation is similar to the romanized translation (translation of // the word only using ISO basic Latin alphabet). // "Non-latin" translations are the translations of the words that have custom, // country specific characters. struct CategoryOfWords { … }; // 1. Removes delimiters from |raw_value| and appends the remainder to // |*field_data_value|. A sentinel symbol is added first if |*field_data_value| // is not empty. // 2. Tokenizes and appends short tokens (shorter than |kMinimumWordLength|) // from |raw_value| to |*field_data_short_tokens|, if any. void AppendValueAndShortTokens( const std::u16string& raw_value, std::u16string* field_data_value, base::flat_set<std::u16string>* field_data_short_tokens) { … } // For the given |field|, computes the developer and user values, along with // the sets of short tokens, and returns them. UsernameFieldData ComputeUsernameFieldData(const FormFieldData& field) { … } // NOTE(review): this function is undocumented. Judging by the signature only, // it writes its result for |form_data| into |*possible_usernames_data| — // confirm against the body. void InferUsernameFieldData( const FormData& form_data, std::vector<UsernameFieldData>* possible_usernames_data) { … } // Check if any word from |dictionary| is encountered in computed field // information (i.e. |value|, |short_tokens|). bool CheckFieldWithDictionary( const std::u16string& value, const base::flat_set<std::u16string>& short_tokens, base::span<const std::u16string_view> dictionary) { … } // Check if any word from |category| is encountered in computed field // information (|possible_username|). bool ContainsWordFromCategory(const UsernameFieldData& possible_username, const CategoryOfWords& category) { … } // Remove from |possible_usernames_data| the elements that definitely cannot be // usernames, because their computed values contain at least one negative word. 
void RemoveFieldsWithNegativeWords( std::vector<UsernameFieldData>* possible_usernames_data) { … } // Check if any word from the given category (|category|) appears in fields from // the form (|possible_usernames_data|). If the category words appear in more // than 2 fields, do nothing, because it may just be a prefix. If the words // appear in 1 or 2 fields, the first field is added to |username_predictions|. void FindWordsFromCategoryInForm( const std::vector<UsernameFieldData>& possible_usernames_data, const CategoryOfWords& category, std::vector<FieldRendererId>* username_predictions) { … } // Find username elements if there is no cached result for the given form and // add them to |username_predictions| in the order of decreasing reliability. void FindUsernameFieldInternal( const FormData& form_data, std::vector<FieldRendererId>* username_predictions) { … } } // namespace const std::vector<FieldRendererId>& GetPredictionsFieldBasedOnHtmlAttributes( const FormData& form_data, UsernameDetectorCache* username_detector_cache) { … } } // namespace autofill