// Copyright 2020 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h" #include <utility> #include "base/no_destructor.h" #include "base/strings/strcat.h" #include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h" #include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h" #include "base/notreached.h" namespace autofill { namespace { // Best practices for writing regular expression snippets: // By wrapping snippets in non-capture groups, i.e. (?: ... ), we ensure that a // pending "?" is interpreted as "optional" instead of a modifier of a previous // operator. E.g. `StrCat({"(?:a+)", "?"})` means an optional sequence of "a" // characters. But `StrCat({"a+", "?"})` means lazily match one or more "a" // characters. Prefer [^\s,] ('not a whitespace or a comma') over \w ('a word // character') in names, when you have concerns about hyphens (e.g. the German // name "Hans-Joachim") because '-' is not matched by \w. // Regular expression pattern of common two-character CJK last names. // Korean names are written in Hangul. // Chinese names are written in their traditional and simplified versions. // Source: // https://en.wikipedia.org/wiki/List_of_Korean_surnames // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93 const char kTwoCharacterCjkLastNamesRe[] = …; // Regular expression pattern for a Hangul (Korean) character. const char kHangulCharacterRe[] = …; // Regular expression pattern for a sequence of Hangul (Korean) characters. const char kHangulCharactersRe[] = …; // Regular expression pattern to match separators as used in CJK names: // Included separators: \u30FB, \u00B7, \u3000 or a simple space. 
const char kCjkNameSeperatorsRe[] = …; // Regular expression pattern for common honorific name prefixes. // The list is incomplete and focused on the English and German language. // Sources: // * https://en.wikipedia.org/wiki/English_honorifics // * https://en.wikipedia.org/wiki/German_honorifics // TODO(crbug.com/1107770): Include more languages and categories. const char kHonorificPrefixRe[] = …; // Regular expression pattern for an optional last name suffix. const char kOptionalLastNameSuffixRe[] = …; // Regular expression pattern for a CJK character. const char kCjkCharacterRe[] = …; // Regular expression pattern for a sequence of CJK characters. const char kCjkCharactersRe[] = …; // Regular expression pattern of common two-character Korean names. // Korean last names are written in Hangul. Note, some last names are ambiguous // in the sense that they share a common prefix with a single-character last // name. Source: https://en.wikipedia.org/wiki/List_of_Korean_surnames const char kTwoCharacterKoreanNamesRe[] = …; // Regular expression pattern to match if a string contains a common // Hispanic/Latinx last name. // It contains the most common names in Spain, Mexico, Cuba, Dominican Republic, // Puerto Rico and Guatemala. // Source: https://en.wikipedia.org/wiki/List_of_common_Spanish_surnames const char kHispanicCommonLastNameCharacteristicsRe[] = …; // Regular expression pattern to match a single word. const char kSingleWordRe[] = …; // Regular expression pattern for multiple lazy words meaning that the // expression avoids matching more than one word if possible. // Words are separated by white spaces but not by newlines or carriage returns. const char kMultipleLazyWordsRe[] = …; // Regular expression pattern to check if a name contains a Hispanic/Latinx // last name conjunction. const char kHispanicLastNameConjunctionCharacteristicsRe[] = …; // Regular expression pattern to match the conjunction used between // Hispanic/Latinx last names. 
const char kHispanicLastNameConjunctionsRe[] = …; // Regular expression pattern to match common prefixes belonging to a (single) // last name. // Source: https://en.wikipedia.org/wiki/List_of_family_name_affixes // According to the source, the list is partial. Changes to the list: // * "De la" and "De le" are added to support the combination of "de" and // "le"/"la" as used in Hispanic/Latinx names. // * The matching of "i" is made lazy to give the last name conjunction // precedence. const char kOptionalLastNamePrefixRe[] = …; // Regular expression to match the affixes that indicate the floor an // apartment is located in. const char kFloorAffixRe[] = …; // Prefix that indicates an apartment number. const char kApartmentNumberPrefix[] = …; // Suffix that indicates an apartment number. const char kApartmentNumberSuffix[] = …; // Regular expression to match the prefixes that indicate a house number. const char kHouseNumberOptionalPrefixRe[] = …; // Regular expression to characterize if a string contains initials by // checking that: // * The string contains only upper case letters that may be preceded by a // point. // * Between each letter, there can be a space or a hyphen. const char kMiddleNameInitialsCharacteristicsRe[] = …; // Returns an expression to parse a CJK name that includes one separator. // The full name is parsed into |NAME_FULL|, the part of the name before the // separator is parsed into |NAME_LAST| and the part after the separator is // parsed into |NAME_FIRST|. std::string ParseSeparatedCJkNameExpression() { … } // Returns an expression to parse a CJK name that starts with a known // two-character last name. std::string ParseCommonCjkTwoCharacterLastNameExpression() { … } // Returns an expression to parse a CJK name without a separator. // The full name is parsed into |NAME_FULL|, the first character is parsed // into |NAME_LAST| and the rest into |NAME_FIRST|. 
std::string ParseCjkSingleCharacterLastNameExpression() { … } // Returns an expression to parse a Korean name that contains at least 4 // characters with a common Korean two-character last name. The full name is // parsed into |NAME_FULL|, the first two characters into |NAME_LAST| and the // rest into |NAME_FIRST|. std::string ParseKoreanTwoCharacterLastNameExpression() { … } // Returns an expression to determine if a name has the characteristics of a // CJK name. std::string MatchCjkNameExpression() { … } // Returns an expression to parse a full name that contains only a last name. std::string ParseOnlyLastNameExpression() { … } // Returns an expression to parse a name that consists of a first, middle and // last name with an optional honorific prefix. The full name is parsed into // |NAME_FULL|. The name can start with an honorific prefix that is ignored. // The last token is parsed into |NAME_LAST|. // This token may be preceded by a last name prefix like "Mac" or // "von" that is included in |NAME_LAST|. If the string contains any // remaining tokens, the first token is parsed into // |NAME_FIRST| and all remaining tokens into |NAME_MIDDLE|. std::string ParseFirstMiddleLastNameExpression() { … } // Returns an expression to parse a name that starts with the last name, // followed by a comma, and then the first and middle names. // The full name is parsed into |NAME_FULL|. The name can start with an optional // honorific prefix that is ignored, followed by a single // token that is parsed into |NAME_LAST|. The |NAME_LAST| must be followed by a // comma with optional spaces. The next token is parsed into |NAME_FIRST| and // all remaining tokens are parsed into |NAME_MIDDLE|. std::string ParseLastCommaFirstMiddleExpression() { … } // Returns an expression to parse a Hispanic/Latinx last name. // The last name can consist of two parts with an optional conjunction. 
// The full last name is parsed into |NAME_LAST|, the first part into // |NAME_LAST_FIRST|, the conjunction into |NAME_LAST_CONJUNCTION|, and the // second part into |NAME_LAST_SECOND|. // Each last name part consists of a space-separated token with an optional // prefix like "de le". If only one last name part is found, it is parsed into // |NAME_LAST_SECOND|. std::string ParseHispanicLastNameExpression() { … } // Returns an expression to parse a full Hispanic/Latinx name that // contains an optional honorific prefix which is ignored, a first name, and a // last name as specified by |ParseHispanicLastNameExpression()|. std::string ParseHispanicFullNameExpression() { … } // Returns an expression that parses the whole |NAME_LAST| into // |NAME_LAST_SECOND|. std::string ParseLastNameIntoSecondLastNameExpression() { … } // Returns an expression to parse a street address into the street name, the // house number and the subpremise. The latter is parsed into the floor and // apartment number. The expression is applicable, if the street name comes // before the house number, followed by the floor and the apartment. // Both the floor and the apartment must be indicated by a prefix. // Example: Erika-Mann-Str. 44, Floor 2, Apartment 12 std::string ParseStreetNameHouseNumberExpression() { … } // Returns an expression to parse a street address into the street name, the // house number and the subpremise. The latter is parsed into the floor and // apartment number. The expression is applicable, if the street name comes // before the house number, followed by the floor and the apartment. // Both the floor and the apartment must be indicated by a suffix. // Example: Calla 1, 2º, 3ª // Where 2 is the floor and 3 the apartment number. std::string ParseStreetNameHouseNumberSuffixedFloorAndAppartmentExpression() { … } // Returns an expression to parse a street address into the street name, the // house number and the subpremise. 
The latter is parsed into the floor and // apartment number. The expression is applicable, if the street name comes // before the house number, followed by the floor which is indicated by a suffix // and the apartment. // Example: Av. Paulista, 1098, 1º andar, apto. 101 std::string ParseStreetNameHouseNumberExpressionSuffixedFloor() { … } // Returns an expression to parse a street address into the street name, the // house number and the subpremise. The latter is parsed into the floor and // apartment number. The expression is applicable, if the house number comes // before the street name, followed by the floor and the apartment. // Both the floor and the apartment must be indicated by a prefix. // Example: 1600 Main Avenue, Floor 2, Apartment 12 std::string ParseHouseNumberStreetNameExpression() { … } } // namespace StructuredAddressesRegExProvider::StructuredAddressesRegExProvider() = default; // static StructuredAddressesRegExProvider* StructuredAddressesRegExProvider::Instance() { … } std::string StructuredAddressesRegExProvider::GetPattern( RegEx expression_identifier, const std::string& country_code) { … } const RE2* StructuredAddressesRegExProvider::GetRegEx( RegEx expression_identifier, const std::string& country_code) { … } } // namespace autofill