autofill_structured_address_regex_provider.cc

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.h"

#include <utility>

#include "base/no_destructor.h"
#include "base/strings/strcat.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_constants.h"
#include "components/autofill/core/browser/data_model/autofill_structured_address_utils.h"

#include "base/notreached.h"

namespace autofill {

namespace {

// Best practices for writing regular expression snippets:
// By wrapping snippets in non-capture groups, i.e. (?: ... ), we ensure that a
// pending "?" is interpreted as "optional" instead of a modifier of a previous
// operator. E.g. `StrCat({"(?:a+)", "?"})` means an optional sequence of "a"
// characters. But `StrCat({"a+", "?"})` means lazily match one or more "a"
// characters. Prefer [^\s,] ('not a whitespace or a comma') over \w ('a word
// character') in names, when you have concerns about hyphens (e.g. the German
// name "Hans-Joachim") because '-' is not matched by \w.

// Regular expression pattern of common two-character CJK last names.
// Korean names are written in Hangul.
// Chinese names are written in their traditional and simplified versions.
// Source:
// https://en.wikipedia.org/wiki/List_of_Korean_surnames
// https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
const char kTwoCharacterCjkLastNamesRe[] = …;

// Regular expression pattern for a Hangul (Korean) character.
const char kHangulCharacterRe[] = …;

// Regular expression pattern for a sequence of Hangul (Korean) characters.
const char kHangulCharactersRe[] = …;

// Regular expression pattern to match separators as used in CJK names:
// Included separators: \u30FB, \u00B7, \u3000 or a simple space.
const char kCjkNameSeperatorsRe[] = …;

// Regular expression pattern for common honorific name prefixes.
// The list is incomplete and focused on the English and German language.
// Sources:
// * https://en.wikipedia.org/wiki/English_honorifics
// * https://en.wikipedia.org/wiki/German_honorifics
// TODO(crbug.com/1107770): Include more languages and categories.
const char kHonorificPrefixRe[] = …;

// Regular expression pattern for an optional last name suffix.
const char kOptionalLastNameSuffixRe[] = …;

// Regular expression pattern for a CJK character.
const char kCjkCharacterRe[] = …;

// Regular expression pattern for a sequence of CJK characters.
const char kCjkCharactersRe[] = …;

// Regular expression pattern of common two-character Korean last names.
// Korean last names are written in Hangul. Note, some last names are ambiguous
// in the sense that they share a common prefix with a single-character last
// name. Source: https://en.wikipedia.org/wiki/List_of_Korean_surnames
const char kTwoCharacterKoreanNamesRe[] = …;

// Regular expression pattern to match if a string contains a common
// Hispanic/Latinx last name.
// It contains the most common names in Spain, Mexico, Cuba, Dominican Republic,
// Puerto Rico and Guatemala.
// Source: https://en.wikipedia.org/wiki/List_of_common_Spanish_surnames
const char kHispanicCommonLastNameCharacteristicsRe[] = …;

// Regular expression pattern to match a single word.
const char kSingleWordRe[] = …;

// Regular expression pattern for multiple lazy words, meaning that the
// expression avoids matching more than one word if possible.
// Words are separated by white spaces but not by newlines or carriage returns.
const char kMultipleLazyWordsRe[] = …;

// Regular expression pattern to check if a name contains a Hispanic/Latinx
// last name conjunction.
const char kHispanicLastNameConjunctionCharacteristicsRe[] = …;

// Regular expression pattern to match the conjunction used between
// Hispanic/Latinx last names.
const char kHispanicLastNameConjunctionsRe[] = …;

// Regular expression pattern to match common prefixes belonging to a (single)
// last name.
// Source: https://en.wikipedia.org/wiki/List_of_family_name_affixes
// According to the source, the list is partial. Changes to the list:
// * "De la" and "De le" are added to support the combination of "de" and
// "le"/"la" as used in Hispanic/Latinx names.
// * The matching of "i" is made lazy to give the last name conjunction
// precedence.
const char kOptionalLastNamePrefixRe[] = …;

// Regular expression to match the affixes that indicate the floor an
// apartment is located in.
const char kFloorAffixRe[] = …;

// Prefix that indicates an apartment number.
const char kApartmentNumberPrefix[] = …;

// Suffix that indicates an apartment number.
const char kApartmentNumberSuffix[] = …;

// Regular expression to match the prefixes that indicate a house number.
const char kHouseNumberOptionalPrefixRe[] = …;

// Regular expression to characterize if a string contains initials by
// checking that:
// * The string contains only upper case letters that may be preceded by a
// point.
// * Between each letter, there can be a space or a hyphen.
const char kMiddleNameInitialsCharacteristicsRe[] = …;

// Returns an expression to parse a CJK name that includes one separator.
// The full name is parsed into |NAME_FULL|, the part of the name before the
// separator is parsed into |NAME_LAST| and the part after the separator is
// parsed into |NAME_FIRST|.
std::string ParseSeparatedCJkNameExpression() { … }

// Returns an expression to parse a CJK name that starts with a known
// two-character last name.
std::string ParseCommonCjkTwoCharacterLastNameExpression() { … }

// Returns an expression to parse a CJK name without a separator.
// The full name is parsed into |NAME_FULL|, the first character is parsed
// into |NAME_LAST| and the rest into |NAME_FIRST|.
std::string ParseCjkSingleCharacterLastNameExpression() { … }

// Returns an expression to parse a Korean name that contains at least 4
// characters with a common Korean two-character last name. The full name is
// parsed into |NAME_FULL|, the first two characters into |NAME_LAST| and the
// rest into |NAME_FIRST|.
std::string ParseKoreanTwoCharacterLastNameExpression() { … }

// Returns an expression to determine if a name has the characteristics of a
// CJK name.
std::string MatchCjkNameExpression() { … }

// Returns an expression to parse a full name that contains only a last name.
std::string ParseOnlyLastNameExpression() { … }

// Returns an expression to parse a name that consists of a first, middle and
// last name with an optional honorific prefix. The full name is parsed into
// |NAME_FULL|. The name can start with an honorific prefix that is ignored.
// The last token is parsed into |NAME_LAST|.
// This token may be preceded by a last name prefix like "Mac" or
// "von" that is included in |NAME_LAST|. If the string contains any
// remaining tokens, the first token is parsed into
// |NAME_FIRST| and all remaining tokens into |NAME_MIDDLE|.
std::string ParseFirstMiddleLastNameExpression() { … }

// Returns an expression to parse a name that starts with the last name,
// followed by a comma, and then the first and middle names.
// The full name is parsed into |NAME_FULL|. The name can start with an optional
// honorific prefix that is ignored, followed by a single
// token that is parsed into |NAME_LAST|. The |NAME_LAST| must be followed by a
// comma with optional spaces. The next token is parsed into |NAME_FIRST| and
// all remaining tokens are parsed into |NAME_MIDDLE|.
std::string ParseLastCommaFirstMiddleExpression() { … }

// Returns an expression to parse a Hispanic/Latinx last name.
// The last name can consist of two parts with an optional conjunction.
// The full last name is parsed into |NAME_LAST|, the first part into
// |NAME_LAST_FIRST|, the conjunction into |NAME_LAST_CONJUNCTION|, and the
// second part into |NAME_LAST_SECOND|.
// Each last name part consists of a space-separated token with an optional
// prefix like "de le". If only one last name part is found, it is parsed into
// |NAME_LAST_SECOND|.
std::string ParseHispanicLastNameExpression() { … }

// Returns an expression to parse a full Hispanic/Latinx name that
// contains an optional honorific prefix which is ignored, a first name, and a
// last name as specified by |ParseHispanicLastNameExpression()|.
std::string ParseHispanicFullNameExpression() { … }

// Returns an expression that parses the whole |NAME_LAST| into
// |NAME_LAST_SECOND|.
std::string ParseLastNameIntoSecondLastNameExpression() { … }

// Returns an expression to parse a street address into the street name, the
// house number and the subpremise. The latter is parsed into the floor and
// apartment number. The expression is applicable, if the street name comes
// before the house number, followed by the floor and the apartment.
// Both the floor and the apartment must be indicated by a prefix.
// Example: Erika-Mann-Str. 44, Floor 2, Apartment 12
std::string ParseStreetNameHouseNumberExpression() { … }

// Returns an expression to parse a street address into the street name, the
// house number and the subpremise. The latter is parsed into the floor and
// apartment number. The expression is applicable, if the street name comes
// before the house number, followed by the floor and the apartment.
// Both the floor and the apartment must be indicated by a suffix.
// Example: Calla 1, 2º, 3ª
// Where 2 is the floor and 3 the apartment number.
std::string ParseStreetNameHouseNumberSuffixedFloorAndAppartmentExpression() { … }

// Returns an expression to parse a street address into the street name, the
// house number and the subpremise. The latter is parsed into the floor and
// apartment number. The expression is applicable, if the street name comes
// before the house number, followed by the floor which is indicated by a suffix
// and the apartment.
// Example: Av. Paulista, 1098, 1º andar, apto. 101
std::string ParseStreetNameHouseNumberExpressionSuffixedFloor() { … }

// Returns an expression to parse a street address into the street name, the
// house number and the subpremise. The latter is parsed into the floor and
// apartment number. The expression is applicable, if the house number comes
// before the street name, followed by the floor and the apartment.
// Both the floor and the apartment must be indicated by a prefix.
// Example: 1600 Main Avenue, Floor 2, Apartment 12
std::string ParseHouseNumberStreetNameExpression() { … }
}  // namespace

StructuredAddressesRegExProvider::StructuredAddressesRegExProvider() = default;

// static
StructuredAddressesRegExProvider* StructuredAddressesRegExProvider::Instance() { … }

std::string StructuredAddressesRegExProvider::GetPattern(
    RegEx expression_identifier,
    const std::string& country_code) { … }

const RE2* StructuredAddressesRegExProvider::GetRegEx(
    RegEx expression_identifier,
    const std::string& country_code) { … }

}  // namespace autofill
chromium/components/autofill/core/browser/data_model/autofill_structured_address_regex_provider.cc