chromium/components/autofill/core/browser/form_parsing/transpile_regex_patterns.py

#!/usr/bin/env python

# Copyright 2020 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import argparse
from collections import defaultdict
import io
import re
import sys
import json

# Generates a function AreMatchingPatternsEqualImpl(a, b, lang_code), which
# tests whether the patterns for PatternSources a and b in language lang_code
# match. This can be used to avoid unnecessarily recomputing heuristics.
# All return values are precomputed, making this a constant time operation at
# runtime.
# - `name_to_lang_to_id_to_patternrefs`: See `generate_cpp_constants()` for a
#   description.
# - `lang_array`: A sorted list language codes for which lookups should be
#   precomputed.
# - [`min_pattern_id`, `max_pattern_id`] all valid PatternSources IDs for which
#   lookups should be precomputed.
def generate_matching_pattern_equals(name_to_lang_to_id_to_patternrefs,
    lang_array, min_pattern_id, max_pattern_id):
  # Tests if all patterns of `lang` match between to pattern source ids.
  # "PATTERN_SOURCE_DUMMY" is ignored, since it is only just to identify the
  # patterns for debugging purposes.
  def patterns_match(a_id, b_id, lang):
    return all(lang_to_id_to_patternrefs[lang][a_id] ==
      lang_to_id_to_patternrefs[lang][b_id]
      for name, lang_to_id_to_patternrefs in
      name_to_lang_to_id_to_patternrefs.items()
      if name != "PATTERN_SOURCE_DUMMY")

  yield '// Checks if all the matching patterns for the given PatternSources'
  yield '// and language are the same - meaning that computing predictions for'
  yield '// both is unnecessary, since it will yield the same result.'
  yield 'constexpr bool AreMatchingPatternsEqualImpl(PatternSource a,'
  yield '                                            PatternSource b,'
  yield '                                            LanguageCode lang_code) {'
  yield '  if (a == b) {'
  yield '    return true;'
  yield '  }'
  yield '  auto a_id = base::to_underlying(a);'
  yield '  auto b_id = base::to_underlying(b);'
  yield '  if (a_id > b_id) {'
  yield '    std::swap(a_id, b_id);'
  yield '  }'

  # Precompute the result for all provided pattern ids and languages.
  for a_id in range(min_pattern_id, max_pattern_id + 1):
    for b_id in range(a_id + 1, max_pattern_id + 1):
      langs = [lang for lang in lang_array if patterns_match(a_id, b_id, lang)]
      if langs == []:
        continue
      yield f'  if (a_id == {a_id} && b_id == {b_id}) {{'
      if langs == lang_array:
        yield '    return true;'
      else:
        disjunction = " || ".join(f"*lang_code == \"{lang}\"" for lang in langs)
        yield f'    return {disjunction};'
      yield '  }'

  yield '  return false;'
  yield '}'

# Generates a set of C++ constexpr constants to facilitate lookup of a set of
# MatchingPatterns by a given tuple (pattern name, language code).
#
# id_to_name_to_lang_to_patterns is a
# map from pattern source IDs to a
#   map from pattern names to a
#     map from language codes to a
#       list of patterns,
# where the pattern source IDs are consecutive natural numbers identifying the
# input JSON files.
#
# The constants are:
#
# - kPatterns is an array of MatchingPatterns without duplicates.
#   The array is not sorted. The patterns are condensed (see below).
# - kPatterns__<pattern source id>__<pattern name>__<language code> contains
#   the indices of the pattterns that from the pattern source for that pattern
#   name and language code.
# - kPatternMap is a fixed flat map from (pattern name, language code) tuples
#   to arrays of spans of MatchPatternRefs; the indices of the array correspond
#   to the pattern sources.
# - kLanguages is a fixed flat set of language codes across all pattern source
#   ids and all pattern names.
#
# This representation has larger binary size on Android than using nested
# lookup pattern name -> language code -> span of MatchPatternRefs, but it's
# significantly simpler.
def generate_cpp_constants(id_to_name_to_lang_to_patterns):
  # Stores a `key` in `dictionary` and assigns it a natural number.
  #
  # For example, after memoize("foo", d) and memoize("bar", d),
  # d = {"foo": 0, "bar": 1}. This is useful to generate a C++ array
  # {"foo", "bar"} and referring to these elements by their indices
  # 0 and 1.
  def memoize(key, dictionary):
    if key not in dictionary:
      dictionary[key] = len(dictionary)
    return dictionary[key]

  # Maps a Python Boolean to a C++ Boolean literal.
  def python_bool_to_cpp(b):
    return 'true' if b else 'false'

  # Maps a string literal to a C++ string literal.
  def json_to_cpp_string_literal(json_string_literal):
    return json.dumps(json_string_literal or '')

  # Maps a string literal to a C++ UTF-16 string literal.
  def json_to_cpp_u16string_literal(json_string_literal):
    return 'u'+ json.dumps(json_string_literal or '')

  # Maps a list of strings to a DenseSet containing these values.
  # The strings represents constants in the format FOO_BAR.
  # They're translated to the C++ constant kFooBar.
  def json_to_cpp_dense_set(json_enum_values, cpp_enum_type):
    # Converts FOO_BAR into kFooBar.
    def json_to_cpp_constant_symbol(json):
      assert json.isupper()
      return f'{cpp_enum_type}::k' + re.sub(
          r'(^|_)([a-z])', lambda matched: matched.group(2).upper(),
          json.lower())
    cpp_enum_values = [json_to_cpp_constant_symbol(c) for c in json_enum_values]
    return f'DenseSet<{cpp_enum_type}>{{' + ','.join(cpp_enum_values) + f'}}'

  # Maps a list of strings to a DenseSet<MatchAttribute> expression.
  # The strings must be the names of MatchAttribute constants, e.g., NAME.
  # They're mapped to C++ constants, e.g., kName.
  def json_to_cpp_match_field_attributes(enum_values):
    return json_to_cpp_dense_set(enum_values, 'MatchAttribute')

  # Maps a list of strings to a DenseSet<FormControlType> expression.
  # The strings must be the names of FormControlType constants, e.g., TEXT_AREA.
  # They're mapped to C++ constants, e.g., kTextArea.
  def json_to_cpp_form_control_types(enum_values):
    return json_to_cpp_dense_set(enum_values, 'FormControlType')

  # Maps a JSON object representing a pattern to a C++ MatchingPattern
  # expression.
  def json_to_cpp_pattern(json):
    positive_pattern = json_to_cpp_u16string_literal(json['positive_pattern'])
    negative_pattern = json_to_cpp_u16string_literal(json['negative_pattern'])
    positive_score = json['positive_score']
    match_field_attributes = json_to_cpp_match_field_attributes(
        json['match_field_attributes'])
    form_control_types = json_to_cpp_form_control_types(
        json['form_control_types'])
    return f'MatchingPattern{{\n' \
           f'  .positive_pattern = {positive_pattern},\n' \
           f'  .negative_pattern = {negative_pattern},\n' \
           f'  .positive_score = {positive_score},\n' \
           f'  .match_field_attributes = {match_field_attributes},\n' \
           f'  .form_control_types = {form_control_types},\n' \
           f'}}'

  # Name of the auxiliary C++ constant.
  def kPatterns(id, name, lang):
    return (f"kPatterns__{id}__{name}__{lang.replace('-', '_')}"
            if lang else f'kPatterns__{id}__{name}')

  # Preprocess the patterns.
  for id, name_to_lang_to_patterns in id_to_name_to_lang_to_patterns.items():
    if "__comment__" in name_to_lang_to_patterns:
      del name_to_lang_to_patterns["__comment__"]

    # Validate the JSON input.
    for name, lang_to_patterns in name_to_lang_to_patterns.items():
      if '' in lang_to_patterns:
        raise Exception('JSON format error: language is ""')

    # Remember each pattern's language.
    #
    # To ease debugging, we shall sort patterns of equal positive_score by their
    # language.
    for lang_to_patterns in name_to_lang_to_patterns.values():
      for lang, patterns in lang_to_patterns.items():
        for pattern in patterns:
          pattern['lang'] = lang

    # For each name, collect the items of all languages and add them as a
    # separate entry for the pseudo-language ''.
    for name, lang_to_patterns in name_to_lang_to_patterns.items():
      lang_to_patterns[''] = [
          p.copy() for ps in lang_to_patterns.values() for p in ps
      ]

    # Add the English patterns to all languages except for English itself and
    # the catch-all language ''.
    #
    # The idea is that these patterns should be applied (only) to the HTML
    # source code, i.e., not the user-visible labels. To this end, we mark the
    # English patterns here "supplementary", which will in the subsequent step
    # be encoded in the MatchPatternRef().
    for lang_to_patterns in name_to_lang_to_patterns.values():
      if 'en' not in lang_to_patterns:
        continue
      def make_supplementary_pattern(p):
        assert "NAME" in p['match_field_attributes']
        p = p.copy()
        p['supplementary'] = True
        return p
      for patterns in (patterns for lang, patterns in lang_to_patterns.items()
                       if lang not in ['', 'en']):
        patterns.extend(
            make_supplementary_pattern(p)
            for p in lang_to_patterns['en']
            if "NAME" in p['match_field_attributes'])

  # Populate the two maps:
  # - a map from C++ MatchingPattern expressions to their index.
  # - a map from names and languages and IDs to the their MatchingPatterns,
  #   represented as list of tuples (is_supplementary, pattern_index).
  pattern_to_index = {}
  name_to_lang_to_id_to_patternrefs = defaultdict(
      lambda: defaultdict(lambda: defaultdict(list)))
  for id, name_to_lang_to_patterns in id_to_name_to_lang_to_patterns.items():
    for name, lang_to_patterns in sorted(name_to_lang_to_patterns.items()):
      for lang, patterns in sorted(lang_to_patterns.items()):
        patternrefs = name_to_lang_to_id_to_patternrefs[name][lang][id]
        sort_key = lambda p: (-p['positive_score'], p['lang'])
        for pattern in sorted(patterns, key=sort_key):
          is_supplementary = ('supplementary' in pattern and
                              pattern['supplementary'])
          pattern_index = memoize(
              json_to_cpp_pattern(pattern), pattern_to_index)
          patternrefs.append((is_supplementary, pattern_index))

  # Generate the C++ constants.
  yield '// The patterns. Referred to by their index in MatchPatternRef.'
  yield 'constexpr auto kPatterns = std::to_array<MatchingPattern>({'
  for cpp_expr, index in sorted(
      pattern_to_index.items(), key=lambda item: item[1]):
    yield f'/*[{index}]=*/{cpp_expr},'
  yield '});'
  yield ''

  min_pattern_id = min(id_to_name_to_lang_to_patterns.keys())
  max_pattern_id = max(id_to_name_to_lang_to_patterns.keys())

  yield '// The patterns for field types and languages.'
  yield '// They are sorted by the patterns MatchingPattern::positive_score.'
  for name, lang_to_id_to_patternrefs in (
      name_to_lang_to_id_to_patternrefs.items()):
    for lang, id_to_patternrefs in lang_to_id_to_patternrefs.items():
      for id, patternrefs in id_to_patternrefs.items():
        # If another pattern source has the same patterns for this `name` and
        # `lang`, we don't need to a new array; a reference suffices.
        ids_with_the_same_patternrefs = [
            i for i in range(min_pattern_id, id)
            if i in name_to_lang_to_id_to_patternrefs[name][lang] and
            patternrefs == name_to_lang_to_id_to_patternrefs[name][lang][i]
        ]
        if ids_with_the_same_patternrefs != []:
          other_id = ids_with_the_same_patternrefs[0]
          yield (f'constexpr auto {kPatterns(id, name, lang)} = '
                 f'base::make_span({kPatterns(other_id, name, lang)});')
        else:
          yield (f'constexpr MatchPatternRef {kPatterns(id, name, lang)}[] {{' +
                 f', '.join(
                     f'MakeMatchPatternRef('+
                     f'{python_bool_to_cpp(is_supplementary)}, {index})'
                     for is_supplementary, index in patternrefs) + f'}};')
  yield ''

  yield '// The lookup map for field types and langs.'
  yield '//'
  yield '// The key type in the map is essentially a pair of const char*.'
  yield '// It also allows for lookup by std::string_view pairs (because the'
  yield '// comparator transparently accepts std::string_view pairs).'
  yield '//'
  yield '// The value type is an array of spans of MatchPatternRefs. The'
  yield '// indices of the array correspond to the pattern source: the patterns'
  yield '// from the first input JSON file are stored at index 0, etc.'
  yield '//'
  yield '// This design exploits that the different JSON files by and large'
  yield '// contain the same pattern names and languages. If instead we'
  yield '// generated an individual map for each JSON file, then, assuming four'
  yield '// JSON files, the duplicate keys would cause 60% overhead, which'
  yield '// adds up to >10K binary size on Android.'
  yield 'constexpr auto kPatternMap = base::MakeFixedFlatMap<' \
      'NameAndLanguage, '\
      'std::array<base::span<const MatchPatternRef>, ' \
      f'{max_pattern_id - min_pattern_id + 1}' \
      '>>({'
  for name, lang_to_id_to_patternrefs in sorted(
      name_to_lang_to_id_to_patternrefs.items()):
    for lang, id_to_patternrefs in sorted(lang_to_id_to_patternrefs.items()):
      pattern_array = [
          kPatterns(id, name, lang)
          if id in id_to_patternrefs else 'base::span<const MatchPatternRef>{}'
          for id in range(min_pattern_id, max_pattern_id + 1)
      ]
      yield f'  {{{{"{name}", "{lang}"}}, {{{", ".join(pattern_array)}}}}},'
  yield '}, NameAndLanguageComparator());'
  yield ''

  language_array = sorted(
      set(lang for lang_to_id_to_patternrefs in
          name_to_lang_to_id_to_patternrefs.values()
          for lang in lang_to_id_to_patternrefs.keys() if lang != ''))
  yield '// The set of language codes across all language source ids and'
  yield '// pattern names.'
  yield 'constexpr auto kLanguages = base::MakeFixedFlatSet<const char*>({'
  quoted_languages = [f'"{lang}"' for lang in language_array]
  yield f'  {", ".join(quoted_languages)}'
  yield '}, LanguageComparator());'

  yield ''
  yield from generate_matching_pattern_equals(name_to_lang_to_id_to_patternrefs,
    language_array, min_pattern_id, max_pattern_id)

def generate_cpp_lines(id_to_name_to_lang_to_patterns):
  yield """// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_FORM_PARSING_REGEX_PATTERNS_INL_H_
#define COMPONENTS_AUTOFILL_CORE_BROWSER_FORM_PARSING_REGEX_PATTERNS_INL_H_

#include <algorithm>
#include <array>
#include <string_view>

#include "base/containers/fixed_flat_map.h"
#include "base/containers/fixed_flat_set.h"
#include "base/containers/span.h"
#include "base/types/cxx23_to_underlying.h"

#include "components/autofill/core/browser/form_parsing/regex_patterns.h"
#include "components/autofill/core/common/dense_set.h"
#include "components/autofill/core/browser/form_parsing/autofill_parsing_utils.h"

namespace autofill {

// Wrapper of MatchPatternRef's private constructor.
// It's a friend of MatchPatternRef.
constexpr MatchPatternRef MakeMatchPatternRef(
    bool is_supplementary,
    MatchPatternRef::UnderlyingType index) {
  return MatchPatternRef(is_supplementary, index);
}

// A pair of const char* used as keys in the `kPatternMap`.
struct NameAndLanguage {
  using StringViewPair = std::pair<std::string_view, std::string_view>;

  // By this implicit conversion, the below comparator can be called for
  // NameAndLanguageComparator and StringViewPairs alike.
  constexpr operator StringViewPair() const {
    return {std::string_view(name), std::string_view(lang)};
  }

  const char* name;  // A pattern name.
  const char* lang;  // A language code.
};

// A less-than relation on NameAndLanguage and/or std::string_view pairs.
struct NameAndLanguageComparator {
  using is_transparent = void;

  // By way of the implicit conversion from NameAndLanguage to StringViewPair,
  // this function also accepts NameAndLanguage.
  //
  // To implement constexpr lexicographic comparison of const char* with the
  // standard library, we need to compute both the lengths of the strings before
  // we can actually compare the strings. A simple way of doing so is to convert
  // each const char* to a std::string_view and then comparing the
  // std::string_views.
  //
  // This is exactly what the comparator does: when an argument is a
  // NameAndLanguage, it is implicitly converted to a StringViewPair, which
  // is then compared to the other StringViewPair.
  constexpr bool operator()(
      NameAndLanguage::StringViewPair a,
      NameAndLanguage::StringViewPair b) const {
    int cmp = a.first.compare(b.first);
    return cmp < 0 || (cmp == 0 && a.second.compare(b.second) < 0);
  }
};

// A less-than relation on const char* and std::string_view, in particular for
// language codes.
struct LanguageComparator {
  using is_transparent = void;

  // This function also accepts const char* by implicit conversion to
  // std::string_view.
  //
  // This comparator facilitates constexpr comparison among const char*
  // similarly to the above NameAndLanguageComparator.
  constexpr bool operator()(std::string_view a, std::string_view b) const {
    return a.compare(b) < 0;
  }
};
"""
  yield from generate_cpp_constants(id_to_name_to_lang_to_patterns)
  yield """
}  // namespace autofill

#endif  // COMPONENTS_AUTOFILL_CORE_BROWSER_FORM_PARSING_REGEX_PATTERNS_INL_H_
"""

def build_cpp_file(id_to_name_to_lang_to_patterns, output_handle):
  for line in generate_cpp_lines(id_to_name_to_lang_to_patterns):
    line += '\n'
    # unicode() exists and is necessary only in Python 2, not in Python 3.
    if sys.version_info[0] < 3:
      line = unicode(s, 'utf-8')
    output_handle.write(line)

def parse_json(input_files, output_file):
  id_to_name_to_lang_to_patterns = {}
  for index, input_file in enumerate(input_files):
    with io.open(input_file, 'r', encoding='utf-8') as input_handle:
      id_to_name_to_lang_to_patterns[index] = json.load(input_handle)

  with io.open(output_file, 'w', encoding='utf-8') as output_handle:
    build_cpp_file(id_to_name_to_lang_to_patterns, output_handle)

if __name__ == '__main__':
  parser = argparse.ArgumentParser(
      description='Transpiles parsing patterns from JSON to C++.')
  parser.add_argument(
      '--input',
      metavar='json-file',
      required=True,
      type=str,
      nargs='+',
      help='JSON file(s) containing patterns')
  parser.add_argument(
      '--output',
      metavar='header-file',
      required=True,
      type=str,
      help='C++ header file to be generated')
  args = parser.parse_args()
  if not args.input or not args.output:
    parser.print_help()
  else:
    parse_json(args.input, args.output)