chromium/components/autofill/core/browser/heuristic_classification_unittests.cc

// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// The purpose of this file is to perform end-to-end form classifications for
// form structures that were recorded as JSON files. These tests currently don't
// produce 100% fidelity compared to real browsing. E.g. we don't support
// invisible fields, yet. Still, they are helpful for tuning heuristics and
// rationalization against a corpus of observed forms.
//
// Test files are located in components/test/data/autofill/heuristics-json/
// and describe the status quo, not necessarily the correct, expected behavior.
// If the classification changes (for the better or worse), a new file is
// written into this directory and the test fails. You can compare the results
// with `diff -U5 $old_file $new_file`.
//
// The structure of the input files is as follows:
//  {
//    "config": {
//       // 2 letter country code, used to mock the user's current location.
//      "country": "US",
//      // 2 letter language code, used to mock the website's language.
//      "language": "en",
//      // List of fields for which the expected type is verified. Fields not
//      // listed here are presented to the local heuristics but the outputs
//      // are not checked and mismatches are not reported.
//      "fields_in_scope": [
//        "UNKNOWN_TYPE",
//        "ADDRESS_HOME_CITY",
//        ...
//      ],
//    },
//    "sites": [
//      {
//        // URL of the website from which the form was recorded, useful for
//        // debugging.
//        "site_url": "https://www.example.com",
//        // List of forms recorded for the website (e.g. a website can have an
//        // address form and a payment form).
//        "forms": [
//          {
//            "form_signature": "1234567",
//            "fields": [
//               {
//                 // "{form_sig}_{field_sig}_{field_rank_in_signature_group}"
//                 "id": "15461699092647468671_1855613035_0",
//                 "field_signature": "1855613035",
//                 // Absolute position of the field in the form. Fields should
//                 // be sorted by "field_position". Fields are presented to the
//                 // heuristics in the order they appear in the JSON file.
//                 // This field is only used for debugging purposes.
//                 "field_position": 0,
//                 // <label>{label_attr}
//                 // <input id="{id_attr}" name="{name_attr}"
//                 //        type="{type_attr}"
//                 //        autocomplete="{autocomplete_attr}">
//                 // </label>
//                 "id_attr": "first",
//                 "name_attr": "firstName",
//                 "label_attr": "First name",
//                 "type_attr": "text",
//                 "autocomplete_attr": "given-name",
//                 // The field types a human tester considered correct.
//                 // Currently only the first type is considered.
//                 "tester_type": [
//                   "NAME_FIRST"
//                 ],
//                 // Correctness of the last classification. The value can be
//                 // one of:
//                 // - "correct" if the last classification matched the first
//                 //   "tester_type".
//                 // - "not_recognized: {tester_type}, chosen_instead: {type2}"
//                 //   if {tester_type} was not recognized by the heuristics
//                 //   but classification and rationalization produced
//                 //   {type2} instead.
//                 // - "ignored: {tester_type}" if the field type is not in
//                 //   scope of the test.
//                 "last_correctness": "correct|not_recognized: ...",
//                 // ^^^^^^ THIS GETS UPDATED BY RUNNING THE TEST.
//                 // The last field type predicted by the heuristics and
//                 // rationalization.
//                 "last_classification": "NAME_FIRST"
//                 // ^^^^^^ THIS GETS UPDATED BY RUNNING THE TEST.
//              }
//            ]
//          }
//        ]
//      }
//    ],
//    // Summary of the classification.
//    "stats": {
//    // ^^^^^^ THIS GETS UPDATED BY RUNNING THE TEST
//      "high_level_stats": {
//        // Which fraction of fields had the heuristic type match the tester
//        // type.
//        "fraction_machtes": 0.7258244384259996,
//        // Number of fields for which the heuristic type matched the tester
//        // type or did not match.
//        "matches": 9112,
//        "mismatches": 3442
//      },
//      // Same statistics as above, drilled down by tester type.
//      "per_type_stats": {
//         "{tester_type}": {
//            "fraction_machtes": 0.9132743362831859,
//            "matches": 1032,
//            "mismatches": 98
//         },
//         ...
//      },
//      "ignored_types_stats": {
//        "{tester_type}": 1
//      }
//    }
//  }

#include <iomanip>
#include <sstream>
#include <string_view>

#include "base/command_line.h"
#include "base/containers/contains.h"
#include "base/containers/flat_map.h"
#include "base/containers/flat_set.h"
#include "base/feature_list.h"
#include "base/files/file_enumerator.h"
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/json/json_reader.h"
#include "base/json/json_writer.h"
#include "base/logging.h"
#include "base/no_destructor.h"
#include "base/path_service.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "base/test/scoped_command_line.h"
#include "base/test/scoped_feature_list.h"
#include "base/test/test_timeouts.h"
#include "base/values.h"
#include "build/build_config.h"
#include "components/autofill/core/browser/form_structure.h"
#include "components/autofill/core/browser/heuristic_source.h"
#include "components/autofill/core/browser/logging/log_manager.h"
#include "components/autofill/core/browser/logging/log_router.h"
#include "components/autofill/core/common/autocomplete_parsing_util.h"
#include "components/autofill/core/common/autofill_features.h"
#include "components/autofill/core/common/autofill_test_utils.h"
#include "components/autofill/core/common/form_data_test_api.h"
#include "components/autofill/core/common/language_code.h"
#include "components/variations/variations_switches.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"

#if BUILDFLAG(IS_MAC)
#include "base/apple/foundation_util.h"
#endif

// NOTE(review): these three bare names match gtest's assertion-result helpers
// (`AssertionResult` is used as a return type further down in this file).
// They read like using-declarations whose `using ::testing::` prefix was
// lost — confirm against the upstream file before relying on them.
AssertionFailure;
AssertionResult;
AssertionSuccess;

namespace autofill {
namespace {

// Helper class that aggregates metrics and diagnostic data about field
// classifications that matched or mismatched the expectations.
// Forms are handed in one at a time via `AnalyzeClassification()`; the
// collected data is retrieved as a `base::Value` via `GetResult()`.
// NOTE(review): the member declarations are not visible in this view.
class ResultAnalyzer {};

// Analyzes the classification of a single form.
// `form_structure` is the form whose fields have been classified.
// `form_dict` is the JSON dictionary of the same form; it is taken by mutable
// reference.
// NOTE(review): presumably this writes "last_classification" and
// "last_correctness" into the field entries of `form_dict` and updates the
// aggregated match/mismatch counters — the body is not visible here, confirm.
void ResultAnalyzer::AnalyzeClassification(const FormStructure& form_structure,
                                           base::Value::Dict& form_dict) {}

// Returns the outcome of the analysis as a `base::Value`.
// NOTE(review): presumably this is the "stats" dictionary described in the
// comment at the top of this file — confirm.
base::Value ResultAnalyzer::GetResult() {}

// Returns the path containing test input files,
// components/test/data/autofill/heuristics-json/.
// The path is returned by const reference.
// NOTE(review): this suggests the path is computed once and kept alive for
// the lifetime of the process — confirm.
const base::FilePath& GetInputDir() {}

// Returns all "*.json" files in `GetInputDir()`.
// NOTE(review): presumably these paths are the parameters of
// `HeuristicClassificationTests` (which is parameterized by `base::FilePath`)
// — confirm against the suite instantiation.
std::vector<base::FilePath> GetTestFiles() {}

// Extracts data of a single field from `field_dict` using the Form `form_data`
// as contextual information.
// `field_dict` corresponds to an entry of `.sites[].forms[].fields[]` in the
// JSON input file in jq syntax (https://jqlang.github.io/jq/).
// `form_data` is only read (it is passed by const reference). The parsed field
// is returned by value.
FormFieldData ParseFieldFromJsonDict(const base::Value::Dict& field_dict,
                                     const FormData& form_data) {}

// Parses a single form from `form_dict` into the out-parameter `form_data`.
// NOTE(review): by analogy with `ParseFieldFromJsonDict()`, `form_dict`
// presumably corresponds to an entry of `.sites[].forms[]` in the JSON input
// file and `site_url` to the enclosing `.sites[].site_url` — confirm.
// The returned `AssertionResult` must not be ignored ([[nodiscard]]).
// NOTE(review): presumably it reports whether the JSON could be parsed — the
// body is not visible here, confirm.
[[nodiscard]] AssertionResult ParseFormFromJsonDict(
    const base::Value::Dict& form_dict,
    const std::string& site_url,
    FormData& form_data) {}

// Tests classifications of a site. The returned test result expresses whether
// the test data could be parsed and the fields could be classified. It does
// not make an assessment of whether the heuristics generated the expected data.
// That is recorded via `result_analyzer`.
// Test field classification results are updated in `site` in the
// `.sites[].forms[].fields[].last_classification` field. This is why the `site`
// is a mutable parameter.
// `site` corresponds to an entry of `.sites[]` in the JSON input file in jq
// syntax (https://jqlang.github.io/jq/).
// NOTE(review): `client_country` and `page_language` are presumably the
// "country" and "language" values from the "config" section of the JSON input
// file — confirm against the call site.
// NOTE(review): `log_manager` is passed as a raw pointer; whether it may be
// null is not visible here — confirm.
[[nodiscard]] AssertionResult ClassifyFieldsOfSite(
    base::Value::Dict& site,
    const GeoIpCountryCode& client_country,
    LanguageCode page_language,
    ResultAnalyzer& result_analyzer,
    LogManager* log_manager) {}

// Creates a textual description of the statistics. This is good for a quick
// view in the delta for an EXPECT_EQ().
// `json_file` is only read (it is passed by const reference).
// NOTE(review): presumably `json_file` is the root dictionary of a test file
// and the summary is derived from its "stats" entry — confirm.
[[nodiscard]] std::string SummarizeStatistics(
    const base::Value::Dict& json_file) {}

// Test fixture for the end-to-end classification tests. It is value-
// parameterized with a `base::FilePath`; per the comment on
// `GenerateTestName()` the parameter is the name of a test file.
// NOTE(review): the fixture's members are not visible in this view.
class HeuristicClassificationTests
    : public testing::Test,
      public testing::WithParamInterface<base::FilePath> {};

// Fixture setup hook; gtest runs `SetUp()` before each test body.
// NOTE(review): the body is not visible here. The feature-list, command-line
// and variations includes at the top of the file suggest it configures the
// test environment — confirm.
void HeuristicClassificationTests::SetUp() {}

// End-to-end test that runs once per test-file parameter.
// NOTE(review): the body is not visible here. Per the comment at the top of
// this file, it presumably classifies the forms recorded in the JSON file,
// writes a new file and fails if the classification changed — confirm.
TEST_P(HeuristicClassificationTests, EndToEnd) {}

// Maps a test file name to a short string that is used in the test name.
// E.g. a file "internal/DE.json" becomes "DE" such that the test is called
// AllForms/HeuristicClassificationTests.EndToEnd/DE.
// `info` is gtest's description of one test parameter; the parameter type is
// `base::FilePath`.
std::string GenerateTestName(
    const testing::TestParamInfo<base::FilePath>& info) {}

// Instantiates the value-parameterized test suite.
// NOTE(review): the macro arguments are not visible here. Going by the test
// name documented on `GenerateTestName()`, the instantiation prefix is
// presumably "AllForms", with `GetTestFiles()` supplying the parameters and
// `GenerateTestName` the name suffix — confirm.
INSTANTIATE_TEST_SUITE_P();

}  // namespace
}  // namespace autofill