// Copyright 2021 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef COMPONENTS_AUTOFILL_CORE_BROWSER_FORM_FOREST_H_ #define COMPONENTS_AUTOFILL_CORE_BROWSER_FORM_FOREST_H_ #include <memory> #include <optional> #include <vector> #include "base/containers/flat_map.h" #include "base/containers/flat_set.h" #include "base/containers/span.h" #include "base/memory/raw_ptr.h" #include "base/memory/raw_ref.h" #include "components/autofill/core/browser/autofill_driver.h" #include "components/autofill/core/browser/field_types.h" #include "components/autofill/core/common/form_data.h" #include "components/autofill/core/common/unique_ids.h" #include "third_party/abseil-cpp/absl/types/variant.h" namespace autofill::internal { // FormForest converts renderer forms into a browser form and vice versa. // // A *frame-transcending* form is a form whose logical fields live in different // frames. The *renderer forms* are the FormData objects as we receive them from // the renderers of these frames. The *browser form* of a frame-transcending // form is its root FormData, with all fields of its descendant FormDatas moved // into the root. // See AutofillDriverRouter for further details on the terminology and // motivation. // // Consider the following main frame with two frame-transcending forms: // // +--------------- Frame ---------------+ // | | // +--- Form-A --+ +--- Form-C --+ // | | | | | // Field-1 Frame Field-4 Frame Frame // | | | // +--- Form-B --+ Form-D Form-E // | | | | // Field-2 Field-3 Field-5 Frame // | // Form-F // | // Field-6 // // The renderer forms are form A, B, C, D, E, F. // // The browser form of forms A and B has the fields fields 1, 2, 3, 4. // Converting this browser form back to renderer forms yields Form-A and Form-B. // // Analogously, the browser form of forms C, D, E, and F has the fields 5 and 6. // Converting this browser form back to renderer forms yields forms C, D, E, F. // // The three key functions of FormForest are: // - UpdateTreeOfRendererForm() // - GetBrowserForm() // - GetRendererFormsOfBrowserFields() // // UpdateTreeOfRendererForm() incrementally builds up a graph of frames, forms, // and fields. // // This graph is a forest in two (entirely independent) ways: // // Firstly, there may be multiple root frames. One reason is the website author // can disconnect an entire frame subtree from the rest of the frame tree in the // future using the fencedframes tag and/or disallowdocumentaccess attribute. // Another reason is that the frame hierarchy emerges gradually and therefore // some links may be unknown. For example, Form-A might point to a nonexistent // frame of Form-B because, after Form-A was last parsed, a cross-origin // navigation happened in Form-B's frame. // // Secondly, removing the root frames obtains a forest, where each tree // corresponds to a frame-transcending form. We call the roots of this forest // *root forms*. In the example, forms A and C are root forms. This is relevant // because filling operations happen on the granularity of root forms. // // As an invariant, UpdateTreeOfRendererForm() keeps each frame-transcending // form in a flattened state: fields are stored as children of their root // forms. The fields are ordered according to pre-order depth-first (DOM order) // traversal of the original tree. In our example: // // +--------------- Frame ---------------+ // | | // +-------+---- Form-A ---+-------+ +-------+- Form-C -+-------+ // | | | | | | | | | // Field-1 Field-2 Field-3 Field-4 Frame Field-5 Field-6 Frame Frame // | | | // Form-B Form-D Form-E // | // Frame // | // Form-F // // There is no meaningful order between the fields and frames in these flattened // forms. // // GetBrowserForm(renderer_form) simply retrieves the form node of // |renderer_form| and returns the root form, along with its field children. For // example, if |renderer_form| is form B, it returns form A with fields 1–4. // // GetRendererFormsOfBrowserFields(browser_fields) returns the individual // renderer forms that constitute `browser_fields`, with their fields // reinstated. For example, if `browser_fields` has fields 1–4, it returns form // A with fields 1 and 4, and form B with fields 2 and 3. // // The node types in the forest always alternate as follows: // - The root nodes are frames. // - The children of frames are forms. // - The children of forms are frames or fields. // - Fields are leaves. Forms and frames may be leaves. // // Frames, forms, and fields are represented as FrameData, FormData, and // FormFieldData objects respectively. The graph is stored as follows: // - All FrameData nodes are stored directly in FormForest::frame_datas_. // - The FrameData -> FormData edges are stored in the FrameData::child_forms // vector of FormData objects. That is, each FrameData directly holds its // children. // - The FormData -> FrameData edges are stored in the FormData::child_frames // vector of FrameTokens. To retrieve the actual FrameData child, the token // is resolved to a LocalFrameToken if necessary (see Resolve()), and then // looked up in FormForest::frame_datas_. // - The FormData -> FormFieldData edges are stored in the FormData::fields // vector of FormFieldData objects. As per the aforementioned invariant, // fields are only stored in root forms. Each field's original parent can be // identified by FormFieldData::host_frame and FormFieldData::host_form_id. // // Reasonable usage of FormForest follows this protocol: // 1. Call UpdateTreeOfRendererForm(renderer_form) whenever a renderer form is // seen to make FormForest aware of the (new or potentially changed) form. // 2. Call GetBrowserForm(renderer_form.global_id()) directly afterwards (as // long as the renderer form is known to the FormForest). // 3. Call GetRendererFormsOfBrowserFields(browser_fields) only if // `browser_fields` was previously returned by GetBrowserForm(), perhaps with // different FormFieldData::value, FormFieldData::is_autofilled. // // For FormForest to be memory safe, // 1. UpdateTreeOfRendererForm() and GetRendererFormsOfBrowserFields() must only // be called for forms which have the following attributes set: // - FormData::host_frame // - FormData::renderer_id // - FormFieldData::host_frame // - FormFieldData::renderer_id // - FormFieldData::host_form_id // 2. GetBrowserForm() must only be called for known renderer forms. A renderer // form is *known* after a corresponding UpdateTreeOfRendererForm() call // until it is erased by EraseForms() or EraseFormsOfFrame(). // // FormForest works with LocalFrameToken and resolves the RemoteFrameTokens in // FormData::child_frames to LocalFrameTokens. // // From the perspective of a frame F, a frame G is either local or remote: // - If G is local, G is hosted by the same render process as F. // - If G is remote, G may be hosted by another render process. // // Suppose F is the parent frame of G. If G is local to F, then F refers to G in // its FormData::child_frames by G's LocalFrameToken. Otherwise, if G is remote // to F, then F uses a RemoteFrameToken as a placeholder to refer to G in // FormData::child_frames. // // While LocalFrameTokens are unique identifiers at any point in time, they may // change when a navigation happens in the frame: // - If G is local to F and a navigation causes G's render process to be // swapped so that G becomes remote, G gets a new LocalFrameToken and F will // refer to G by a fresh RemoteFrameToken. // - If G is remote to F and a navigation causes G's render process to be // swapped, then F may continue to refer to G by the same RemoteFrameToken // as before even if G's LocalFrameToken has changed. // The first example is the reason why UpdateTreeOfRendererForm() sometimes // triggers form re-extraction in a parent frame. The second example is the // reason why we do not cache LocalFrameTokens. class FormForest { … }; } // namespace autofill::internal #endif // COMPONENTS_AUTOFILL_CORE_BROWSER_FORM_FOREST_H_