// Copyright 2018 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifndef UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_ #define UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_ #include <memory> #include <string> #include <unordered_map> #include <unordered_set> #include <utility> #include <vector> #include "base/memory/raw_ptr.h" #include "third_party/cld_3/src/src/nnet_language_identifier.h" #include "ui/accessibility/ax_enums.mojom-forward.h" #include "ui/accessibility/ax_export.h" #include "ui/accessibility/ax_tree_observer.h" namespace ui { class AXNode; class AXTree; // This module implements language detection enabling Chrome to automatically // detect the language for runs of text within the page. // // Node-level language detection runs once per page after the load complete // event. This involves two passes: // *Detect* walks the tree from the given root using cld3 to detect up to 3 // potential languages per node. A ranked list is created enumerating // all potential languages on a page. // *Label* re-walks the tree, assigning a language to each node considering // the potential languages from the detect phase, page level // statistics, and the assigned languages of ancestor nodes. // // Optionally an embedder may run *sub-node* language detection which attempts // to assign languages for runs of text within a node, potentially down to the // individual character level. This is useful in cases where a single paragraph // involves switching between multiple languages, and where the speech engine // doesn't automatically switch voices to handle different character sets. // Due to the potentially small lengths of text runs involved this tends to be // lower in accuracy, and works best when a node is composed of multiple // languages with easily distinguishable scripts. // AXLanguageInfo represents the local language detection data for all text // within an AXNode. Stored on AXNode. 
struct AX_EXPORT AXLanguageInfo { … }; // Each AXLanguageSpan contains a language, a probability, and start and end // indices. The indices are used to specify the substring that contains the // associated language. The string which the indices are relative to is not // included in this structure. // Also, the indices are relative to a UTF-8 string. // See documentation on GetLanguageAnnotationForStringAttribute for details // on how to associate this object with a string. struct AX_EXPORT AXLanguageSpan { … }; // A single AXLanguageInfoStats instance is stored on each AXTree and contains // statistics on detected languages for all the AXNodes in that tree. // // We rely on these tree-level statistics when labelling individual nodes, to // provide extra signals to increase our confidence in assigning a detected // language. // // These tree-level statistics are also used to send reports on the language // detection feature to enable tuning. // // The Label step will only assign a detected language to a node if that // language is one of the most frequent languages on the page. // // For example, if a single node has detected_languages (in order of probability // assigned by cld_3): da-DK, en-AU, fr-FR, but the page statistics overall // indicate that the page is generally in en-AU and ja-JP, it is more likely to // be a mis-recognition of Danish than an accurate assignment, so we assign // en-AU instead of da-DK. class AX_EXPORT AXLanguageInfoStats { … }; // AXLanguageDetectionObserver is registered as a change observer on an AXTree // and will run language detection after each update to the tree. // // We have kept this observer separate from the AXLanguageDetectionManager as we // are aiming to launch language detection in two phases and wanted to try to keep // the code paths somewhat separate. // // TODO(chrishall): After both features have launched we could consider merging // AXLanguageDetectionObserver into AXLanguageDetectionManager. 
// // TODO(chrishall): Investigate the cost of using AXTreeObserver, given that it // has many empty virtual methods which are called for every AXTree change and // we are currently only interested in OnAtomicUpdateFinished. class AX_EXPORT AXLanguageDetectionObserver : public AXTreeObserver { … }; // AXLanguageDetectionManager manages all of the context needed for language // detection within an AXTree. class AX_EXPORT AXLanguageDetectionManager { … }; } // namespace ui #endif // UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_