// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
module screen_ai.mojom;
import "skia/public/mojom/bitmap.mojom";
import "ui/accessibility/ax_features.mojom";
import "ui/accessibility/mojom/ax_tree_id.mojom";
import "ui/accessibility/mojom/ax_tree_update.mojom";
import "ui/gfx/geometry/mojom/geometry.mojom";
// Direction of the script contained in a recognized word.
// Keep this enum in sync with the one defined in
// `services/screen_ai/proto/chrome_screen_ai.proto`.
enum Direction {
  DIRECTION_UNSPECIFIED,
  DIRECTION_LEFT_TO_RIGHT,
  DIRECTION_RIGHT_TO_LEFT,
  DIRECTION_TOP_TO_BOTTOM,
};
// Identifies the client that is using the OCR service.
enum OcrClientType {
  kTest,
  kPdfViewer,
  kLocalSearch,
  kCameraApp,
  kPdfSearchify,
  kMediaApp,
};
// Wrapper struct that mirrors parts of chrome_screen_ai.proto.
struct VisualAnnotation {
  // Text lines contained in this annotation.
  array<LineBox> lines;
};
// Wrapper struct that mirrors parts of chrome_screen_ai.proto. Describes a
// single line of text.
struct LineBox {
  // The words that make up this text line.
  array<WordBox> words;

  // The text of the line, in UTF8 format.
  string text_line;

  // Language guess for the line. Uses the ISO 639-1 two-letter language code
  // when one is defined (e.g. "en"), otherwise the ISO 639-2 three-letter code
  // when one is defined, otherwise a Google-specific code.
  string language;

  // ID of the text block this line belongs to.
  int32 block_id;

  // Index of this line within the block it belongs to.
  int32 order_within_block;

  // Bounding box of the line, relative to the original image.
  gfx.mojom.Rect bounding_box;

  // Rotation angle of the line bounding box about its top-left corner, in
  // degrees, clockwise.
  float bounding_box_angle;

  // Bounding box of the line, relative to the original image, whose bottom
  // edge represents the estimated baseline of the text.
  gfx.mojom.Rect baseline_box;

  // Rotation angle of the line baseline box about its top-left corner, in
  // degrees, clockwise.
  float baseline_box_angle;

  // Confidence computed by the OCR engine, in the range [0, 1].
  float confidence;
};
// Wrapper struct that mirrors parts of chrome_screen_ai.proto. Describes a
// single word.
struct WordBox {
  // The word itself, in UTF8 format.
  string word;

  // True when the word passes the internal beamsearch dictionary check.
  bool dictionary_word;

  // Language guess for the word. Uses the ISO 639-1 two-letter language code
  // when one is defined (e.g. "en"), otherwise the ISO 639-2 three-letter code
  // when one is defined, otherwise a Google-specific code.
  string language;

  // True when a space separates this word from the next word.
  bool has_space_after;

  // Bounding box of the word, relative to the original image.
  gfx.mojom.Rect bounding_box;

  // Rotation angle of the word bounding box about its top-left corner, in
  // degrees, clockwise.
  float bounding_box_angle;

  // Direction of the script contained in the word.
  Direction direction;

  // Confidence computed by the OCR engine, in the range [0, 1].
  float confidence;
};
// Main interface a client uses for visual annotation functions of the Screen AI
// service.
[RuntimeFeature=ax.mojom.features.kScreenAIOCREnabled]
interface ScreenAIAnnotator {
  // Receives an image, such as a screenshot or a page from a PDF file, and
  // asks the Screen AI library to perform OCR on it. Returns an AXTreeUpdate
  // with nodes built from OCR results.
  // The returned AXTreeUpdate is not a properly serialized update and is only
  // a container for the root id of a subtree and nodes built from OCR results.
  // Note: `image` is the only input; no accessibility tree ID is passed to
  // this method.
  PerformOcrAndReturnAXTreeUpdate(skia.mojom.BitmapN32 image) =>
      (ax.mojom.AXTreeUpdate update);

  // Performs OCR on `image`. Returns a `VisualAnnotation` that mirrors parts
  // of the underlying proto.
  PerformOcrAndReturnAnnotation(skia.mojom.BitmapN32 image) =>
      (VisualAnnotation visual_annotation);

  // Sets the OCR client type for metrics.
  SetClientType(OcrClientType client_type);
};
// Main interface a client uses for the Main Content Extraction function of the
// Screen AI service. Each RenderFrameImpl can have one AXTreeDistiller, which
// contains a Screen2xMainContentExtractor.
// All interfaces of one browser profile use one ScreenAIService.
interface Screen2xMainContentExtractor {
  // Takes a snapshot of the accessibility tree, schedules processing, and
  // returns the main content of that tree as node ids. `ukm_source_id` is a
  // ukm::SourceId used to tie the UKM event to the main frame URL for metrics
  // collection.
  ExtractMainContent(ax.mojom.AXTreeUpdate snapshot, int64 ukm_source_id) =>
      (array<int32> content_node_ids);

  // Takes a snapshot of the accessibility tree, schedules processing, and
  // returns the id of the main node of that tree.
  ExtractMainNode(ax.mojom.AXTreeUpdate snapshot) => (int32 main_node_id);
};
// Entry point to the OCR functionality of the Screen AI service.
// This interface is bound only after the Screen AI service has loaded its
// library and initialized it for OCR.
// The OCR service can receive multiple annotator pipelines and provides
// results for each of them separately.
[RuntimeFeature=ax.mojom.features.kScreenAIOCREnabled]
interface OCRService {
  // Binds a new annotator to the service.
  BindAnnotator(pending_receiver<ScreenAIAnnotator>? annotator);
};
// Entry point to the Main Content Extraction functionality of the Screen AI
// service.
// This interface is bound only after the Screen AI service has loaded its
// library and initialized it for Main Content Extraction.
// The Main Content Extraction service can receive multiple main content
// extractor pipelines and provides results for each of them separately.
[RuntimeFeature=ax.mojom.features.kScreenAIMainContentExtractionEnabled]
interface MainContentExtractionService {
  // Binds a new main content extractor to the service.
  BindMainContentExtractor(
      pending_receiver<Screen2xMainContentExtractor>? main_content_extractor);
};