chromium/services/screen_ai/public/mojom/screen_ai_service.mojom

// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

module screen_ai.mojom;

import "skia/public/mojom/bitmap.mojom";
import "ui/accessibility/ax_features.mojom";
import "ui/accessibility/mojom/ax_tree_id.mojom";
import "ui/accessibility/mojom/ax_tree_update.mojom";
import "ui/gfx/geometry/mojom/geometry.mojom";

// Direction of the script contained in a recognized word (see
// `WordBox.direction`).
//
// This enum should be kept in sync with the one in
// `services/screen_ai/proto/chrome_screen_ai.proto`.
//
// The values carry no explicit numbers, so they are numbered implicitly by
// declaration order.
// NOTE(review): presumably the numeric values must line up with the proto
// enum; confirm before inserting or reordering entries.
enum Direction {
  DIRECTION_UNSPECIFIED,
  DIRECTION_LEFT_TO_RIGHT,
  DIRECTION_RIGHT_TO_LEFT,
  DIRECTION_TOP_TO_BOTTOM,
};

// Clients of the OCR service. A client identifies itself by passing one of
// these values to `ScreenAIAnnotator.SetClientType`, which uses it for
// metrics.
//
// The values carry no explicit numbers, so they are numbered implicitly by
// declaration order.
// NOTE(review): if these values are recorded in histograms, new clients
// should presumably be appended at the end rather than inserted - confirm.
enum OcrClientType {
  kTest,
  kPdfViewer,
  kLocalSearch,
  kCameraApp,
  kPdfSearchify,
  kMediaApp,
};

// A wrapper struct mirroring parts of the chrome_screen_ai.proto.
// This is the result type returned by
// `ScreenAIAnnotator.PerformOcrAndReturnAnnotation`.
struct VisualAnnotation {
  // Text lines contained in the annotation.
  array<LineBox> lines;
};

// A wrapper struct mirroring parts of the chrome_screen_ai.proto.
// Describes a single line of text: its words, its position in the original
// image, and the text block it belongs to.
struct LineBox {
  // Words in the text line.
  array<WordBox> words;

  // Text line in UTF8 format.
  string text_line;

  // Language guess for the line. The format is the ISO 639-1 two-letter
  // language code if that is defined (e.g. "en"), or else the ISO 639-2
  // three-letter code if that is defined, or else a Google-specific code.
  string language;

  // ID of the text block that this line belongs to.
  int32 block_id;

  // Index of this line within the block identified by `block_id`.
  int32 order_within_block;

  // Line bounding box relative to the original image.
  gfx.mojom.Rect bounding_box;

  // Rotation angle (in degrees, clockwise) of the line bounding box about its
  // top-left corner.
  float bounding_box_angle;

  // Line bounding box relative to the original image with bottom edge
  // representing estimated baseline of text.
  gfx.mojom.Rect baseline_box;

  // Rotation angle (in degrees, clockwise) of the line baseline box about its
  // top-left corner.
  float baseline_box_angle;

  // Confidence as computed by the OCR engine. The value is in range [0, 1].
  float confidence;
};

// A wrapper struct mirroring parts of the chrome_screen_ai.proto.
// Describes a single word within a `LineBox`.
struct WordBox {
  // A single word in UTF8 format.
  string word;

  // True if the word passes the internal beamsearch dictionary check.
  bool dictionary_word;

  // Language guess for the word. The format is the ISO 639-1 two-letter
  // language code if that is defined (e.g. "en"), or else the ISO 639-2
  // three-letter code if that is defined, or else a Google-specific code.
  string language;

  // True if this word is separated from the next word by a space.
  bool has_space_after;

  // Word bounding box relative to the original image.
  gfx.mojom.Rect bounding_box;

  // Rotation angle (in degrees, clockwise) of the word bounding box about its
  // top-left corner.
  float bounding_box_angle;

  // The direction of the script contained in the word.
  Direction direction;

  // Confidence as computed by the OCR engine. The value is in range [0, 1].
  float confidence;
};

// Main interface a client uses for visual annotation functions of the Screen AI
// service. Receivers for this interface are handed to the service through
// `OCRService.BindAnnotator`.
[RuntimeFeature=ax.mojom.features.kScreenAIOCREnabled]
interface ScreenAIAnnotator {
  // Receives an image, such as a screenshot or a page from a PDF file, and
  // asks the Screen AI library to perform OCR on it. Returns an AXTreeUpdate
  // with nodes built from OCR results.
  // The returned AXTreeUpdate is not a properly serialized update and is only
  // a container for the root id of a subtree and nodes built from OCR results.
  PerformOcrAndReturnAXTreeUpdate(skia.mojom.BitmapN32 image) =>
    (ax.mojom.AXTreeUpdate update);

  // Performs OCR on an image. Returns `VisualAnnotation` that mirrors parts
  // of the underlying proto.
  PerformOcrAndReturnAnnotation(skia.mojom.BitmapN32 image) =>
    (VisualAnnotation visual_annotation);

  // Sets OCR client type for metrics.
  SetClientType(OcrClientType client_type);
};

// Main interface a client uses for the Main Content Extraction function of the
// Screen AI service. Each RenderFrameImpl can have one AXTreeDistiller which
// contains a Screen2xMainContentExtractor.
// All interfaces of one browser profile use one ScreenAIService.
interface Screen2xMainContentExtractor {
  // Receives the accessibility tree as a snapshot, schedules processing, and
  // returns the main content of the given tree as a list of node ids.
  // `ukm_source_id` is a ukm::SourceId which is used to tie the UKM event to
  // the main frame URL for metrics collection.
  ExtractMainContent(ax.mojom.AXTreeUpdate snapshot, int64 ukm_source_id) =>
    (array<int32> content_node_ids);

  // Receives the accessibility tree as a snapshot, schedules processing, and
  // returns the main node id of the given tree.
  ExtractMainNode(ax.mojom.AXTreeUpdate snapshot) => (int32 main_node_id);
};

// Provides an interface to the OCR functionality of the Screen AI service.
// This interface gets bound only after Screen AI service loads its library and
// initializes it for OCR.
// OCR service can receive multiple annotator pipelines and provide results for
// them separately.
[RuntimeFeature=ax.mojom.features.kScreenAIOCREnabled]
interface OCRService {
  // Binds a new annotator to the service.
  // `annotator` is deliberately non-nullable: a bind request without a
  // receiver has no meaning, so an invalid handle is rejected by Mojo message
  // validation instead of reaching the implementation.
  BindAnnotator(pending_receiver<ScreenAIAnnotator> annotator);
};

// Provides an interface to the Main Content Extraction functionalities of the
// Screen AI service.
// This interface gets bound only after Screen AI service loads its library and
// initializes for Main Content Extraction.
// Main Content Extraction service can receive multiple main content extractor
// pipelines and provide results for them separately.
[RuntimeFeature=ax.mojom.features.kScreenAIMainContentExtractionEnabled]
interface MainContentExtractionService {
  // Binds a new main content extractor to the service.
  // `main_content_extractor` is deliberately non-nullable: a bind request
  // without a receiver has no meaning, so an invalid handle is rejected by
  // Mojo message validation instead of reaching the implementation.
  BindMainContentExtractor(
      pending_receiver<Screen2xMainContentExtractor> main_content_extractor);
};