chromium/chromeos/services/machine_learning/public/mojom/text_classifier.mojom

// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// API of the text classifier (libtextclassifier).

// NOTE: This mojom exists in two places and must be kept in sync:
//       Chromium:  //chromeos/services/machine_learning/public/mojom/
//       Chrome OS: src/platform2/ml/mojom/
//       Note: Other repos downstream of Chromium might also use this mojom.
// Example: A backwards-compatible mojom change (and corresponding
// implementation change) can be made in Chrome OS first, then replicated to the
// clients (Chromium, other downstream repos) later.
// Use //chromeos/services/machine_learning/public/mojom/roll_mojoms.sh to help
// replicate Chrome OS-side changes over to Chromium.

module chromeos.machine_learning.mojom;

// NOTE: The base directory for 'import' statements is expected to differ
//       between Chromium and Chrome OS versions of this file.
//       For "time.mojom", the Chromium side must use the version under the
//       mojo folder, that is, "mojo/public/mojom/base/time.mojom".
import "mojo/public/mojom/base/time.mojom";

// Selects the use case that annotation results are tuned for.
// Must be consistent with `AnnotationUsecase` in model.fb in libtextclassifier.
// The enum is [Stable], so the numeric values below must not be renumbered or
// reused; new use cases should be appended with fresh values.
[Stable, Extensible]
enum AnnotationUsecase {
  // Results are optimized for Smart{Select,Share,Linkify}.
  ANNOTATION_USECASE_SMART = 0,
  // Results are optimized for using TextClassifier as an infrastructure that
  // annotates as much as possible.
  ANNOTATION_USECASE_RAW = 1,
};

// Stores data extracted from each text entity.
// Currently, only for the "number" type, the real number value is stored in
// `numeric_value`. For all other types, the annotated substring is stored in
// `string_value`.
// The values come from the class `ClassificationResult` of tclib. (See
// "tclib/annotator/types.h").
// TODO(honglinyu): add data extraction for more types when needed and
// available. For example, when "date" data is needed, probably we should add
// a new struct "Date" and add a new member "Date date_value" to the following
// union.
[Stable]
union TextEntityData {
  // A numeric value.
  //  - For "number", it is the parsed value,
  //    e.g. 34.3 for the input string "34.3".
  double numeric_value@0;
  // The annotated substring, for entity types such as "url", "address" etc.
  string string_value@1;
};

// One possible entity type for a piece of text (can be phone numbers,
// addresses, emails and urls etc.).
// This struct is a distillation of the `ClassificationResult` of tclib. (See
// "tclib/annotator/types.h").
[Stable]
struct TextEntity {
  // The name of the type (e.g. "phone", "address", "email" and "url" etc.).
  string name@0;
  // The confidence score of the entity annotation, in the range 0-1.
  float confidence_score@1;
  // Additional data extracted from the text; see `TextEntityData` for which
  // union member is populated for which entity type.
  TextEntityData data@2;
};

// A substring of the annotated text and possible associated entities.
// This struct is a simplification of the `AnnotatedSpan` class of tclib. (See
// "tclib/annotator/types.h").
// NOTE(review): the unit of the offsets (presumably codepoints, as in
// `CodepointSpan`) is not stated here -- TODO confirm against tclib.
[Stable]
struct TextAnnotation {
  // The offset of the first character of the annotation.
  uint32 start_offset@0;
  // The offset of the last character of the annotation.
  // NOTE(review): confirm whether this is inclusive or one-past-the-end.
  uint32 end_offset@1;
  // The set of entity types associated with the substring.
  array<TextEntity> entities@2;
};

// Contains the input and parameters used to annotate the text.
// This is a combination of string and `AnnotationOptions` in tclib (see
// "tclib/annotator/types.h").
// Only `text` is required; every other field is either nullable or has a
// default value.
// Next MinVersion: 2
[Stable]
struct TextAnnotationRequest {
  // The text to be annotated.
  string text@0;
  // Comma-delimited locales (e.g., "en", "en,es").
  string? default_locales@1;
  // Comma-separated list of language tags.
  string? detected_text_language_tags@2;
  // Tailors the output annotations according to the specified use-case.
  // Defaults to ANNOTATION_USECASE_SMART.
  AnnotationUsecase annotation_usecase@3 = ANNOTATION_USECASE_SMART;
  // For parsing relative datetimes, the reference "now" time against which the
  // relative datetimes get resolved.
  mojo_base.mojom.Time? reference_time@4;
  // Timezone in which the input text was written (format as accepted by ICU).
  // If empty (default), will use the system's timezone.
  string? reference_timezone@5;
  // Enabled entities. If empty (default), all types of entities will be
  // enabled.
  array<string>? enabled_entities@6;
  // To enable text annotations for simple words. Added in version 1; defaults
  // to false.
  [MinVersion=1] bool trigger_dictionary_on_beginner_words@7 = false;
};

// Marks a span in a sequence of codepoints.
// This struct is consistent with the type `CodepointSpan` of tclib. (See
// "tclib/annotator/types.h").
// Used in this file by the deprecated selection-suggestion request and
// response.
[Stable]
struct CodepointSpan {
  // The offset of the first character of the span.
  uint32 start_offset@0;
  // The offset of the last character of the span.
  // NOTE(review): confirm whether this is inclusive or one-past-the-end.
  uint32 end_offset@1;
};

// Represents a single language detection result, as returned by
// `TextClassifier.FindLanguages`.
// Field ordinals are spelled out explicitly, like in every other [Stable]
// struct in this file, so that inserting or reordering fields later cannot
// silently change the wire layout. The explicit values match the ordinals
// that were previously assigned implicitly by declaration order.
[Stable]
struct TextLanguage {
  // The BCP-47 language code like "en", "fr", "zh" etc.
  string locale@0;
  // The confidence score of the language detected (range: 0~1).
  float confidence@1;
};

// Contains the input and parameters used to suggest selection.
// This is a combination of the inputs of the `SuggestSelection` function
// of tclib. (See "tclib/annotator/annotate.h").
// This struct is only referenced by the deprecated `REMOVED_1` method of
// `TextClassifier`. The `RenamedFrom` attribute records the name it had
// before the `REMOVED_` prefix was added.
[Stable, RenamedFrom="chromeos.machine_learning.mojom.TextSuggestSelectionRequest"]
struct REMOVED_TextSuggestSelectionRequest {
  // The candidate text.
  string text@0;
  // The span the user selected.
  CodepointSpan user_selection@1;
  // Comma-delimited locales (e.g., "en", "en,es").
  string? default_locales@2;
  // Comma-separated list of BCP 47 language tags.
  string? detected_text_language_tags@3;
  // Tailors the output annotations according to the specified use-case.
  // Defaults to ANNOTATION_USECASE_SMART.
  AnnotationUsecase annotation_usecase@4 = ANNOTATION_USECASE_SMART;
};

// Used to annotate entities within text strings and to detect the language
// of a text.
// Methods are identified by their explicit ordinals, not by declaration
// order: ordinal 1 is still occupied by the deprecated `REMOVED_1` method and
// must not be reused.
// Next ordinal: 3
[Stable]
interface TextClassifier {
  // Annotates a text string and returns the detected substrings and their
  // possible entities.
  Annotate@0(TextAnnotationRequest request) =>
      (array<TextAnnotation> outputs);
  // Identifies the languages the text is possibly written in.
  // The returned results are sorted according to the confidence score, from the
  // highest to the lowest.
  // The maximum number of results returned is determined internally.
  // Will return an empty array if the language cannot be determined.
  FindLanguages@2(string text) => (array<TextLanguage> outputs);
  // Deprecated `SuggestSelection`. Kept so that ordinal 1 stays reserved.
  REMOVED_1@1(REMOVED_TextSuggestSelectionRequest request) =>
      (CodepointSpan outputs);
};