chromium/extensions/common/api/media_perception_private.idl

// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Private API for receiving real-time media perception information.
[platforms=("chromeos")]
namespace mediaPerceptionPrivate {
  enum Status {
    // The media analytics process is waiting to be launched.
    UNINITIALIZED,

    // The analytics process is running and the media processing pipeline is
    // started, but it is not yet receiving image frames. This is a
    // transitional state between <code>SUSPENDED</code> and
    // <code>RUNNING</code> for the time it takes to warm up the media
    // processing pipeline, which can take anywhere from a few seconds to a
    // minute.
    // Note: <code>STARTED</code> is the initial reply to a setState
    // <code>RUNNING</code> call.
    STARTED,

    // The analytics process is running and the media processing pipeline is
    // ingesting image frames. At this point, MediaPerception signals should
    // be coming over D-Bus.
    RUNNING,

    // The analytics process is running and the media processing pipeline is
    // ready to be set to state <code>RUNNING</code>. The D-Bus communications
    // are enabled but the media processing pipeline is suspended.
    SUSPENDED,

    // Restarts the media analytics process via Upstart.
    // Calling setState <code>RESTARTING</code> restarts the media analytics
    // process and returns it to the <code>SUSPENDED</code> state. The app has
    // to set the state to <code>RUNNING</code> in order to start receiving
    // media perception information again.
    RESTARTING,

    // Stops the media analytics process via Upstart.
    STOPPED,

    // Indicates that a ServiceError has occurred.
    SERVICE_ERROR
  };
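
  // A non-normative usage sketch (extension-side JavaScript): drive the
  // pipeline to <code>RUNNING</code> and poll until the warm-up completes.
  // The 1-second polling interval is an arbitrary choice, not part of the
  // API contract.
  //
  //   chrome.mediaPerceptionPrivate.setState(
  //       {status: 'RUNNING'}, (state) => {
  //         // The initial reply is typically STARTED while the pipeline
  //         // warms up; poll until the status reaches RUNNING.
  //         const poll = setInterval(() => {
  //           chrome.mediaPerceptionPrivate.getState((s) => {
  //             if (s.status === 'RUNNING') clearInterval(poll);
  //           });
  //         }, 1000);
  //       });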

  enum ServiceError {
    // The media analytics process could not be reached. This is likely due to
    // a faulty communications configuration or a crash of the process.
    SERVICE_UNREACHABLE,

    // The media analytics process is not running. The media perception API
    // knows that the process has not yet been started.
    SERVICE_NOT_RUNNING,

    // The media analytics process is busy launching. Wait for the setState
    // <code>RUNNING</code> or setState <code>RESTARTING</code> callback.
    SERVICE_BUSY_LAUNCHING,

    // The component is not installed properly.
    SERVICE_NOT_INSTALLED,

    // Failed to establish a Mojo connection to the service.
    MOJO_CONNECTION_FAILURE
  };
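
  // Errors surface through the <code>serviceError</code> field of the reply
  // dictionaries rather than through exceptions. A minimal, non-normative
  // sketch:
  //
  //   chrome.mediaPerceptionPrivate.getState((state) => {
  //     if (state.status === 'SERVICE_ERROR') {
  //       console.warn('Media analytics error:', state.serviceError);
  //     }
  //   });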

  enum Feature {
    AUTOZOOM,
    HOTWORD_DETECTION,
    OCCUPANCY_DETECTION,
    EDGE_EMBEDDINGS,
    SOFTWARE_CROPPING
  };

  dictionary NamedTemplateArgument {
    DOMString? name;
    (DOMString or double)? value;
  };

  enum ComponentType {
    // The smaller component with limited functionality (smaller size and
    // limited models).
    LIGHT,
    // The fully-featured component with more functionality (larger size and
    // more models).
    FULL
  };

  // The status of the media analytics process component on the device.
  enum ComponentStatus {
    UNKNOWN,
    // The component is successfully installed and the image is mounted.
    INSTALLED,
    // The component failed to download, install or load.
    FAILED_TO_INSTALL
  };

  // Error code associated with a failure to install the media analytics
  // component.
  enum ComponentInstallationError {
    // Component requested does not exist.
    UNKNOWN_COMPONENT,

    // The update engine failed to install the component.
    INSTALL_FAILURE,

    // The component could not be mounted.
    MOUNT_FAILURE,

    // The component is not compatible with the device.
    COMPATIBILITY_CHECK_FAILED,

    // The component was not found; this is reported for load requests made
    // with the kSkip update policy.
    NOT_FOUND
  };

  // A specification of the media analytics component to install and load.
  dictionary Component {
    ComponentType type;
  };

  // The state of the media analytics downloadable component.
  dictionary ComponentState {
    ComponentStatus status;

    // The version string for the current component.
    DOMString? version;

    // If the component installation failed, the encountered installation
    // error. Not set if the component installation succeeded.
    ComponentInstallationError? installationErrorCode;
  };
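
  // A non-normative sketch: request the full component and check the
  // resulting ComponentState for an installation error.
  //
  //   chrome.mediaPerceptionPrivate.setAnalyticsComponent(
  //       {type: 'FULL'}, (componentState) => {
  //         if (componentState.status === 'FAILED_TO_INSTALL') {
  //           console.warn('Install failed:',
  //                        componentState.installationErrorCode);
  //         }
  //       });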

  // ------------------- Start of process management definitions. ------------
  // New interface for managing the process state of the media perception
  // service with the intention of eventually phasing out the setState() call.
  enum ProcessStatus {
    // The component process state is unknown, for example, if the process is
    // waiting to be launched. This is the initial state before
    // $(ref:setComponentProcessState) is first called.
    UNKNOWN,

    // The component process has been started.
    // This value can only be passed to $(ref:setComponentProcessState) if the
    // process is currently in state <code>STOPPED</code> or
    // <code>UNKNOWN</code>.
    STARTED,

    // The component process has been stopped.
    // This value can only be passed to $(ref:setComponentProcessState) if the
    // process is currently in state <code>STARTED</code>.
    // Note: the process is automatically stopped when the Chrome process
    // is closed.
    STOPPED,

    // Indicates that a ServiceError has occurred.
    SERVICE_ERROR
  };

  dictionary ProcessState {
    ProcessStatus? status;

    // Return parameter for $(ref:setComponentProcessState) that
    // specifies the error type for failure cases.
    ServiceError? serviceError;
  };
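
  // A non-normative sketch: start the component process once the component
  // is installed, and report any service error.
  //
  //   chrome.mediaPerceptionPrivate.setComponentProcessState(
  //       {status: 'STARTED'}, (processState) => {
  //         if (processState.status === 'SERVICE_ERROR') {
  //           console.warn('Failed to start:', processState.serviceError);
  //         }
  //       });
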
  // ------------------- End of process management definitions. --------------

  // The parameters for processing a particular video stream.
  dictionary VideoStreamParam {
    // Identifies the video stream described by these parameters.
    DOMString? id;

    // Frame width in pixels.
    long? width;

    // Frame height in pixels.
    long? height;

    // The frame rate at which this video stream would be processed.
    long? frameRate;
  };

  dictionary Point {
    // The horizontal distance from the top left corner of the image.
    double? x;

    // The vertical distance from the top left corner of the image.
    double? y;
  };

  // The parameters for a whiteboard in the image frame. Corners are given in
  // pixel coordinates normalized to the size of the image frame (i.e., in the
  // range [(0.0, 0.0), (1.0, 1.0)]). The aspectRatio is the physical aspect
  // ratio of the whiteboard (e.g., for a whiteboard 1m high and 2m wide, the
  // aspect ratio would be 2).
  dictionary Whiteboard {
    // The top left corner of the whiteboard in the image frame.
    Point? topLeft;

    // The top right corner of the whiteboard in the image frame.
    Point? topRight;

    // The bottom left corner of the whiteboard in the image frame.
    Point? bottomLeft;

    // The bottom right corner of the whiteboard in the image frame.
    Point? bottomRight;

    // The physical aspect ratio of the whiteboard.
    double? aspectRatio;
  };

  // The system and configuration state of the analytics process.
  dictionary State {
    Status status;

    // Optional $(ref:setState) parameter. Specifies the video device the media
    // analytics process should open while the media processing pipeline is
    // starting. To set this parameter, status has to be <code>RUNNING</code>.
    DOMString? deviceContext;

    // Return parameter for $(ref:setState) or $(ref:getState) that
    // specifies the error type for failure cases.
    ServiceError? serviceError;

    // A list of video streams processed by the analytics process. To set this
    // parameter, status has to be <code>RUNNING</code>.
    VideoStreamParam[]? videoStreamParam;

    // Media analytics configuration. It can only be used when setting state
    // to <code>RUNNING</code>.
    DOMString? configuration;

    // Corners and aspect ratio of the whiteboard in the image frame. Should
    // only be set when setting state to <code>RUNNING</code> and configuration
    // to whiteboard.
    Whiteboard? whiteboard;

    // A list of enabled media perception features.
    Feature[]? features;

    // A list of named parameters to be substituted at start-up. Will
    // only have effect when setting state to <code>RUNNING</code>.
    NamedTemplateArgument[]? namedTemplateArguments;
  };
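
  // A non-normative sketch of a setState() call that starts the pipeline
  // with one video stream and a whiteboard region. The stream id and the
  // concrete parameter values below are illustrative placeholders.
  //
  //   chrome.mediaPerceptionPrivate.setState({
  //     status: 'RUNNING',
  //     configuration: 'whiteboard',
  //     videoStreamParam: [
  //       {id: 'camera0', width: 1280, height: 720, frameRate: 30}
  //     ],
  //     whiteboard: {
  //       topLeft: {x: 0.1, y: 0.1}, topRight: {x: 0.9, y: 0.1},
  //       bottomLeft: {x: 0.1, y: 0.8}, bottomRight: {x: 0.9, y: 0.8},
  //       aspectRatio: 2.0
  //     }
  //   }, (state) => console.log('New status:', state.status));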

  dictionary BoundingBox {
    // Specifies whether the points are normalized to the size of the image.
    boolean? normalized;

    // The two points that define the corners of a bounding box.
    Point? topLeft;
    Point? bottomRight;
  };

  enum DistanceUnits {
    UNSPECIFIED,
    METERS,
    PIXELS
  };

  // Generic dictionary to encapsulate a distance magnitude and units.
  dictionary Distance {
    // This field provides flexibility to report depths or distances of
    // different entity types with different units.
    DistanceUnits? units;

    double? magnitude;
  };

  enum EntityType {
    UNSPECIFIED,
    FACE,
    PERSON,
    MOTION_REGION,
    LABELED_REGION
  };

  enum FramePerceptionType {
    UNKNOWN_TYPE,
    FACE_DETECTION,
    PERSON_DETECTION,
    MOTION_DETECTION
  };

  dictionary Entity {
    // A unique id associated with the detected entity, which can be used to
    // track the entity over time.
    long? id;

    EntityType? type;

    // Label for this entity.
    DOMString? entityLabel;

    // Minimum box which captures entire detected entity.
    BoundingBox? boundingBox;

    // A value for the quality of this detection.
    double? confidence;

    // The estimated depth of the entity from the camera.
    Distance? depth;
  };

  dictionary PacketLatency {
    // Label for this packet.
    DOMString? packetLabel;

    // Packet processing latency in microseconds.
    long? latencyUsec;
  };

  // Type of lighting conditions.
  enum LightCondition {
    UNSPECIFIED,

    // No noticeable change occurred.
    NO_CHANGE,

    // Light was switched on in the room.
    TURNED_ON,

    // Light was switched off in the room.
    TURNED_OFF,

    // Light gradually got dimmer (for example, due to a sunset).
    DIMMER,

    // Light gradually got brighter (for example, due to a sunrise).
    BRIGHTER,

    // Black frame was detected - the current frame contains only noise.
    BLACK_FRAME
  };

  // Detection of human presence close to the camera.
  dictionary VideoHumanPresenceDetection {
    // Indicates a probability in the [0, 1] interval that a human is present
    // in the video frame.
    double? humanPresenceLikelihood;

    // Indicates a probability in the [0, 1] interval that motion has been
    // detected in the video frame.
    double? motionDetectedLikelihood;

    // Indicates lighting condition in the video frame.
    LightCondition? lightCondition;

    // Indicates a probability in the [0, 1] interval that the
    // <code>lightCondition</code> value is correct.
  };

  // The set of computer vision metadata for an image frame.
  dictionary FramePerception {
    long? frameId;

    long? frameWidthInPx;
    long? frameHeightInPx;

    // The timestamp associated with the frame (when it is received by the
    // analytics process).
    double? timestamp;

    // The list of entities detected in this frame.
    Entity[]? entities;

    // Processing latency for a list of packets.
    PacketLatency[]? packetLatency;

    // Human presence detection results for a video frame.
    VideoHumanPresenceDetection? videoHumanPresenceDetection;

    // Indicates what types of frame perception were run.
    FramePerceptionType[]? framePerceptionTypes;
  };
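
  // A non-normative helper sketch: collect person detections above a
  // caller-chosen confidence threshold from a FramePerception.
  //
  //   function personsInFrame(framePerception, minConfidence) {
  //     return (framePerception.entities || []).filter(
  //         (e) => e.type === 'PERSON' &&
  //                (e.confidence || 0) >= minConfidence);
  //   }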

  // An estimate of the direction that the sound is coming from.
  dictionary AudioLocalization {
    // An angle in radians in the horizontal plane. It roughly points to the
    // peak in the probability distribution of azimuth defined below.
    double? azimuthRadians;

    // A probability distribution for the current snapshot in time that shows
    // the likelihood of a sound source being at a particular azimuth. For
    // example, <code>azimuthScores = [0.1, 0.2, 0.3, 0.4]</code> means that
    // the probability that the sound is coming from an azimuth of 0, pi/2, pi,
    // 3*pi/2 is 0.1, 0.2, 0.3 and 0.4, respectively.
    double[]? azimuthScores;
  };
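
  // Since the scores partition the full circle evenly, index <code>i</code>
  // corresponds to an azimuth of <code>i * 2 * pi / N</code> radians for
  // <code>N</code> scores. A non-normative sketch that recovers the most
  // likely direction:
  //
  //   function likelyAzimuth(audioLocalization) {
  //     const scores = audioLocalization.azimuthScores || [];
  //     if (scores.length === 0) return audioLocalization.azimuthRadians;
  //     const best = scores.indexOf(Math.max(...scores));
  //     return best * 2 * Math.PI / scores.length;
  //   }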

  // Spectrogram of an audio frame.
  dictionary AudioSpectrogram {
    double[]? values;
  };

  // Detection of human presence close to the microphone.
  dictionary AudioHumanPresenceDetection {
    // Indicates a probability in the [0, 1] interval that a human has caused
    // a sound close to the microphone.
    double? humanPresenceLikelihood;

    // Estimate of the noise spectrogram.
    AudioSpectrogram? noiseSpectrogram;

    // Spectrogram of an audio frame.
    AudioSpectrogram? frameSpectrogram;
  };

  enum HotwordType {
    UNKNOWN_TYPE,
    OK_GOOGLE
  };

  // A hotword detected in the audio stream.
  dictionary Hotword {
    // Unique identifier for the hotword instance. Note that a single hotword
    // instance can span more than one audio frame. In that case a single
    // hotword instance can be reported in multiple Hotword or HotwordDetection
    // results. Hotword results associated with the same hotword instance will
    // have the same <code>id</code>.
    long? id;

    // Indicates the type of this hotword.
    HotwordType? type;

    // Id of the audio frame in which the hotword was detected.
    long? frameId;

    // Indicates the start time of this hotword in the audio frame.
    long? startTimestampMs;

    // Indicates the end time of this hotword in the audio frame.
    long? endTimestampMs;

    // Indicates a probability in the [0, 1] interval that this hotword is
    // present in the audio frame.
    double? confidence;
  };

  // Detection of hotword in the audio stream.
  dictionary HotwordDetection {
    Hotword[]? hotwords;
  };
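
  // Because a single hotword instance can span multiple audio frames,
  // consumers typically de-duplicate results by <code>id</code>. A
  // non-normative sketch:
  //
  //   const seenHotwordIds = new Set();
  //   function newHotwords(hotwordDetection) {
  //     return (hotwordDetection.hotwords || []).filter((h) => {
  //       if (seenHotwordIds.has(h.id)) return false;
  //       seenHotwordIds.add(h.id);
  //       return true;
  //     });
  //   }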

  // Audio perception results for an audio frame.
  dictionary AudioPerception {
    // A timestamp in microseconds attached when this message was generated.
    double? timestampUs;

    // Audio localization results for an audio frame.
    AudioLocalization? audioLocalization;

    // Audio human presence detection results for an audio frame.
    AudioHumanPresenceDetection? audioHumanPresenceDetection;

    // Hotword detection results.
    HotwordDetection? hotwordDetection;
  };

  // Detection of human presence based on both audio and video inputs.
  dictionary AudioVisualHumanPresenceDetection {
    // Indicates a probability in the [0, 1] interval that a human is present.
    double? humanPresenceLikelihood;
  };

  // Perception results based on both audio and video inputs.
  dictionary AudioVisualPerception {
    // A timestamp in microseconds attached when this message was generated.
    double? timestampUs;

    // Human presence detection results.
    AudioVisualHumanPresenceDetection? audioVisualHumanPresenceDetection;
  };

  // Stores metadata such as version of media perception features.
  dictionary Metadata {
    DOMString? visualExperienceControllerVersion;
  };

  dictionary MediaPerception {
    // The time at which the media perception data was emitted by the media
    // processing pipeline. This value will be greater than the timestamp
    // stored within the FramePerception dictionary, and the difference
    // between them can be viewed as the processing time for a single frame.
    double? timestamp;

    // An array of framePerceptions.
    FramePerception[]? framePerceptions;

    // An array of audio perceptions.
    AudioPerception[]? audioPerceptions;

    // An array of audio-visual perceptions.
    AudioVisualPerception[]? audioVisualPerceptions;

    // Stores metadata such as version of media perception features.
    Metadata? metadata;
  };
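
  // Per the timestamp comment above, per-frame processing time can be
  // estimated as the difference between the MediaPerception timestamp and
  // each FramePerception timestamp, assuming both values are present and use
  // the same clock and units. A non-normative sketch:
  //
  //   function processingTimes(mediaPerception) {
  //     return (mediaPerception.framePerceptions || [])
  //         .filter((fp) => fp.timestamp !== undefined)
  //         .map((fp) => mediaPerception.timestamp - fp.timestamp);
  //   }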

  enum ImageFormat {
    // Image represented by RGB data channels.
    RAW,
    PNG,
    JPEG
  };

  dictionary ImageFrame {
    long? width;
    long? height;

    ImageFormat? format;

    long? dataLength;

    // The bytes of the image frame.
    ArrayBuffer? frame;
  };
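
  // A non-normative sketch: expose a PNG- or JPEG-encoded ImageFrame as an
  // object URL for display. <code>RAW</code> frames carry bare RGB channel
  // data and would need per-pixel handling instead.
  //
  //   function imageFrameToUrl(imageFrame) {
  //     if (imageFrame.format !== 'PNG' && imageFrame.format !== 'JPEG')
  //       return null;
  //     const mime =
  //         imageFrame.format === 'PNG' ? 'image/png' : 'image/jpeg';
  //     return URL.createObjectURL(
  //         new Blob([imageFrame.frame], {type: mime}));
  //   }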

  dictionary PerceptionSample {
    // The video analytics FramePerception for the associated image frame
    // data.
    FramePerception? framePerception;

    // The image frame data for the associated FramePerception object.
    ImageFrame? imageFrame;

    // The audio perception results for an audio frame.
    AudioPerception? audioPerception;

    // Perception results based on both audio and video inputs.
    AudioVisualPerception? audioVisualPerception;

    // Stores metadata such as version of media perception features.
    Metadata? metadata;
  };

  dictionary Diagnostics {
    // Return parameter for $(ref:getDiagnostics) that specifies the error
    // type for failure cases.
    ServiceError? serviceError;

    // A buffer of image frames and the associated video analytics information
    // that can be used to diagnose a malfunction.
    PerceptionSample[]? perceptionSamples;
  };

  callback StateCallback = void(State state);

  callback DiagnosticsCallback = void(Diagnostics diagnostics);

  callback ComponentStateCallback = void(ComponentState componentState);

  callback ProcessStateCallback = void(ProcessState processState);

  interface Functions {
    // Gets the state of the media analytics process.
    // |callback| : The current state of the system.
    static void getState(StateCallback callback);

    // Sets the desired state of the system.
    // |state| : A dictionary with the desired new state. The only settable
    // states are <code>RUNNING</code>, <code>SUSPENDED</code>, and
    // <code>RESTARTING</code>.
    // |callback| : Invoked with the State of the system after setting it. Can
    // be used to verify the state was set as desired.
    static void setState(
        State state,
        StateCallback callback);

    // Gets a diagnostics buffer out of the media analytics process.
    // |callback| : Returns a Diagnostics dictionary object.
    static void getDiagnostics(DiagnosticsCallback callback);

    // Attempts to download and load the media analytics component. This
    // function should be called every time a client starts using this API. If
    // the component is already loaded, the callback will simply return that
    // information. The process must be <code>STOPPED</code> for this function
    // to succeed.
    // Note: If a different component type is desired, this function can
    // be called with the new desired type and the new component will be
    // downloaded and installed.
    // |component| : The desired component to install and load.
    // |callback| : Returns the state of the component.
    static void setAnalyticsComponent(
        Component component,
        ComponentStateCallback callback);

    // Manages the lifetime of the component process. This function should
    // only be used if the component is installed. It will fail if the
    // component is not installed.
    // |processState| : The desired state for the component process.
    // |callback| : Reports the new state of the process, which is expected to
    // be the same as the desired state, unless something goes wrong.
    static void setComponentProcessState(
        ProcessState processState,
        ProcessStateCallback callback);
  };
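
  // Putting the calls together, a typical start-up sequence installs the
  // component, starts the component process, and then sets the analytics
  // state to <code>RUNNING</code>. A non-normative sketch:
  //
  //   chrome.mediaPerceptionPrivate.setAnalyticsComponent(
  //       {type: 'LIGHT'}, (componentState) => {
  //         if (componentState.status !== 'INSTALLED') return;
  //         chrome.mediaPerceptionPrivate.setComponentProcessState(
  //             {status: 'STARTED'}, (processState) => {
  //               if (processState.status !== 'STARTED') return;
  //               chrome.mediaPerceptionPrivate.setState(
  //                   {status: 'RUNNING'},
  //                   (state) => console.log('Status:', state.status));
  //             });
  //       });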

  interface Events {
    // Fired when media perception information is received from the media
    // analytics process.
    // |mediaPerception| : The dictionary which contains a dump of everything
    // the analytics process has detected or determined from the incoming media
    // streams.
    static void onMediaPerception(MediaPerception mediaPerception);
  };
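
  // A non-normative listener sketch:
  //
  //   chrome.mediaPerceptionPrivate.onMediaPerception.addListener(
  //       (mediaPerception) => {
  //         (mediaPerception.framePerceptions || []).forEach((fp) => {
  //           console.log('Frame', fp.frameId, 'entities:',
  //                       (fp.entities || []).length);
  //         });
  //       });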
};