// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Private API for receiving real-time media perception information.
[platforms=("chromeos")]
namespace mediaPerceptionPrivate {
// The status of the media analytics process, carried in the
// <code>status</code> member of State.
enum Status {
// The media analytics process is waiting to be launched.
UNINITIALIZED,
// The analytics process is running and the media processing pipeline is
// started, but it is not yet receiving image frames. This is a
// transitional state between <code>SUSPENDED</code> and
// <code>RUNNING</code> for the time it takes to warm up the media
// processing pipeline, which can take anywhere from a few seconds to a
// minute.
// Note: <code>STARTED</code> is the initial reply to setState
// <code>RUNNING</code>.
STARTED,
// The analytics process is running and the media processing pipeline is
// ingesting image frames. At this point, MediaPerception signals should
// be coming over D-Bus.
RUNNING,
// Analytics process is running and the media processing pipeline is ready
// to be set to state <code>RUNNING</code>. The D-Bus communications
// are enabled but the media processing pipeline is suspended.
SUSPENDED,
// Enum for restarting the media analytics process using Upstart.
// Calling setState <code>RESTARTING</code> will restart the media process
// to the <code>SUSPENDED</code> state. The app has to set the state to
// <code>RUNNING</code> in order to start receiving media perception
// information again.
RESTARTING,
// Stops the media analytics process via Upstart.
STOPPED,
// Indicates that a ServiceError has occurred.
SERVICE_ERROR
};
// The error types reported in the <code>serviceError</code> member of
// State, ProcessState and Diagnostics for failure cases.
enum ServiceError {
// The media analytics process could not be reached. This is likely due to
// a faulty comms configuration or that the process crashed.
SERVICE_UNREACHABLE,
// The media analytics process is not running. The MPP API knows that the
// process has not been started yet.
SERVICE_NOT_RUNNING,
// The media analytics process is busy launching. Wait for setState
// <code>RUNNING</code> or setState <code>RESTARTING</code> callback.
SERVICE_BUSY_LAUNCHING,
// The component is not installed properly.
SERVICE_NOT_INSTALLED,
// Failed to establish a Mojo connection to the service.
MOJO_CONNECTION_FAILURE
};
// The media perception features that can be listed in the
// <code>features</code> member of State.
enum Feature {
AUTOZOOM,
HOTWORD_DETECTION,
OCCUPANCY_DETECTION,
EDGE_EMBEDDINGS,
SOFTWARE_CROPPING
};
// A named parameter to be substituted at start-up, as listed in the
// <code>namedTemplateArguments</code> member of State.
dictionary NamedTemplateArgument {
// The name of the argument.
DOMString? name;
// The value of the argument, given either as a string or as a number.
(DOMString or double)? value;
};
// The type of media analytics component, carried in the <code>type</code>
// member of Component.
enum ComponentType {
// The smaller component with limited functionality (smaller size and
// limited models).
LIGHT,
// The fully-featured component with more functionality (larger size and
// more models).
FULL
};
// The status of the media analytics process component on the device.
// Carried in the <code>status</code> member of ComponentState.
enum ComponentStatus {
UNKNOWN,
// The component is successfully installed and the image is mounted.
INSTALLED,
// The component failed to download, install or load.
FAILED_TO_INSTALL
};
// Error code associated with a failure to install the media analytics
// component. Carried in the <code>installationErrorCode</code> member of
// ComponentState.
enum ComponentInstallationError {
// The requested component does not exist.
UNKNOWN_COMPONENT,
// The update engine failed to install the component.
INSTALL_FAILURE,
// The component cannot be mounted.
MOUNT_FAILURE,
// The component is not compatible with the device.
COMPATIBILITY_CHECK_FAILED,
// The component was not found - reported for load requests with kSkip
// update policy.
NOT_FOUND
};
// Identifies the media analytics component requested through
// $(ref:setAnalyticsComponent).
dictionary Component {
// The desired type of component to install and load.
ComponentType type;
};
// The state of the media analytics downloadable component, as reported to
// a ComponentStateCallback.
dictionary ComponentState {
// The status of the component on the device.
ComponentStatus status;
// The version string for the current component.
DOMString? version;
// If the component installation failed, the encountered installation
// error. Not set if the component installation succeeded.
ComponentInstallationError? installationErrorCode;
};
// ------------------- Start of process management definitions. ------------
// New interface for managing the process state of the media perception
// service with the intention of eventually phasing out the setState() call.
// The values below are carried in the <code>status</code> member of
// ProcessState.
enum ProcessStatus {
// The component process state is unknown, for example, if the process is
// waiting to be launched. This is the initial state before
// $(ref:setComponentProcessState) is first called.
UNKNOWN,
// The component process has been started.
// This value can only be passed to $(ref:setComponentProcessState) if the
// process is currently in state <code>STOPPED</code> or
// <code>UNKNOWN</code>.
STARTED,
// The component process has been stopped.
// This value can only be passed to $(ref:setComponentProcessState) if the
// process is currently in state <code>STARTED</code>.
// Note: the process is automatically stopped when the Chrome process
// is closed.
STOPPED,
// Indicates that a ServiceError has occurred.
SERVICE_ERROR
};
// The state of the component process, as passed to and reported by
// $(ref:setComponentProcessState).
dictionary ProcessState {
// The status of the component process.
ProcessStatus? status;
// Return parameter for $(ref:setComponentProcessState) that
// specifies the error type for failure cases.
ServiceError? serviceError;
};
// ------------------- End of process management definitions. --------------
// The parameters for processing a particular video stream. Listed in the
// <code>videoStreamParam</code> member of State.
dictionary VideoStreamParam {
// Identifies the video stream described by these parameters.
DOMString? id;
// Frame width in pixels.
long? width;
// Frame height in pixels.
long? height;
// The frame rate at which this video stream would be processed.
long? frameRate;
};
// A point in an image, measured from the top left corner of the image.
dictionary Point {
// The horizontal distance from the top left corner of the image.
double? x;
// The vertical distance from the top left corner of the image.
double? y;
};
// The parameters for a whiteboard in the image frame. Corners are given in
// pixel coordinates normalized to the size of the image frame (i.e. in the
// range [(0.0, 0.0), (1.0, 1.0)]). The aspectRatio is the physical aspect
// ratio of the whiteboard (e.g. for a 1m high and 2m wide whiteboard, the
// aspect ratio would be 2).
dictionary Whiteboard {
// The top left corner of the whiteboard in the image frame.
Point? topLeft;
// The top right corner of the whiteboard in the image frame.
Point? topRight;
// The bottom left corner of the whiteboard in the image frame.
Point? bottomLeft;
// The bottom right corner of the whiteboard in the image frame.
Point? bottomRight;
// The physical aspect ratio of the whiteboard.
double? aspectRatio;
};
// The system and configuration state of the analytics process.
dictionary State {
// The status of the media analytics process.
Status status;
// Optional $(ref:setState) parameter. Specifies the video device the media
// analytics process should open while the media processing pipeline is
// starting. To set this parameter, status has to be <code>RUNNING</code>.
DOMString? deviceContext;
// Return parameter for $(ref:setState) or $(ref:getState) that
// specifies the error type for failure cases.
ServiceError? serviceError;
// A list of video streams processed by the analytics process. To set this
// parameter, status has to be <code>RUNNING</code>.
VideoStreamParam[]? videoStreamParam;
// Media analytics configuration. It can only be used when setting state to
// <code>RUNNING</code>.
DOMString? configuration;
// Corners and aspect ratio of the whiteboard in the image frame. Should
// only be set when setting state to <code>RUNNING</code> and configuration
// to whiteboard.
Whiteboard? whiteboard;
// A list of enabled media perception features.
Feature[]? features;
// A list of named parameters to be substituted at start-up. Will
// only have effect when setting state to <code>RUNNING</code>.
NamedTemplateArgument[]? namedTemplateArguments;
};
// A box in the image, defined by its top left and bottom right corners.
dictionary BoundingBox {
// Specifies whether the points are normalized to the size of the image.
boolean? normalized;
// The top left corner of the bounding box.
Point? topLeft;
// The bottom right corner of the bounding box.
Point? bottomRight;
};
// The units of a Distance, carried in its <code>units</code> member.
enum DistanceUnits {
UNSPECIFIED,
METERS,
PIXELS
};
// Generic dictionary to encapsulate a distance magnitude and units.
dictionary Distance {
// This field provides flexibility to report depths or distances of
// different entity types with different units.
DistanceUnits? units;
// The magnitude of the distance, to be read together with
// <code>units</code>.
double? magnitude;
};
// The type of a detected Entity, carried in its <code>type</code> member.
enum EntityType {
UNSPECIFIED,
FACE,
PERSON,
MOTION_REGION,
LABELED_REGION
};
// The types of frame perception that can be listed in the
// <code>framePerceptionTypes</code> member of FramePerception.
enum FramePerceptionType {
UNKNOWN_TYPE,
FACE_DETECTION,
PERSON_DETECTION,
MOTION_DETECTION
};
// An entity detected in an image frame. Listed in the
// <code>entities</code> member of FramePerception.
dictionary Entity {
// A unique id associated with the detected entity, which can be used to
// track the entity over time.
long? id;
// The type of the detected entity.
EntityType? type;
// Label for this entity.
DOMString? entityLabel;
// Minimum box which captures entire detected entity.
BoundingBox? boundingBox;
// A value for the quality of this detection.
double? confidence;
// The estimated depth of the entity from the camera.
Distance? depth;
};
// The processing latency of a labeled packet. Listed in the
// <code>packetLatency</code> member of FramePerception.
dictionary PacketLatency {
// Label for this packet.
DOMString? packetLabel;
// Packet processing latency in microseconds.
long? latencyUsec;
};
// Type of lighting conditions. Carried in the <code>lightCondition</code>
// member of VideoHumanPresenceDetection.
enum LightCondition {
UNSPECIFIED,
// No noticeable change occurred.
NO_CHANGE,
// Light was switched on in the room.
TURNED_ON,
// Light was switched off in the room.
TURNED_OFF,
// Light gradually got dimmer (for example, due to a sunset).
DIMMER,
// Light gradually got brighter (for example, due to a sunrise).
BRIGHTER,
// Black frame was detected - the current frame contains only noise.
BLACK_FRAME
};
// Detection of human presence close to the camera.
dictionary VideoHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human is present in
// the video frame.
double? humanPresenceLikelihood;
// Indicates a probability in [0, 1] that motion has been detected in the
// video frame.
double? motionDetectedLikelihood;
// Indicates lighting condition in the video frame.
LightCondition? lightCondition;
// Indicates a probability in [0, 1] interval that
// <code>lightCondition</code> value is correct.
double? lightConditionLikelihood;
};
// The set of computer vision metadata for an image frame.
dictionary FramePerception {
// The id of the image frame.
long? frameId;
// The width of the image frame in pixels.
long? frameWidthInPx;
// The height of the image frame in pixels.
long? frameHeightInPx;
// The timestamp associated with the frame (when it is received by the
// analytics process).
double? timestamp;
// The list of entities detected in this frame.
Entity[]? entities;
// Processing latency for a list of packets.
PacketLatency[]? packetLatency;
// Human presence detection results for a video frame.
VideoHumanPresenceDetection? videoHumanPresenceDetection;
// Indicates what types of frame perception were run.
FramePerceptionType[]? framePerceptionTypes;
};
// An estimate of the direction that the sound is coming from.
dictionary AudioLocalization {
// An angle in radians in the horizontal plane. It roughly points to the
// peak in the probability distribution of azimuth given by
// <code>azimuthScores</code>.
double? azimuthRadians;
// A probability distribution for the current snapshot in time that shows
// the likelihood of a sound source being at a particular azimuth. For
// example, <code>azimuthScores = [0.1, 0.2, 0.3, 0.4]</code> means that
// the probability that the sound is coming from an azimuth of 0, pi/2, pi,
// 3*pi/2 is 0.1, 0.2, 0.3 and 0.4, respectively.
double[]? azimuthScores;
};
// Spectrogram of an audio frame.
dictionary AudioSpectrogram {
// The values that make up the spectrogram.
double[]? values;
};
// Detection of human presence close to the microphone. Carried in the
// <code>audioHumanPresenceDetection</code> member of AudioPerception.
dictionary AudioHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human has caused a
// sound close to the microphone.
double? humanPresenceLikelihood;
// Estimate of the noise spectrogram.
AudioSpectrogram? noiseSpectrogram;
// Spectrogram of an audio frame.
AudioSpectrogram? frameSpectrogram;
};
// The type of a detected Hotword, carried in its <code>type</code> member.
enum HotwordType {
UNKNOWN_TYPE,
OK_GOOGLE
};
// A hotword detected in the audio stream. Listed in the
// <code>hotwords</code> member of HotwordDetection.
dictionary Hotword {
// Unique identifier for the hotword instance. Note that a single hotword
// instance can span more than one audio frame. In that case a single
// hotword instance can be reported in multiple Hotword or HotwordDetection
// results. Hotword results associated with the same hotword instance will
// have the same <code>id</code>.
long? id;
// Indicates the type of this hotword.
HotwordType? type;
// Id of the audio frame in which the hotword was detected.
long? frameId;
// Indicates the start time of this hotword in the audio frame.
long? startTimestampMs;
// Indicates the end time of this hotword in the audio frame.
long? endTimestampMs;
// Indicates a probability in [0, 1] interval that this hotword is present
// in the audio frame.
double? confidence;
};
// Detection of hotwords in the audio stream.
dictionary HotwordDetection {
// The list of detected hotwords.
Hotword[]? hotwords;
};
// Audio perception results for an audio frame. Carried in MediaPerception
// and PerceptionSample.
dictionary AudioPerception {
// A timestamp in microseconds attached when this message was generated.
double? timestampUs;
// Audio localization results for an audio frame.
AudioLocalization? audioLocalization;
// Audio human presence detection results for an audio frame.
AudioHumanPresenceDetection? audioHumanPresenceDetection;
// Hotword detection results.
HotwordDetection? hotwordDetection;
};
// Detection of human presence based on both audio and video inputs.
// Carried in the <code>audioVisualHumanPresenceDetection</code> member of
// AudioVisualPerception.
dictionary AudioVisualHumanPresenceDetection {
// Indicates a probability in [0, 1] interval that a human is present.
double? humanPresenceLikelihood;
};
// Perception results based on both audio and video inputs. Carried in
// MediaPerception and PerceptionSample.
dictionary AudioVisualPerception {
// A timestamp in microseconds attached when this message was generated.
double? timestampUs;
// Human presence detection results.
AudioVisualHumanPresenceDetection? audioVisualHumanPresenceDetection;
};
// Stores metadata such as version of media perception features.
dictionary Metadata {
// The version of the visual experience controller, as a string.
DOMString? visualExperienceControllerVersion;
};
// The media perception information delivered with the
// <code>onMediaPerception</code> event.
dictionary MediaPerception {
// The time the media perception data was emitted by the media processing
// pipeline. This value will be greater than the timestamp stored within
// the FramePerception dictionary and the difference between them can be
// viewed as the processing time for a single frame.
double? timestamp;
// An array of frame perceptions.
FramePerception[]? framePerceptions;
// An array of audio perceptions.
AudioPerception[]? audioPerceptions;
// An array of audio-visual perceptions.
AudioVisualPerception[]? audioVisualPerceptions;
// Stores metadata such as version of media perception features.
Metadata? metadata;
};
// The format of an ImageFrame, carried in its <code>format</code> member.
enum ImageFormat {
// Image represented by RGB data channels.
RAW,
PNG,
JPEG
};
// An image frame: its dimensions, format and bytes. Carried in the
// <code>imageFrame</code> member of PerceptionSample.
dictionary ImageFrame {
// The width of the image frame.
long? width;
// The height of the image frame.
long? height;
// The format of the image frame.
ImageFormat? format;
// The length of the image frame data.
long? dataLength;
// The bytes of the image frame.
ArrayBuffer? frame;
};
// A sample of perception results together with the associated image frame
// data. Listed in the <code>perceptionSamples</code> member of Diagnostics.
dictionary PerceptionSample {
// The video analytics FramePerception for the associated image frame
// data.
FramePerception? framePerception;
// The image frame data for the associated FramePerception object.
ImageFrame? imageFrame;
// The audio perception results for an audio frame.
AudioPerception? audioPerception;
// Perception results based on both audio and video inputs.
AudioVisualPerception? audioVisualPerception;
// Stores metadata such as version of media perception features.
Metadata? metadata;
};
// The diagnostics information returned by $(ref:getDiagnostics).
dictionary Diagnostics {
// Return parameter for $(ref:getDiagnostics) that specifies the error
// type for failure cases.
ServiceError? serviceError;
// A buffer of image frames and the associated video analytics information
// that can be used to diagnose a malfunction.
PerceptionSample[]? perceptionSamples;
};
// Invoked with the State of the system; used by getState and setState.
callback StateCallback = void(State state);
// Invoked with a Diagnostics dictionary; used by getDiagnostics.
callback DiagnosticsCallback = void(Diagnostics diagnostics);
// Invoked with the state of the component; used by setAnalyticsComponent.
callback ComponentStateCallback = void(ComponentState componentState);
// Invoked with the new state of the component process; used by
// setComponentProcessState.
callback ProcessStateCallback = void(ProcessState processState);
interface Functions {
// Gets the status of the media perception process.
// |callback| : The current state of the system.
static void getState(StateCallback callback);
// Sets the desired state of the system.
// |state| : A dictionary with the desired new state. The only settable
// states are <code>RUNNING</code>, <code>SUSPENDED</code>, and
// <code>RESTARTING</code>.
// |callback| : Invoked with the State of the system after setting it. Can
// be used to verify the state was set as desired.
static void setState(
State state,
StateCallback callback);
// Gets a diagnostics buffer out of the video analytics process.
// |callback| : Returns a Diagnostics dictionary object.
static void getDiagnostics(DiagnosticsCallback callback);
// Attempts to download and load the media analytics component. This
// function should be called every time a client starts using this API. If
// the component is already loaded, the callback will simply return that
// information. The process must be <code>STOPPED</code> for this function
// to succeed.
// Note: If a different component type is desired, this function can
// be called with the new desired type and the new component will be
// downloaded and installed.
// |component| : The desired component to install and load.
// |callback| : Returns the state of the component.
static void setAnalyticsComponent(
Component component,
ComponentStateCallback callback);
// Manages the lifetime of the component process. This function should
// only be used if the component is installed. It will fail if the
// component is not installed.
// |processState| : The desired state for the component process.
// |callback| : Reports the new state of the process, which is expected to
// be the same as the desired state, unless something goes wrong.
static void setComponentProcessState(
ProcessState processState,
ProcessStateCallback callback);
};
interface Events {
// Fired when media perception information is received from the media
// analytics process.
// |mediaPerception| : The dictionary which contains a dump of everything
// the analytics process has detected or determined from the incoming
// media streams.
static void onMediaPerception(MediaPerception mediaPerception);
};
};