syntax = "proto2";
option optimize_for = LITE_RUNTIME;
package mri;
// The output of the media analytics process. Implicitly tied to the
// MediaPerception dictionary defined in Chromium source at
// src/extensions/common/api/media_perception_private.idl for the
// Chromium mediaPerceptionPrivate API. This .proto needs to be compatible
// with the version used in the binary checked into the Chromebox For
// Meetings private overlay.
//
// This message is packaged by the graph runner when a PerceptionSample
// or array of PerceptionSamples comes out of the graph.
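//
// Example (textproto, illustrative values only):
//   timestamp: 1617184800000
//   frame_perception {
//     frame_id: 104
//     perception_types: FACE_DETECTION
//   }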
message MediaPerception {
// The timestamp attached when this data originated from the analysis process.
optional uint64 timestamp = 1; // In milliseconds since the epoch.
// A single FramePerception message or array of perceptions (if reporting the
// results from multiple frames).
repeated FramePerception frame_perception = 2;
// A single AudioPerception message or array of audio perceptions (if
// reporting the results from multiple audio frames).
repeated AudioPerception audio_perception = 3;
// A single AudioVisualPerception message or array of audio-visual
// perceptions.
repeated AudioVisualPerception audio_visual_perception = 4;
// Stores metadata such as version of media perception features.
optional Metadata metadata = 5;
}
// Stores metadata such as version of media perception features.
message Metadata {
optional string visual_experience_controller_version = 1;
}
// Used to transmit a history of image frames and their associated annotations.
// This is accumulated over time by the graph runner.
message Diagnostics {
repeated PerceptionSample perception_sample = 1;
}
// Parameters describing a single video stream.
message VideoStreamParam {
// Identifies the video stream described by these parameters.
optional string id = 1;
// Frame width in pixels.
optional int32 width = 2;
// Frame height in pixels.
optional int32 height = 3;
// The frame rate at which this video stream is processed.
optional float frame_rate = 4;
}
// The state of the media analytics process.
message State {
enum Status {
STATUS_UNSPECIFIED = 0; // Default value required for proto enums; unused.
UNINITIALIZED = 1; // Media analytics process is loading its configuration.
STARTED = 2; // Analysis process running but not receiving frames.
RUNNING = 3; // Analysis process running and ingesting frames.
SUSPENDED = 4; // Media analytics process waiting to be started.
RESTARTING = 5; // Media analytics process should be restarted.
STOPPED = 6; // Media analytics process should be stopped.
}
// Note: RUNNING and SUSPENDED are the only two states that should be sent
// to SetState.
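//
// For example, a State message sent to SetState to start analysis
// (textproto; the device_context and configuration values are hypothetical
// placeholders):
//   status: RUNNING
//   device_context: "/dev/video0"
//   configuration: "example_configuration"
//   features: FEATURE_AUTOZOOM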
optional Status status = 1;
// Device context so that the media analytics process can better select the
// right video device to open.
optional string device_context = 2;
// A list of video streams processed by the analytics process.
repeated VideoStreamParam video_stream_param = 3;
// Media analytics configuration. It can only be used when setting state to
// RUNNING.
optional string configuration = 4;
// Corners and aspect ratio of the whiteboard in the image frame. Should only
// be set when setting the state to RUNNING with a whiteboard configuration.
optional Whiteboard whiteboard = 5;
enum Feature {
// UNSET is not a real feature value.
FEATURE_UNSET = 0;
FEATURE_AUTOZOOM = 1;
FEATURE_HOTWORD_DETECTION = 2;
FEATURE_OCCUPANCY_DETECTION = 3;
FEATURE_EDGE_EMBEDDINGS = 4;
FEATURE_SOFTWARE_CROPPING = 5;
}
// A list of enabled media perception features.
repeated Feature features = 6;
message NamedTemplateArgument {
optional string name = 1;
oneof value {
string str = 2;
double num = 3;
}
}
// An optional list of template arguments to be substituted at run time.
// Each argument present in this list is set to the specified value; all
// others are left unchanged. Nested arguments (dictionaries) are not
// supported.
// CAUTION: These template args can be used to overwrite the Feature flags
// defined above, since feature flags are implemented as numeric template
// args.
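//
// For example (textproto; the argument name is a hypothetical placeholder):
//   named_template_arguments {
//     name: "detection_threshold"
//     num: 0.6
//   }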
repeated NamedTemplateArgument named_template_arguments = 7;
}
// This is the output of the MediaPerceptionSinkCalculator.
message PerceptionSample {
optional FramePerception frame_perception = 1;
// The image frame data associated with the frame perception.
optional ImageFrame image_frame = 2;
optional AudioPerception audio_perception = 3;
optional AudioVisualPerception audio_visual_perception = 4;
// Stores metadata such as version of media perception features.
optional Metadata metadata = 5;
}
// Perception results based on both audio and video inputs.
message AudioVisualPerception {
// A timestamp in microseconds attached when this message was generated.
optional uint64 timestamp_us = 1;
// Human presence detection results.
optional AudioVisualHumanPresenceDetection
audio_visual_human_presence_detection = 2;
}
// Detection of human presence based on both audio and video inputs.
message AudioVisualHumanPresenceDetection {
// Indicates a probability in the [0, 1] interval that a human is present.
optional double human_presence_likelihood = 1;
}
// Audio perception results for an audio frame.
message AudioPerception {
// A timestamp in microseconds attached when this message was generated.
optional uint64 timestamp_us = 1;
// Audio localization results for an audio frame.
optional AudioLocalization audio_localization = 2;
// Human presence detection results for an audio frame.
optional AudioHumanPresenceDetection audio_human_presence_detection = 3;
// Hotword detection results.
optional HotwordDetection hotword_detection = 4;
}
// Detection of hotword in the audio stream.
message HotwordDetection {
enum Type {
UNKNOWN_TYPE = 0;
OK_GOOGLE = 1;
}
message Hotword {
// Unique identifier for the hotword instance. Note that a single hotword
// instance can span more than one audio frame. In that case a single
// hotword instance can be reported in multiple Hotword or HotwordDetection
// messages. Hotword messages associated with the same hotword instance will
// have the same id.
optional uint64 id = 1;
// Indicates the type of this hotword.
optional Type type = 2;
// Id of the audio frame in which the hotword was detected.
optional uint64 frame_id = 3;
// Indicates the start time of this hotword in the audio frame.
optional uint64 start_timestamp_ms = 4;
// Indicates the end time of this hotword in the audio frame.
optional uint64 end_timestamp_ms = 5;
// Indicates a probability in the [0, 1] interval that this hotword is
// present in the audio frame.
optional float confidence = 6;
}
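// Example of one hotword instance spanning two consecutive audio frames,
// reported across two HotwordDetection messages (textproto, illustrative
// values); both entries share the same id:
//   hotwords { id: 7 type: OK_GOOGLE frame_id: 41 confidence: 0.83 }
// and, in the next frame's HotwordDetection:
//   hotwords { id: 7 type: OK_GOOGLE frame_id: 42 confidence: 0.91 }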
repeated Hotword hotwords = 1;
}
// An estimate of the direction that the sound is coming from.
message AudioLocalization {
// An angle in radians in the horizontal plane. It roughly points to the peak
// in the probability distribution of azimuth defined below.
optional double azimuth_radians = 1;
// A probability distribution for the current snapshot in time that shows the
// likelihood of a sound source being at a particular azimuth. For example,
// azimuth_scores = [0.1, 0.2, 0.3, 0.4] means that the probability that the
// sound is coming from an azimuth of 0, pi/2, pi, 3*pi/2 is 0.1, 0.2, 0.3 and
// 0.4, respectively.
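// In general, with N scores, index i corresponds to an azimuth of
// 2 * pi * i / N radians.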
repeated double azimuth_scores = 2;
}
// Detection of human presence close to the microphone.
message AudioHumanPresenceDetection {
// Indicates a probability in the [0, 1] interval that a human has caused a
// sound close to the microphone.
optional double human_presence_likelihood = 1;
// Estimate of the noise spectrogram.
optional AudioSpectrogram noise_spectrogram = 2;
// Spectrogram of an audio frame.
optional AudioSpectrogram frame_spectrogram = 3;
}
// Spectrogram of an audio frame.
message AudioSpectrogram {
repeated double values = 1;
}
// This message stores the image frame along with its metadata.
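//
// Example (textproto, illustrative values; pixel_data elided):
//   width: 1280
//   height: 720
//   format: JPEG
//   data_length: 98304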
message ImageFrame {
enum Format {
FORMAT_UNSPECIFIED = 0; // Unused required default value for Proto enums.
RGB = 1; // Raw RGB image.
PNG = 2; // PNG image.
JPEG = 3; // JPEG image.
}
optional int32 width = 1;
optional int32 height = 2;
optional Format format = 3;
optional int32 data_length = 4; // Expected length of pixel_data in bytes.
optional bytes pixel_data = 5;
}
// The set of computer vision metadata for an image frame.
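//
// Example (textproto, illustrative values) of a frame with one detected
// face:
//   frame_id: 104
//   frame_width_in_px: 1280
//   frame_height_in_px: 720
//   entity {
//     id: 3
//     type: FACE
//     bounding_box {
//       top_left { x: 0.1 y: 0.1 }
//       bottom_right { x: 0.3 y: 0.4 }
//       normalized: true
//     }
//     confidence: 0.97
//   }
//   perception_types: FACE_DETECTION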
message FramePerception {
optional uint64 frame_id = 1;
optional uint32 frame_width_in_px = 2;
optional uint32 frame_height_in_px = 3;
// The timestamp associated with the frame (when it enters the graph).
optional uint64 timestamp = 4;
// The list of entities detected for this frame.
repeated Entity entity = 5;
// Latency measurements for a list of packet streams in the drishti graph.
repeated PacketLatency packet_latency = 6;
// Human presence detection results for a video frame.
optional VideoHumanPresenceDetection video_human_presence_detection = 7;
enum PerceptionType {
UNKNOWN_TYPE = 0;
FACE_DETECTION = 1;
MOTION_DETECTION = 2;
PERSON_DETECTION = 3;
}
// Indicates what types of frame perception were run.
repeated PerceptionType perception_types = 8;
}
// Detection of human presence close to the camera.
message VideoHumanPresenceDetection {
// Indicates a probability in the [0, 1] interval that a human is present in
// the video frame.
optional double human_presence_likelihood = 1;
// Indicates a probability in the [0, 1] interval that motion has been
// detected in the video frame.
optional double motion_detected_likelihood = 2;
// Type of lighting condition change.
enum LightCondition {
UNSPECIFIED = 0;
// No noticeable change occurred.
NO_CHANGE = 1;
// Light was switched on in the room.
TURNED_ON = 2;
// Light was switched off in the room.
TURNED_OFF = 3;
// Light gradually got dimmer (for example, due to a sunset).
DIMMER = 4;
// Light gradually got brighter (for example, due to a sunrise).
BRIGHTER = 5;
// Black frame detected - the current frame contains only noise.
BLACK_FRAME = 6;
}
// Indicates the lighting condition in the video frame.
optional LightCondition light_condition = 3;
// Indicates a probability in the [0, 1] interval that the light condition
// value is correct.
optional double light_condition_likelihood = 4;
}
// A single entity detected in a frame.
message Entity {
// A unique id associated with the detected entity, which can be used to track
// the entity over time.
optional uint32 id = 1;
enum EntityType {
UNSPECIFIED = 0;
FACE = 1;
PERSON = 2;
MOTION_REGION = 3;
LABELED_REGION = 4;
}
optional EntityType type = 2;
// The minimal box that captures the entire detected entity.
optional BoundingBox bounding_box = 3;
// A value for the quality of this detection.
optional float confidence = 4;
// Perpendicular distance (depth) from the camera plane to the entity.
optional Distance depth = 5;
// String label for this entity.
optional string label = 6;
}
message BoundingBox {
// The points that define the corners of a bounding box.
optional Point top_left = 1;
optional Point bottom_right = 2;
// Indicates whether or not these coordinates are normalized to values between
// 0 and 1.
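// When normalized is true, pixel coordinates can be recovered by scaling,
// e.g. x_px = x * frame_width_in_px and y_px = y * frame_height_in_px, using
// the frame dimensions from the enclosing FramePerception.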
optional bool normalized = 3 [default = false];
}
message PacketLatency {
// An identifier label for the packet.
optional string label = 1;
// Delay in microseconds with respect to a reference packet.
optional uint64 latency_usec = 2;
}
message Point {
// x represents the horizontal distance from the top left corner of the image
// to the point.
optional float x = 1;
// y represents the vertical distance from the top left corner of the image to
// the point.
optional float y = 2;
}
// Generic message object to encapsulate a distance magnitude and units.
message Distance {
enum DistanceUnits {
UNITS_UNSPECIFIED = 0;
METERS = 1;
PIXELS = 2;
}
optional DistanceUnits units = 1;
optional float magnitude = 2;
}
// The parameters for a whiteboard in the image frame.
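//
// Example (textproto, illustrative corner values following the Point
// conventions above):
//   top_left { x: 0.1 y: 0.2 }
//   top_right { x: 0.9 y: 0.2 }
//   bottom_left { x: 0.1 y: 0.8 }
//   bottom_right { x: 0.9 y: 0.8 }
//   aspect_ratio: 1.33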
message Whiteboard {
// The top left corner of the whiteboard in the image frame.
optional Point top_left = 1;
// The top right corner of the whiteboard in the image frame.
optional Point top_right = 2;
// The bottom left corner of the whiteboard in the image frame.
optional Point bottom_left = 3;
// The bottom right corner of the whiteboard in the image frame.
optional Point bottom_right = 4;
// The physical aspect ratio of the whiteboard.
optional float aspect_ratio = 5;
}