// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/util/tracking/motion_models.proto";
// Describes the current state of a MotionBox: position and size, optional
// scale / rotation / quad, object velocity, tracking status and inlier
// statistics.
//
// NOTE(review): tag 6 is neither assigned nor listed as reserved in this
// message -- confirm whether it belongs in the `reserved` statement below.
//
// Next tag: 38
message MotionBoxState {
  // Top-left corner and fixed size of the current MotionBox, expressed in the
  // normalized domain (in [0, 1] along both dimensions).
  optional float pos_x = 1;
  optional float pos_y = 2;
  optional float width = 3;
  optional float height = 4;

  // Optional degrees of freedom: scale and rotation w.r.t. the center of the
  // box, i.e. [pos_x, pos_y] + 0.5 * [width, height].
  // To activate see TrackStepOptions::TrackingDegrees.
  optional float scale = 5 [default = 1.0];
  // Rotation is expressed in radians.
  optional float rotation = 30 [default = 0.0];

  // Quadrangle stored as a flat list of vertex coordinates.
  message Quad {
    // Coordinates are interleaved as (x, y) pairs:
    //   Vertex 0: x_0 = vertices(0), y_0 = vertices(1)
    //   Vertex 1: x_1 = vertices(2), y_1 = vertices(3)
    //   Vertex 2: x_2 = vertices(4), y_2 = vertices(5)
    //   Vertex 3: x_3 = vertices(6), y_3 = vertices(7)
    // Vertices should be ordered in counter-clockwise manner:
    //   0---------3
    //   |         |
    //   |         |
    //   1---------2
    repeated float vertices = 1;
  }

  // Only used when tracking under TRACKING_DEGREE_OBJECT_PERSPECTIVE.
  optional Quad quad = 34;

  // Aspect ratio (width / height) of the tracked rectangle in physical space.
  optional float aspect_ratio = 35;

  // Whether this box may potentially be grouped with other boxes so that they
  // are tracked together. This is useful for tracking small boxes that lie on
  // a plane: for example, once a plane is detected and tracked, all boxes
  // within that plane can share the same homography transform.
  optional bool request_grouping = 37 [default = false];

  // For quad tracking using the pnp solver, i.e. when perspective-n-points is
  // used to track the quad between frames.
  // That mode requires:
  //   1. The quad being tracked is a rectangle in the physical world.
  //   2. The `aspect_ratio` field has to be set in MotionBoxState.
  optional Homography pnp_homography = 36;

  // Object velocity in x and y, given in normalized spatial units per standard
  // frame period. The standard frame period is calibrated w.r.t.
  // kTrackingDefaultFps = 30 FPS, that is 33.3 ms. Object velocity refers to
  // the velocity after subtracting camera motion.
  // If the current frame period is 66.67 ms (i.e. 15 fps), the actual velocity
  // is obtained by multiplying with a factor of 2. Similarly, for 60 fps the
  // factor is 0.5f.
  // The standard frame period is chosen for legacy reasons to keep
  // TrackStepOptions defaults.
  optional float dx = 7;
  optional float dy = 8;

  // Weighted average of the object velocity magnitude of inlier points
  // (in normalized spatial units per standard frame period).
  optional float kinetic_energy = 17;

  // Specifies how valid the prior was in the last step.
  optional float prior_weight = 9;

  // Tracking status indicating the result of tracking.
  enum TrackStatus {
    // Box can not be tracked (either out of bound or too many tracking
    // failures).
    BOX_UNTRACKED = 0;
    // Box has a size of <= 0 along at least one of its dimensions (collapsed).
    BOX_EMPTY = 1;
    // No features found within the box, tracking is not possible.
    BOX_NO_FEATURES = 2;
    // Successful tracking.
    BOX_TRACKED = 3;
    // Successfully tracked, but duplicated from the previous result as the
    // frame was duplicated.
    BOX_DUPLICATED = 4;
    // Successfully tracked, out of bound from the screen area. Will advance by
    // camera motion. Only used for static objects.
    BOX_TRACKED_OUT_OF_BOUND = 5;
  }

  optional TrackStatus track_status = 10 [default = BOX_UNTRACKED];

  // Spatial prior (presence of inliers, i.e. where the object is located
  // within the box that is currently being tracked), stored as a pair of
  //   a) prior (in [0, 1]) and
  //   b) confidence (number of features converted to a score within [0, 1]).
  // The prior is defined over a grid of size
  // spatial_prior_grid_size x spatial_prior_grid_size.
  optional int32 spatial_prior_grid_size = 11 [default = 10];
  repeated float spatial_prior = 12 [packed = true];
  repeated float spatial_confidence = 13 [packed = true];

  // Difference score between the previous prior and the current prior
  // (in [0, 1]). Currently not used.
  optional float prior_diff = 14;

  // Score determining how much predicted motion disagrees with measured
  // motion. If measured motion deviates strongly from predicted motion,
  // disparity is +/-1; if motion agrees with predicted motion, disparity is 0.
  // The sign indicates whether measured motion is accelerating (> 0) or
  // de-accelerating (< 0) w.r.t. predicted motion.
  optional float motion_disparity = 15;

  // Score determining how discriminative the estimated motion model is.
  // In [0, 1], where 0 means no discrimination w.r.t. background and 1 means
  // high discrimination.
  optional float background_discrimination = 16;

  // Center of mass of the inliers after tracking (center of the features that
  // were used for motion estimation).
  optional float inlier_center_x = 18;
  optional float inlier_center_y = 19;

  // Approximate number of inliers (each feature scores a zero [outlier] or a
  // one [inlier]).
  optional float inlier_sum = 24;

  // Ratio of the above inlier_sum to the average inlier_sum across last
  // states.
  optional float inlier_ratio = 25;

  // Extent (width and height) of the inliers.
  optional float inlier_width = 22;
  optional float inlier_height = 23;

  // Set of current inlier tracking ids.
  repeated uint32 inlier_ids = 26 [packed = true];

  // Corresponding x,y coordinates for each inlier.
  repeated uint32 inlier_id_match_pos = 31 [packed = true];

  // Corresponding inlier score (currently: length of inlier observed).
  repeated uint32 inlier_length = 27 [packed = true];

  // Set of outlier ids.
  repeated uint32 outlier_ids = 28 [packed = true];

  // Corresponding x,y coordinates for each outlier.
  repeated uint32 outlier_id_match_pos = 32 [packed = true];

  // Confidence of the tracked box in the range [0, 1], with 0 being least
  // confident and 1 being most confident. A reasonable threshold to filter
  // out unconfident boxes is 0.5.
  optional float tracking_confidence = 33;

  // Additional internal state.
  optional MotionBoxInternalState internal = 29;

  reserved 20, 21;
}
// Additional internal state information about the tracking.
//
// Holds all motion vectors that were used for tracking as packed arrays:
// position, object motion, camera motion, tracking id and the corresponding
// inlier weight.
message MotionBoxInternalState {
  // Position of each motion vector.
  repeated float pos_x = 1 [packed = true];
  repeated float pos_y = 2 [packed = true];

  // Object motion of each motion vector.
  repeated float dx = 3 [packed = true];
  repeated float dy = 4 [packed = true];

  // Camera motion of each motion vector.
  repeated float camera_dx = 5 [packed = true];
  repeated float camera_dy = 6 [packed = true];

  // Tracking id of each motion vector.
  repeated int32 track_id = 7 [packed = true];

  // Inlier weight within [0, 1]. 0 = outlier; 1 = inlier.
  repeated float inlier_score = 8 [packed = true];
}
// Options controlling a single tracking step: degrees of freedom, priors,
// scoring ranges, spring forces and quad / pnp tracking settings.
//
// NOTE(review): tags 5 and 31 are neither assigned nor listed as reserved in
// this message -- confirm whether they should be marked `reserved`.
//
// Next tag: 42
message TrackStepOptions {
  // Degrees of freedom used for tracking. By default the tracker only uses
  // translation. Additionally, scale and rotation from the camera motion
  // and / or object motion can be taken into account.
  enum TrackingDegrees {
    TRACKING_DEGREE_TRANSLATION = 0;

    // Additional tracking degrees according to camera motion.
    TRACKING_DEGREE_CAMERA_SCALE = 1;
    TRACKING_DEGREE_CAMERA_ROTATION = 2;
    TRACKING_DEGREE_CAMERA_ROTATION_SCALE = 3;
    // TODO: Implement!
    TRACKING_DEGREE_CAMERA_PERSPECTIVE = 4;

    // Tracking degrees modeling object motion. Note that the additional
    // object degrees of freedom are only applied when estimation is deemed
    // stable, in particular when sufficient inliers are present.
    // By default, camera motion is NOT applied. If that is desired, set the
    // flag track_object_and_camera to true.
    TRACKING_DEGREE_OBJECT_SCALE = 5;
    TRACKING_DEGREE_OBJECT_ROTATION = 6;
    TRACKING_DEGREE_OBJECT_ROTATION_SCALE = 7;
    TRACKING_DEGREE_OBJECT_PERSPECTIVE = 8;
  }

  optional TrackingDegrees tracking_degrees = 28
      [default = TRACKING_DEGREE_TRANSLATION];

  // If set and one of the TRACKING_DEGREE_OBJECT degrees is selected, camera
  // motion is applied in addition to the object motion.
  optional bool track_object_and_camera = 32 [default = false];

  // Number of iterations used to iteratively estimate the model and
  // re-estimate the influence of each vector.
  optional int32 irls_iterations = 1 [default = 5];

  // Gaussian spatial prior sigma relative to box size.
  // For motivation, see this plot: http://goo.gl/BCfcy.
  optional float spatial_sigma = 2 [default = 0.15];

  // Gaussian velocity prior sigma. It is computed as the maximum of the
  // absolute minimum sigma (in normalized domain) and the relative sigma
  // w.r.t. previous motion.
  optional float min_motion_sigma = 3 [default = 0.002];
  optional float relative_motion_sigma = 4 [default = 0.3];

  // Settings for motion disparity. The difference between previous and
  // current motion magnitude is scored linearly, from
  // motion_disparity_low_level to motion_disparity_high_level (mapped to a
  // score of 0 and 1 respectively).
  // Motivation is to ensure that acceleration between frames stays within
  // reasonable bounds.
  // Represents a maximum acceleration of around 4 - 5 pixels per frame in 360p
  // video to be unpenalized, with accelerations of around >= 10 pixels being
  // considered inconsistent with prediction.
  optional float motion_disparity_low_level = 6 [default = 8e-3];
  optional float motion_disparity_high_level = 7 [default = 1.6e-2];

  // Motion disparity decays across frames: the disparity of the previous
  // frame decays over time. If the disparity in the current frame is not
  // higher, the decayed one is used, i.e. the larger of the current and the
  // decayed disparity is taken.
  // Motivation is that if acceleration was unreasonably high (and we likely
  // lost tracking), we enter a stage of trying to regain tracking by looking
  // for vectors that agree with the previous prediction.
  optional float disparity_decay = 8 [default = 0.8];

  // Object motion is given as a linear combination of previous and measured
  // motion depending on the motion_disparity (a high disparity gives high
  // weight to the previous motion).
  // At least a minimum of the motion_prior_weight below is enforced,
  // regardless of the motion disparity.
  optional float motion_prior_weight = 9 [default = 0.2];

  // Settings for motion discrimination.
  //
  // Current motion magnitude is scored linearly, from
  // background_discrimination_low_level to
  // background_discrimination_high_level (mapped to a score of 0 and 1
  // respectively).
  // Motivation is that high object motions are easy to discriminate from the
  // background, whereas small object motions are virtually indistinguishable.
  // Represents a range of 2 - 4 pixels for 360p video.
  optional float background_discrimination_low_level = 10 [default = 4e-3];
  optional float background_discrimination_high_level = 11 [default = 8e-3];

  // Spring force settings. If the difference between the predicted center of
  // the box in the next frame and the predicted center of the inliers
  // deviates by more than inlier_center_relative_distance times the box
  // [width|height], a spring force is applied to the box. The amount of force
  // is spring_force times the difference.
  optional float inlier_center_relative_distance = 12 [default = 0.1];
  optional float inlier_spring_force = 13 [default = 0.3];

  // Same as above, but for the center of large motion magnitudes.
  optional float kinetic_center_relative_distance = 14 [default = 0.4];
  optional float kinetic_spring_force = 15 [default = 0.5];

  // The spring force towards large motions is only applied when the kinetic
  // energy is above the specified threshold.
  optional float kinetic_spring_force_min_kinetic_energy = 21 [default = 3e-3];

  // Bias of the old velocity during the update step.
  optional float velocity_update_weight = 16 [default = 0.7];

  // Maximum number of frames considered to be tracking failures.
  // If over this threshold, the box is considered untrackable.
  optional int32 max_track_failures = 17 [default = 10];

  // The domain used for tracking is always larger than the current box.
  // If the current motion is not negligible, the box is expanded in the
  // direction of the motion; otherwise it is expanded in all directions by
  // the amount specified below (w.r.t. normalized domain).
  optional float expansion_size = 18 [default = 0.05];

  // Features are scored based on the magnitude of their irls weights, mapped
  // to [0, 1] using the following range. The range represents roughly
  // 3 - 1.5 pixels error for 360p video.
  optional float inlier_low_weight = 19 [default = 250];
  optional float inlier_high_weight = 20 [default = 500];

  // Rate by which kinetic energy decays over time.
  optional float kinetic_energy_decay = 22 [default = 0.98];

  // Amount by which the prior is increased / decreased in case of valid /
  // invalid measurements.
  optional float prior_weight_increase = 23 [default = 0.2];

  // The amount of present kinetic energy is mapped linearly to the domain
  // [0, 1], describing whether an object is static (0) or moving (1).
  // Lower bound: ~0.4 pix.
  optional float low_kinetic_energy = 24 [default = 1e-3];
  // Upper bound: ~3 pix.
  optional float high_kinetic_energy = 25 [default = 4e-3];

  // Outputs internal state to MotionBoxState.
  optional bool return_internal_state = 26 [default = false];

  // Specifies which weights are stored in the internal state. By default
  // post-estimation weights are stored, otherwise pre-estimation weights are
  // stored.
  optional bool use_post_estimation_weights_for_state = 29 [default = true];

  // Computes the spatial grid of inliers and stores it in the MotionBoxState.
  optional bool compute_spatial_prior = 27 [default = false];

  // Irls initialization: performs several rounds of RANSAC to preselect
  // features for motion estimation, scoring outliers low and inliers to be at
  // least of median inlier weight.
  message IrlsInitialization {
    optional bool activated = 1 [default = false];

    // Rounds of RANSAC.
    optional int32 rounds = 2 [default = 50];

    // Normalized cutoff threshold for a vector to be considered an inlier.
    optional float cutoff = 3 [default = 0.005];
  }

  optional IrlsInitialization irls_initialization = 30;

  // Ratio between static motion and temporal scale. This is actually the
  // threshold on speed under which an object is considered static
  // (non-moving).
  optional float static_motion_temporal_ratio = 33 [default = 3e-3];

  // Control parameters used to terminate tracking when occlusion occurs.
  message CancelTrackingWithOcclusionOptions {
    optional bool activated = 1 [default = false];
    optional float min_motion_continuity = 2 [default = 0.4];
    optional float min_inlier_ratio = 3 [default = 0.1];
  }

  optional CancelTrackingWithOcclusionOptions
      cancel_tracking_with_occlusion_options = 34;

  // If the number of continued inliers is less than this number, the object
  // motion model falls back to the translation model.
  // Set this min_continued_inliers threshold to a low number to make sure
  // they follow local object rotation and scale, but note that it may result
  // in un-robust rotation and scale estimation if the threshold is too low.
  // It is recommended not to set a number < 4.
  optional int32 object_similarity_min_contd_inliers = 35 [default = 30];

  // Maximum acceptable scale component of the object similarity transform.
  // Minimum scale is computed as 1.0 / max_scale.
  // Exclusive for tracking a box with similarity.
  optional float box_similarity_max_scale = 36 [default = 1.05];

  // Maximum acceptable object similarity rotation in radians.
  optional float box_similarity_max_rotation = 37 [default = 0.2];

  // The homography transform is first projected to a similarity, and the
  // scale component of that similarity transform should be within the range
  // [1.0 / max_scale, max_scale].
  optional float quad_homography_max_scale = 38 [default = 1.2];

  // The rotation component of the projected similarity should be smaller than
  // this maximum rotation threshold.
  optional float quad_homography_max_rotation = 39 [default = 0.3];

  // Pre-calibrated camera intrinsics parameters, including focal length,
  // center point, distortion coefficients (only 3 radial factors) and image
  // width / height. The image formation model is described here:
  // https://docs.opencv.org/2.4/doc/tutorials/calib3d/camera_calibration/camera_calibration.html
  // Only used for quad tracking mode. Leave it empty if unknown.
  message CameraIntrinsics {
    optional float fx = 1;
    optional float fy = 2;
    optional float cx = 3;
    optional float cy = 4;
    optional float k0 = 5;
    optional float k1 = 6;
    optional float k2 = 7;
    optional int32 w = 8;
    optional int32 h = 9;
  }

  optional CameraIntrinsics camera_intrinsics = 40;

  // Specifically for quad tracking (aka TRACKING_DEGREE_OBJECT_PERSPECTIVE
  // mode):
  //   - If the aspect_ratio field is set in the start pos, pnp tracking will
  //     be deployed.
  //   - If aspect_ratio is unknown (not set) but forced_pnp_tracking is true,
  //     the aspect ratio of the 3D quadrangle is estimated first, then pnp
  //     tracking is performed.
  //   - If aspect_ratio is unknown and pnp tracking is not forced, general
  //     homography tracking will be deployed.
  optional bool forced_pnp_tracking = 41 [default = false];
}