// Copyright 2019 The MediaPipe Authors.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";  // proto2: explicit field labels and [default = ...] values.
// All definitions in this file live in the `mediapipe` proto package.
package mediapipe;
// Java classes generated from this file are placed in this Java package.
option java_package = "com.google.mediapipe.tracking";
// Generate one Java source file per top-level message instead of nesting all
// of them inside a single outer class.
option java_multiple_files = true;
// Captures additional information about a RegionFlowFeature's
// surrounding patch.
// When populated via MotionEstimation::RetrieveRegionFlowFeatureList or
// ComputeRegionFlowFeatureDescriptors, the patch descriptor has the following
// 9-dimensional layout: 3 mean intensities, followed by the upper half
// (6 elements) of the 3x3 covariance matrix in column major order, i.e. the
// indices into `data` refer to:
//   mean:       0 1 2
//   covariance: 3 4 5
//                 6 7
//                   8
message PatchDescriptor {
repeated float data = 1; // The actual feature descriptor.
// Binary feature descriptor for a particular feature.
// For example: orb
// http://citeseerx.ist.psu.edu/viewdoc/download?doi=
message BinaryFeatureDescriptor {
// Raw descriptor bytes. The length and bit layout are not specified here;
// they depend on the descriptor type that produced them (e.g. ORB).
optional bytes data = 1;
// Internal data structure used during temporal IRLS smoothing.
// NOTE(review): presumably accumulates a weighted sum of values (value_sum)
// and the sum of the corresponding weights (weight_sum) - confirm against the
// smoothing implementation.
message TemporalIRLSSmoothing {
optional float weight_sum = 1 [default = 0];
optional float value_sum = 2 [default = 0];
// Tracked feature at location (x, y) with flow (dx, dy) and patch based
// error (sum of absolute value of intensity difference).
// Next tag: 19
message RegionFlowFeature {
// Location of the feature.
optional float x = 1 [default = 0];
optional float y = 2 [default = 0];
// Flow vector of the feature.
optional float dx = 3 [default = 0];
optional float dy = 4 [default = 0];
// Features that belong to the same feature track are assigned a unique id
// and are identified via it.
// Note: this id is only unique within the lifetime of a
// RegionFlowComputation object. That is, if distribution or parallelization
// using multiple instances was used, the ids are only unique within that
// instance context.
optional int32 track_id = 13 [default = -1]; // no id.
// Tracking error as patch intensity residual (SSD).
// NOTE(review): the message-level comment describes this error as a sum of
// absolute differences while this comment says SSD; confirm which one the
// tracker actually computes.
optional float tracking_error = 5 [default = 0];
// Inverse of registration error (in pixels), after parametric motion model
// fitting. Values are in [0, 1e6].
// Low values correspond to outliers, high values to inliers.
// Set by MotionEstimation::EstimateMotions*
optional float irls_weight = 6 [default = 1.0];
// Corner response (computed as minimum eigenvalue of
// block filtered 2nd moment matrix).
optional float corner_response = 11 [default = 0.0];
// Patch feature descriptors. *For internal use only*. External clients should
// not rely on their contents.
optional PatchDescriptor feature_descriptor = 7;
optional PatchDescriptor feature_match_descriptor = 8;
// Internal data structure used during temporal IRLS smoothing.
optional TemporalIRLSSmoothing internal_irls = 10;
// Optional label for debugging purposes.
optional string label = 14;
// Flags indicating specific statuses.
enum Flags {
// Used for long feature tracks if the track id was reset.
FLAG_BROKEN_TRACK = 1;
// Status flags for this feature, stored as a plain int32 rather than as the
// Flags enum type.
// NOTE(review): presumably a bitwise combination of Flags values - confirm
// against the code that sets and tests this field.
optional int32 flags = 15;
// Unique feature id per RegionFlowComputation object.
optional int32 feature_id = 16;
// Octave (pyramid layer) from which the keypoint has been extracted.
optional int32 octave = 17 [default = 0];
// Feature descriptor for the current feature.
optional BinaryFeatureDescriptor binary_feature_descriptor = 18;
// Deprecated fields. Tags 9 and 12 are kept out of circulation by declaring
// them as extension ranges.
// NOTE(review): `reserved 9, 12;` is the conventional way to retire tags;
// switching is only safe if nothing extends this message at these numbers.
extensions 9, 12;
// RegionFlowFrame is an optical flow representation where each region has a
// consistent optical flow (adheres to a local translational model).
// Regions are arranged in a regular grid according to BlockDescriptor.
// Next tag: 11.
message RegionFlowFrame {
// Per-region flow: centroid, mean flow and the region's features.
// Next tag: 8
message RegionFlow {
// Id of the region; `region_flow` in the enclosing frame is sorted by it.
// NOTE(review): this is a proto2 `required` field, so a RegionFlow without
// it fails initialization checks; the label cannot be relaxed without
// coordinating all readers and writers.
required int32 region_id = 1;
// Mean anchor point (centroid) of flow vector and mean flow.
optional float centroid_x = 2 [default = 0];
optional float centroid_y = 3 [default = 0];
optional float flow_x = 4 [default = 0];
optional float flow_y = 5 [default = 0];
// Features of this region.
repeated RegionFlowFeature feature = 7;
// Deprecated fields. Tag 6 is kept out of circulation by declaring it as an
// extension range.
extensions 6;
// Sorted by id for quick lookup.
repeated RegionFlow region_flow = 1;
// Total number of features in all RegionFlow's.
optional int32 num_total_features = 2 [default = 0];
// If set, indicates that the frame's region flow is unstable
// (not enough features or coverage too low).
optional bool unstable_frame = 4 [default = false];
// Blur score of the current frame is defined as the n-th percentile
// of the cornerness of the input frame evaluated over regions of high
// cornerness. For details see BlurScoreOptions in
// region_flow_computation.proto.
// The actual value is pretty meaningless, but relative to the blur score
// of other frames one can detect blurry frames, e.g. by a 'significant'
// local maximum in a sequence of blur_scores.
optional float blur_score = 7;
// Dimensions of the underlying frame.
// NOTE(review): presumably in pixels - confirm against the producer.
optional int32 frame_width = 8;
optional int32 frame_height = 9;
// Region flow is estimated using a grid of equal sized bins as regions.
// BlockDescriptor specifies size of bins/blocks.
message BlockDescriptor {
// Size of a single block.
optional int32 block_width = 1;
optional int32 block_height = 2;
// Number of blocks along the x and y direction of the grid.
optional int32 num_blocks_x = 3 [default = 0];
optional int32 num_blocks_y = 4 [default = 0];
// Grid layout used for the regions of this frame.
optional BlockDescriptor block_descriptor = 10;
// Deprecated fields. Tags 3, 5 and 6 are kept out of circulation by
// declaring them as extension ranges.
extensions 3, 5, 6;
// Encapsulates a list of features with associated flow.
// Can be extracted from RegionFlow via GetRegionFlowFeatureList
// declared in region_flow.h. This is the essential (additional) information
// required by Cropper using wobble_suppression with displacements.
// Next tag: 14
message RegionFlowFeatureList {
// The features of this list.
repeated RegionFlowFeature feature = 1;
// Dimensions of the underlying frame.
// NOTE(review): presumably in pixels - confirm against the producer.
optional int32 frame_width = 2;
optional int32 frame_height = 3;
// Set from the corresponding RegionFlowFrame field (unstable_frame).
optional bool unstable = 4 [default = false];
// Records the minimum distance from the image border for each feature and
// matching feature (if enforced > 0).
optional int32 distance_from_border = 5 [default = 0];
// Set from the corresponding RegionFlowFrame field (blur_score).
optional float blur_score = 6;
// If set, indicates that features represent long tracks, i.e. each feature
// has a valid track_id() >= 0.
optional bool long_tracks = 7 [default = false];
// If long_tracks, stores the fraction (per the field name and float type) of
// long feature tracks that got rejected in this frame, as their patches were
// deemed inconsistent with the track's very first extracted patch.
optional float frac_long_features_rejected = 8 [default = 0];
// Measures visual consistency between adjacent frames. In particular, stores
// the absolute *change* in visual difference between two adjacent frame
// pairs, i.e. the modulus of the 2nd derivative of the frame appearance.
// Normalized w.r.t. number of channels and total pixels of the underlying
// frame.
// In particular for sudden changes (e.g. shot boundaries) this value will
// be significantly non-zero (> 0.05).
// The negative default value indicates that no consistency has been computed.
optional float visual_consistency = 9 [default = -1];
// Timestamp in microseconds of the underlying frame, that is the frame
// for which the source features (not matching features) were computed.
optional int64 timestamp_usec = 10 [default = 0];
// Denotes the frame that flow was computed w.r.t., relative to the current
// frame. For example, if current frame is N, N + match_frame is the matching
// frame that flow was computed to.
// Values < 0 indicate backward tracking, while values > 0 indicate forward
// tracking. By default, for empty feature lists, matching frame is the
// same as current frame, i.e. match_frame = 0.
optional int32 match_frame = 11 [default = 0];
// Set, if frame is estimated to be an exact duplicate of the previous frame.
optional bool is_duplicated = 12 [default = false];
// Stores all the tracked ids that have been actively discarded in this frame.
// This information is populated via RegionFlowFeatureList, so that
// downstream modules can receive it and use it to avoid misjudging
// tracking continuity.
// Discard reasons:
// (1) A tracked feature has too long a track, which might create drift.
// (2) A tracked feature lies in a highly dense area, which provides little
//     value.
repeated int32 actively_discarded_tracked_ids = 13;
// Salient point location (normalized w.r.t. frame_width and frame_height, i.e.
// specified in the domain [0, 1] x [0, 1]).
//
// Including the point (the default `type`, TYPE_INCLUDE):
// During retargeting and stabilization salient points introduce constraints
// that will try to keep the normalized location in the rectangle
// frame_size - normalized bounds.
// For this soft constraints are used, therefore the weight specifies
// how "important" the salient point is (higher is better).
// In particular for each point p the retargeter introduces two pairs of
// constraints of the form:
//   x - slack < width - right
//   and x + slack > 0 + left, with slack > 0
// where the weight specifies the importance of the slack.
//
// Excluding the point (see `type`):
// Similar to above, but constraints are introduced to keep
// the point to the left of the left bound OR the right of the right bound.
// In particular:
//   x - slack < left OR
//   x + slack >= right
// Similar to above, the weight specifies the importance of the slack.
//
// Note: Choosing a too high weight can lead to
// jerkiness as the stabilization essentially starts tracking the salient point.
message SalientPoint {
// Normalized location of the point (within domain [0, 1] x [0, 1]).
optional float norm_point_x = 1 [default = 0.0];
optional float norm_point_y = 2 [default = 0.0];
enum SalientPointType {
// Salient point type. By default we try to frame the salient point within
// the bounding box specified by left, bottom, right, top. Alternatively, one
// can choose to exclude the point. For details, see the message-level comment.
optional SalientPointType type = 11 [default = TYPE_INCLUDE];
// Bounds are specified in normalized coordinates [0, 1], FROM the specified
// border. Opposing bounds (e.g. left and right) may not add to values
// larger than 1.
// Default bounds (0.3 from each border) keep the salient point within
// roughly the center third of the frame.
optional float left = 3 [default = 0.3];
optional float bottom = 4 [default = 0.3];
optional float right = 9 [default = 0.3];
optional float top = 10 [default = 0.3];
// Importance of the soft constraints introduced for this point (higher is
// more important); see the message-level note on overly high weights.
optional float weight = 5 [default = 15];
// In addition a salient point can represent a region of interest (defined as
// an ellipse of size norm_major x norm_minor (normalized to [0, 1] domain)
// whose orientation is given by angle (in radians in [0, pi])).
// Due to aspect ratio change of the normalized domain, it is recommended that
// transformations to other domains are done via the ScaleSalientPoint
// function.
optional float norm_major = 6;
optional float norm_minor = 7;
// Angle of major axis with x-axis (counter-clockwise, in radians).
optional float angle = 8;
// Tags 20000 and above are open for proto2 extensions of this message.
extensions 20000 to max;
// Aggregates SalientPoint's for a frame.
message SalientPointFrame {
// The salient points of the frame.
repeated SalientPoint point = 1;
// Tags 20000 and above are open for proto2 extensions of this message.
extensions 20000 to max;