// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Options for tracking by pyramidal Lucas-Kanade. Local outlier rejection
// is performed by enforcing translation models over regions (grid-based
// or obtained from segmentation).
syntax = "proto2";
package mediapipe;
import "mediapipe/util/tracking/tone_estimation.proto";
// Next tag: 33
// Options for feature extraction and feature tracking: flow direction,
// tracking policy, corner extraction method and its settings, tracking window
// and distance, grid-based (adaptive) extraction, wide-baseline matching and
// feature reuse.
message TrackingOptions {
// Describes direction of flow during feature tracking and for the output
// region flow.
enum FlowDirection {
FORWARD = 1; // Tracks are forward, from frame N-k -> frame N (k > 0).
BACKWARD = 2; // Tracks are backward, from frame N -> frame N-k
// (k > 0).
CONSECUTIVELY = 3; // Try forward and backward tracking consecutively.
// Forward tracking is done first. If the flow is
// stable, i.e. there are sufficient tracked features,
// the region flow computation ends. Otherwise, backward
// tracking is performed.
}
// Flow direction used internally during tracking features. Forward tracking
// allows reusing tracked features instead of explicitly tracking them in
// every frame, and can therefore be faster. See the reuse_features_XXX
// options below. However, if not reusing features, then it is best to match
// the direction for both internal tracking and output flow, for performance
// reasons.
optional FlowDirection internal_tracking_direction = 19 [default = BACKWARD];
// Direction of flow vectors that are computed and output by calls to retrieve
// region flow, tracked features, etc. Note when this is BACKWARD, then the
// returned flow for frame N contains features tracked *from* frame N to a
// previous frame N-k. When this is FORWARD, the flow for frame N contains
// the flow from features in a previous frame N-k, tracked *to* frame N.
// Note that the output flow direction can only be set to FORWARD or BACKWARD
// (i.e. CONSECUTIVELY is not a valid value for this field).
optional FlowDirection output_flow_direction = 20 [default = BACKWARD];
// Specifies how a feature is tracked w.r.t. previous or next frames
// (dependent on the FlowDirection options above).
// Per default, each frame is tracked w.r.t. a single neighboring frame
// (POLICY_SINGLE_FRAME). If associations across multiple frames are desired,
// POLICY_MULTI_FRAME creates multiple results for the current frame, by
// tracking features w.r.t. multiple neighbors. Number of neighbors is
// specified by multi_frames_to_track.
// If long feature tracks are desired (i.e. a track across a frame pair
// that is identified to belong to an earlier known feature), use
// POLICY_LONG_TRACKS. Maximum track length can be specified by
// long_tracks_max_frames.
enum TrackingPolicy {
POLICY_SINGLE_FRAME = 1; // Tracks w.r.t. previous or next frame.
POLICY_MULTI_FRAME = 2; // Tracks w.r.t. multiple frames.
POLICY_LONG_TRACKS = 3; // Create long feature tracks.
// Requires internal_tracking_direction to be
// FORWARD (this requirement is checked).
}
// Tracking policy to use; see TrackingPolicy above.
optional TrackingPolicy tracking_policy = 25 [default = POLICY_SINGLE_FRAME];
// Number of frame-pairs used for POLICY_MULTI_FRAME, ignored for other
// policies.
// Value of 1 means we are tracking features in the current frame, w.r.t.
// the previous one. Value of 2 denotes tracking of features in current
// w.r.t. the previous one and the one before the previous one, etc.
optional int32 multi_frames_to_track = 1 [default = 1];
// Maximum length of long feature tracks for POLICY_LONG_TRACKS in frames.
// Note: This maximum is not hard enforced, to avoid that many long
// tracks are dropped at the same time. Instead if a feature reaches
// long_tracks_max_frames * 0.8, it will get dropped with a probability of X,
// where X is calculated, such that 95% of all qualifying features are
// dropped within the interval [.8, 1.2] * long_tracks_max_frames.
optional int32 long_tracks_max_frames = 26 [default = 300];
// Hard limit of maximum number of features. Control the density of features
// with the min_feature_distance option. This limit is to guarantee that the
// run-time of RegionFlowComputation does not spiral out of control.
optional int32 max_features = 2 [default = 2000];
// Specifies the extraction method for features.
enum CornerExtractionMethod {
EXTRACTION_HARRIS = 1; // Using Harris' approximation of
// EXTRACTION_MIN_EIG_VAL.
EXTRACTION_MIN_EIG_VAL = 2; // Exact smallest eigenvalue computation.
EXTRACTION_FAST = 3; // Extract using FAST feature detector.
}
// Corner extraction method to use. Method-specific settings are given by
// min_eig_val_settings, harris_settings and fast_settings below.
optional CornerExtractionMethod corner_extraction_method = 27
[default = EXTRACTION_MIN_EIG_VAL];
// Settings for above corner extraction methods.
// Settings for EXTRACTION_MIN_EIG_VAL.
message MinEigValExtractionSettings {
// Quality level of features (features with
// min_eig_value < quality_level * max_eig_value are rejected).
// Here [min|max]_eig_value denote the minimum and maximum eigenvalue of
// the auto-correlation matrix of the patch centered at a feature point. The
// ratio of eigenvalues denotes the "cornerness", lower means more
// pronounced corners.
// (see http://en.wikipedia.org/wiki/Harris-Affine for details.)
optional float feature_quality_level = 1 [default = 0.01];
// Features below this quality level are always discarded, even if their
// score is above feature_quality_level() * local maximum within that grid
// cell. This prevents us from including very poor features.
optional float adaptive_lowest_quality_level = 2 [default = 8e-5];
}
optional MinEigValExtractionSettings min_eig_val_settings = 28;
// Settings for EXTRACTION_HARRIS.
message HarrisExtractionSettings {
// Same as in MinEigValExtractionSettings.
optional float feature_quality_level = 1 [default = 2.5e-4];
// Note, due to Harris response being negative for some pixels,
// no lowest quality level is enforced.
}
optional HarrisExtractionSettings harris_settings = 29;
// Settings for EXTRACTION_FAST.
message FastExtractionSettings {
// Threshold on the difference between the intensity of the central pixel and
// pixels of a circle around this pixel. Empirically, the larger the
// threshold, the fewer keypoints will be detected.
// Default value is the same as OpenCV's.
optional int32 threshold = 1 [default = 10];
}
optional FastExtractionSettings fast_settings = 31;
// Radius of the window size so that the full window is
// 2*tracking_window_size+1
optional int32 tracking_window_size = 4 [default = 10];
// Number of tracking iterations.
// NOTE(review): presumably the iteration limit of the KLT tracker per
// feature -- confirm against the implementation.
optional int32 tracking_iterations = 5 [default = 10];
// Fractional tracking distance w.r.t. frame diameter d. The number of
// pyramid levels l is chosen such that
// 2^l * tracking_window_size / 2 >= fractional_tracking_distance * d.
// Therefore, theoretically it is guaranteed that objects moving less than
// fractional_tracking_distance * d can be tracked.
optional float fractional_tracking_distance = 6 [default = 0.15];
// If set, modifies tracking distance to be 130% of maximum average
// tracking distances of previous frames.
optional bool adaptive_tracking_distance = 24 [default = false];
// Minimum feature distance in pixels. Close features are suppressed. If value
// < 1, the distance is computed as a fraction of the frame diameter.
optional float min_feature_distance = 7 [default = 7];
// By default, when downscaling by factor x, the minimum feature distance
// is downscaled by a factor of sqrt(x). If set false, no scaling is
// performed.
optional bool distance_downscale_sqrt = 21 [default = true];
// Uses grid based extraction of features. Quality level is local within a
// grid cell and results are combined over all cells and multiple scales and
// grid offsets.
// Default option, setting it to false is deprecated and will fail.
optional bool adaptive_good_features_to_track = 8 [default = true];
// Size of each grid cell. Values < 1 are interpreted to be relative to
// frame_width_ x frame_height_.
optional float adaptive_features_block_size = 9 [default = .26];
// Scales / levels employed for feature extraction. Grid cell size is scaled
// by 0.5 for each level.
optional int32 adaptive_features_levels = 10 [default = 1];
// If > 1, feature extraction is carried out at multiple scales by downscaling
// the image repeatedly, extracting features (eigenvalue images) and upscaling
// them.
optional int32 adaptive_extraction_levels = 22 [default = 1];
// Alternate way of specifying extraction levels: number of levels is
// automatically computed by downsampling the image until its maximum
// dimension (width or height) reaches this value. Overrides
// adaptive_extraction_levels if > 0.
optional int32 adaptive_extraction_levels_lowest_size = 23 [default = 0];
// Grid step-size in fraction of width or height used for creating synthetic
// zero motion tracks with feature points lying on a grid. Can be set based on
// desired number of total features as 1/sqrt(num_features),
// e.g. .04 ~= 1/sqrt(600).
optional float synthetic_zero_motion_grid_step = 13 [default = .04];
// If set, uses ORB features with brute force matching and ratio test
// to track frames across larger perspective changes than possible with
// default KLT features.
optional bool wide_baseline_matching = 14 [default = false];
// Only brute force matches with
// best_match_distance < ratio_test_threshold * second_best_match_distance
// are retained.
optional float ratio_test_threshold = 15 [default = 0.8];
// Refines wide baseline matches by estimating affine transform to
// wide-baseline matches which is used to seed initial positions for KLT
// matches.
optional bool refine_wide_baseline_matches = 16 [default = false];
// When tracking features, features tracked from frame A to frame B may be
// reused as the features for frame B when tracking from it (instead of
// extracting features). The max_frame_distance flag limits the distance
// between A and B for the features to be reused. Setting it to 0 => no
// re-use.
optional int32 reuse_features_max_frame_distance = 17 [default = 0];
// In conjunction with above, the features are reused in frame B only if they
// are at least this fraction of the original features in frame A. Otherwise
// they are reset and extracted from scratch.
optional float reuse_features_min_survived_frac = 18 [default = 0.7];
// Available KLT tracker implementations.
enum KltTrackerImplementation {
UNSPECIFIED = 0;
KLT_OPENCV = 1; // Use OpenCV's implementation of KLT tracker.
}
// Implementation choice of KLT tracker.
optional KltTrackerImplementation klt_tracker_implementation = 32
[default = KLT_OPENCV];
// Deprecated fields.
// NOTE(review): the retired tag numbers are blocked via an `extensions`
// declaration rather than `reserved`. Do not reuse them for new fields;
// confirm that nothing extends this message on these numbers before
// converting the declaration to `reserved`.
extensions 3, 11, 12, 30;
}
// Next tag: 67
// Options for region flow computation: feature tracking (TrackingOptions),
// grid-based RANSAC outlier rejection, input downsampling, minimum feature
// requirements, blur score and visual consistency computation, feature
// verification, gain correction and input image format.
message RegionFlowComputationOptions {
// Options for the underlying feature extraction and tracking.
optional TrackingOptions tracking_options = 1;
// Features are binned into grids of different resolutions (see
// fast_estimation_block_size below) and retained if they survive a localized
// translation based RANSAC algorithm and the survivors are at least of
// size min_feature_inliers. Must be at least 3!
optional int32 min_feature_inliers = 2 [default = 3];
// Relative number of inlier features w.r.t. average number of features
// per grid bin. Maximum of both thresholds is used as actual threshold.
optional float relative_min_feature_inliers = 46 [default = 0.2];
// Pre-blur before computing features to reduce noise. Set to zero for no
// blurring.
optional float pre_blur_sigma = 33 [default = 0.8];
// Number of ransac rounds to estimate per region flow vector. This could be
// adaptive, but the required number of rounds is so low, that estimating
// the bound is more costly than just running it for a fixed number of times.
optional int32 ransac_rounds_per_region = 3 [default = 15];
// Error thresholds for a feature to be considered as an inlier in
// pixel-distance. The max of all three thresholds below is used as the actual
// threshold.
// Absolute in pixels.
optional float absolute_inlier_error_threshold = 4 [default = 2];
// Scaled w.r.t. frame diameter.
optional float frac_inlier_error_threshold = 52 [default = 0];
// Scaled w.r.t. model estimated during each RANSAC round.
optional float relative_inlier_error_threshold = 44 [default = 0.1];
// Returns for each grid only the top N inlier sets.
optional int32 top_inlier_sets = 45 [default = 2];
// For debugging purposes, uses all tracked features regardless of the above
// setting.
optional bool no_estimation_mode = 40 [default = false];
// Block size in pixels. If fractional block_size is used (0 < size < 1),
// it is interpreted as fraction of the image dimensions.
// We use 4 blocks in each dimension by standard.
optional float fast_estimation_block_size = 6 [default = .25];
// Minimum block size in pixels (larger dimension) to perform fast estimation
// on. Pyramid levels are allocated such that
// block_size * 0.5^(level - 1) = min_block_size.
// At least two levels are used.
optional int32 fast_estimation_min_block_size = 25 [default = 100];
// We use overlapping versions of the grid; this parameter specifies how
// many in each dimension (the total is therefore the value squared!).
optional int32 fast_estimation_overlap_grids = 22 [default = 3];
// Flow features with motion above this threshold (w.r.t. frame diameter)
// are rejected.
optional float max_magnitude_threshold_ratio = 23 [default = 0.2];
// Flow features that have a motion that is larger than
// median_magnitude_bounds times the median magnitude are discarded.
// If set to zero, test is not enforced.
optional float median_magnitude_bounds = 51 [default = 0.0];
// Determines how irls weights for computed features are initialized.
// In general, more stable features are given higher weight.
enum IrlsInitialization {
INIT_UNIFORM = 1; // All weights equal 1
// Feature's irls weight is initialized to a value in [0, 2]
// indicating how consistent the feature's motion is w.r.t. neighboring
// features (high values = very consistent). Determined by counting how
// often a feature is part of the inlier set for a particular bin.
INIT_CONSISTENCY = 2;
}
// If this option is activated, feature's irls weight is initialized to the
// inverse of its computed flow.
// NOTE(review): the sentence above does not match either IrlsInitialization
// value as documented in the enum; it is possibly stale -- confirm. The field
// selects the initialization mode described by IrlsInitialization.
optional IrlsInitialization irls_initialization = 49
[default = INIT_CONSISTENCY];
// We support down-sampling of an incoming frame before running the
// resolution dependent part of the region flow computation (feature
// extraction and tracking if desired).
// Note that in all downsampling modes except for DOWNSAMPLE_TO_INPUT_SIZE,
// for uneven dimensions after downsampling, we always round up to
// the nearest even dimension, e.g. 350p with a downsample_factor of 2.0
// would expect an input of size 176p.
enum DownsampleMode {
// No downsampling.
DOWNSAMPLE_NONE = 1;
// Downsizes the input frame such that frame_size == downsampling_size,
// where frame_size := max(width, height).
DOWNSAMPLE_TO_MAX_SIZE = 2;
// Downsizes frame by pre-defined factor, downsample_factor below.
DOWNSAMPLE_BY_FACTOR = 3;
// Downsampling based on downsampling schedule, see DownSampleSchedule below
// for details.
DOWNSAMPLE_BY_SCHEDULE = 4;
// Downsizes the input frame such that frame_size == downsampling_size,
// where frame_size := min(width, height).
DOWNSAMPLE_TO_MIN_SIZE = 5;
// Input frame is assumed to be already downsampled by the factor specified
// by downsample_factor below. For example if the original frame is 720p,
// and downsample_factor is set to 2.0, then we expect as input 360p.
DOWNSAMPLE_TO_INPUT_SIZE = 6;
}
// Downsampling mode to use; see DownsampleMode above.
optional DownsampleMode downsample_mode = 11 [default = DOWNSAMPLE_NONE];
// Specify the size of either dimension here, the frame will be
// downsampled to fit downsampling_size. Referenced by DOWNSAMPLE_TO_MAX_SIZE
// and DOWNSAMPLE_TO_MIN_SIZE.
optional int32 downsampling_size = 12 [default = 256];
// Downsampling factor referenced by DOWNSAMPLE_BY_FACTOR and
// DOWNSAMPLE_TO_INPUT_SIZE.
optional float downsample_factor = 18 [default = 2.0];
// If set, we will force the computed downsampling factor to be the nearest
// integer, resulting in faster downsampling. This will have no effect for
// DOWNSAMPLE_TO_INPUT_SIZE, DOWNSAMPLE_BY_FACTOR, and DOWNSAMPLE_BY_SCHEDULE,
// which should have exact values defined.
optional bool round_downsample_factor = 62 [default = false];
// Downsampling schedule. Frame sizes up to which a particular downsampling
// factor is applied. Factor chosen by comparing actual frame area against
// standard area (standard_width * standard_height), where standard_width =
// 16/9 X standard_height.
message DownSampleSchedule {
optional float downsample_factor_360p = 1 [default = 1]; // For <= 360p.
optional float downsample_factor_480p = 2 [default = 1]; // For <= 480p.
optional float downsample_factor_720p = 3 [default = 2]; // For <= 720p.
optional float downsample_factor_1080p = 4 [default = 2]; // >= 720p.
// NOTE(review): ">= 720p" overlaps with "<= 720p" above at exactly 720p;
// presumably "> 720p" is meant -- confirm against the implementation.
}
// Used if downsample_mode is DOWNSAMPLE_BY_SCHEDULE.
optional DownSampleSchedule downsample_schedule = 19;
// Minimum number of good features that we require to be present.
// Without good features, the estimated motion models will do more harm than
// good, so it is better to use simply the identity transform for this frame,
// and set the flag unstable_models to true in RegionFlow.
optional int32 min_feature_requirement = 13 [default = 20];
// We also require features to cover a minimum percentage area of the frame.
// We use downsampling and plot each feature by a 1 in a grid, this is
// equivalent to plotting each feature by a rectangle in the original frame.
optional float min_feature_cover = 14 [default = 0.15];
// Grid size for above min feature cover.
optional int32 min_feature_cover_grid = 20 [default = 8];
// Computes blur score for each frame. Score is proportional to amount of
// blur present in a frame, i.e. higher scores reflect more blurred frames.
// Note that the score is dependent on the gradient distribution of the image
// content, i.e. the score itself is rather meaningless but needs to be
// compared to scores of neighboring frames.
optional bool compute_blur_score = 17 [default = false];
// Settings for the blur score computation (see compute_blur_score).
// NOTE(review): tag 4 is unused in this message and not reserved --
// presumably a removed field; confirm before assigning it to a new field.
message BlurScoreOptions {
// Blur score is only computed over image regions of high cornerness
// (as blur in any direction will always alter these regions). First, the
// corner image (smallest eigenvalue of 2nd moment matrix) is box filtered,
// and then thresholded.
optional int32 box_filter_diam = 1 [default = 3];
// Specifies relative (w.r.t. maximum) and absolute cornerness threshold
// for threshold operation.
optional float relative_cornerness_threshold = 2 [default = 3e-2];
optional float absolute_cornerness_threshold = 3 [default = 1e-4];
// Blur score is defined as 1.0 / <median cornerness>, where
// <median cornerness> is the n-th percentile of the cornerness evaluated
// over the image regions of high cornerness as specified above.
optional float median_percentile = 5 [default = 0.85];
}
optional BlurScoreOptions blur_score_options = 31;
// Determines how/if visual consistency is computed. If activated,
// computes the absolute *change* in visual difference between two adjacent
// frame pairs, i.e. the modulus of the 2nd derivative of the frame
// appearance. Stores result in RegionFlowFeatureList::visual_consistency.
message VisualConsistencyOptions {
// Computation of visual consistency is only performed if activated.
optional bool compute_consistency = 1 [default = true];
// Incoming color or gray scale image is scaled to a tiny square image of
// the specified dimension. Used to compare adjacent images via SSD.
optional int32 tiny_image_dimension = 2 [default = 20];
}
optional VisualConsistencyOptions visual_consistency_options = 55;
// Radius of patch descriptor computed during RetrieveRegionFlowFeatureList
// call.
optional int32 patch_descriptor_radius = 21 [default = 3];
// Minimum distance from image border. Must be greater or equal to
// patch_descriptor_radius.
optional int32 distance_from_border = 50 [default = 3];
// Corner response is scaled by scalar below and normalized to lie within
// [0, 1], where 0 is low corner score and 1 high corner score.
optional float corner_response_scale = 26 [default = 1500];
// Verifies reliability of features, by back-tracking operation from matched
// location. If returned location is within verification_distance feature is
// accepted otherwise discarded.
optional bool verify_features = 27 [default = false];
// Distance threshold used by verify_features above.
// NOTE(review): units are not stated here (presumably pixels) -- confirm.
optional float verification_distance = 28 [default = 0.5];
// If set, consistency of long features is verified (in case tracking_policy
// is set to POLICY_LONG_TRACKS) by extracting a patch
// around the feature during the very first observation and comparing the
// matching patches along the long feature trajectory via SSD. If the
// difference is above the long_feature_verification_threshold the feature is
// removed.
optional bool verify_long_features = 53 [default = true];
// Maximum average per pixel error (in L1 norm) in the normalized intensity
// domain for matching patches to be considered to be consistent.
optional float long_feature_verification_threshold = 54 [default = 0.04];
// Long features are expected to have limited acceleration over time.
// If acceleration exceeds specified value based on the setting in
// verify_long_feature_acceleration either:
// a) verify_long_feature_acceleration = false
// A new track is started instead of continuing the old one.
// The track itself is not removed in this case.
//
// b) verify_long_feature_acceleration = true
// The track is flagged for verification, by back-tracking operation from
// matched location. If track fails verification test it is
// discarded. This only triggers if at least
// verify_long_feature_trigger_ratio of features have been flagged,
// otherwise option a is used.
optional float max_long_feature_acceleration = 56 [default = 5.0];
optional bool verify_long_feature_acceleration = 63 [default = false];
optional float verify_long_feature_trigger_ratio = 64 [default = 0.0];
// If true, histogram equalization is performed to the input image sequence
// before registration.
optional bool histogram_equalization = 57 [default = false];
// If true, synthetic region flows with zero motion are used for all (or just
// the first) frame.
optional bool use_synthetic_zero_motion_tracks_all_frames = 34
[default = false];
optional bool use_synthetic_zero_motion_tracks_first_frame = 35
[default = false];
// Optional gain correction before tracking features. Improves robustness when
// lighting is changing.
optional bool gain_correction = 36 [default = false];
// If set performs gain correction by simply equalizing mean intensity
// between frames, instead of using ToneEstimation.
optional bool fast_gain_correction = 61 [default = false];
// If the multiple hypothesis flag is set, features are tracked both
// with and without gain correction, and the hypothesis with more inliers
// is selected.
optional bool gain_correction_multiple_hypotheses = 47 [default = true];
// This flag, when used together with the multiple hypotheses flag, specifies
// that gain correction should increase the number of inliers by at least this
// fraction for it to be used instead of default tracking.
optional float gain_correction_inlier_improvement_frac = 48 [default = 0.1];
// If set, always uses the brighter frame as reference. This is the
// preferred direction of correction, to avoid overexposed regions from
// being corrected which leads to spurious matches.
optional bool gain_correction_bright_reference = 59 [default = false];
// Only performs gain correction if number of tracked features falls under
// specified ratio (w.r.t. previous frame).
// Set to zero, to always perform gain correction if requested.
optional float gain_correction_triggering_ratio = 60 [default = 0.0];
// Gain correction is based on a grid of zero motion features, independent of
// the underlying motion. Fractional parameter specifies resolution of the
// grid w.r.t. frame size.
optional float frac_gain_feature_size = 37 [default = 0.3];
optional float frac_gain_step = 38 [default = 0.1];
// Selects which gain/bias bounds are used for gain correction.
enum GainCorrectMode {
GAIN_CORRECT_DEFAULT_USER = 1; // Uses default or user supplied bounds,
// i.e. gain_bias_bounds is left untouched.
GAIN_CORRECT_VIDEO = 2; // Uses defaults for video (most strict).
GAIN_CORRECT_HDR = 3; // Uses most relaxed settings to track
// across HDR frames, taken at different
// exposures.
GAIN_CORRECT_PHOTO_BURST = 4; // More relaxed than video but stricter
// than HDR; use for photo burst where
// exposure between frames can change.
}
optional GainCorrectMode gain_correct_mode = 41
[default = GAIN_CORRECT_DEFAULT_USER];
// Bounds for the estimated model. If not set externally, will be set
// based on GainCorrectMode.
optional ToneEstimationOptions.GainBiasBounds gain_bias_bounds = 39;
// Supported image formats. All images are converted to grayscale
// before processing. These image formats only concern AddImage.
// IMPORTANT: All the Retrieve* methods expect RGB when the descriptors
// are computed.
enum ImageFormat {
FORMAT_GRAYSCALE = 1;
FORMAT_RGB = 2;
FORMAT_RGBA = 3;
FORMAT_BGR = 4;
FORMAT_BGRA = 5;
}
// Image format of the input.
optional ImageFormat image_format = 58 [default = FORMAT_RGB];
// Supported descriptor extractor types; ORB is the only value defined.
enum DescriptorExtractorType {
ORB =
0; // http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.370.4395&rep=rep1&type=pdf
}
// The descriptor extractor type used.
optional DescriptorExtractorType descriptor_extractor_type = 65
[default = ORB];
// Whether to compute derivatives when building the pyramid. When set to
// true, it's building a Laplacian pyramid. When set to false, it's building
// a Gaussian pyramid.
optional bool compute_derivative_in_pyramid = 66 [default = true];
// Deprecated fields.
// NOTE(review): the retired tag numbers are blocked via an `extensions`
// declaration rather than `reserved`. Do not reuse them for new fields;
// confirm that nothing extends this message on these numbers before
// converting the declaration to `reserved`.
extensions 5, 7, 8, 9, 10, 15, 16, 24, 29, 30, 32, 42, 43;
}