chromium/third_party/mediapipe/src/mediapipe/util/tracking/flow_packager.proto

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package mediapipe;

import "mediapipe/util/tracking/motion_models.proto";
import "mediapipe/util/tracking/region_flow.proto";

// Messages encapsulating compressed and uncompressed TrackingData.
//
// Uncompressed tracking data can be aggregated via a TrackingDataChunk
// (e.g. to be cached to file per chunk). The whole chunk persists in memory
// after reading.
//
// Compressed tracking data can be aggregated as binary encoded TrackingData
// messages into two container formats (with support for random seeking):
// 1) TrackingContainerProto:
//    Encoding using proto buffer wire format, using default proto
//    serialization and de-serialization to binary string.
//    The container uses the MetaData message to store the stream offsets and
//    milliseconds for each frame of TrackingData. TrackingData itself is custom
//    encoded to binary using FlowPackager::EncodeTrackingData and the resulting
//    binary blob wrapped in a BinaryTrackingData message.
// 2) TrackingContainerFormat:
//    Encoding without any dependencies to protobuffers, for clients without
//    proto buffer support.
//    Encoding is based on encoding binary blobs of data wrapped into repeated
//    containers. The layout of a container is described by the message
//    TrackingContainer and serialized to binary data as described below
//    (without using proto encoding). Therefore, message TrackingContainer is
//    intended more for documentation purposes than for direct use.
//    The format is described by the proto message TrackingContainerFormat (used
//    internally by FlowPackager) however serialization and de-serialization
//    to binary string is performed using custom methods supplied by
//    FlowPackager (TrackingContainerFormatToBinary and
//    TrackingContainerFormatFromBinary).
//    The format stores the MetaData first as above, although using custom
//    encoding. TrackingData is encoded to binary as above using
//    FlowPackager::EncodeTrackingData and the resulting binary blob is stored
//    within a TrackingContainer.

// Tracking data for a single frame: frame flags, tracking domain, background
// model and sparse motion vectors.
// Next tag: 9
message TrackingData {
  // Flag values describing a frame. All non-zero values are distinct powers
  // of two.
  // NOTE(review): presumably OR'ed together into frame_flags below; confirm
  // against FlowPackager.
  enum FrameFlags {
    FLAG_PROFILE_BASELINE = 0;
    FLAG_PROFILE_HIGH = 1;
    // Selects between 8 bit and 16 bit vector_data in the binary encode.
    FLAG_HIGH_FIDELITY_VECTORS = 2;
    FLAG_BACKGROUND_UNSTABLE = 4;  // Background model could not be estimated.
    FLAG_DUPLICATED = 8;           // Frame is duplicated, i.e. identical to
                                   // previous one.
    // Indicates the beginning of a new chunk. In this case the track_id's
    // are not compatible w.r.t. previous one.
    FLAG_CHUNK_BOUNDARY = 16;
  }

  // Declared as int32 rather than FrameFlags.
  // NOTE(review): presumably a bitwise combination of FrameFlags values;
  // confirm against FlowPackager.
  optional int32 frame_flags = 1 [default = 0];

  // Tracking data is resolution independent, specified w.r.t. the domain
  // given here.
  optional int32 domain_width = 2;
  optional int32 domain_height = 3;

  // Aspect ratio (w/h) of the original frame tracking data was computed from.
  optional float frame_aspect = 6 [default = 1.0];

  // Background model for this frame. FLAG_BACKGROUND_UNSTABLE marks frames for
  // which it could not be estimated.
  optional Homography background_model = 4;

  // Stores num_elements vectors of motion data. (x,y) position encoded via
  // row_indices and col_starts, as compressed sparse column matrix storage
  // format:
  // (https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29).
  // Vector data is stored as (dx, dy) position. Optionally we store the fitting
  // error and track id for each feature.
  message MotionData {
    // Number of motion vectors stored in this message.
    optional int32 num_elements = 1;

    // #num_elements pairs (flow_x, flow_y) densely packed.
    repeated float vector_data = 2 [packed = true];

    // Stores corresponding track index for each feature. Features belonging
    // to the same track over time are assigned the same id.
    // NOTE: Due to size, tracking ids are never stored as compressed binary
    // tracking data.
    repeated int32 track_id = 3 [packed = true];

    // #num_elements row indices.
    repeated int32 row_indices = 4 [packed = true];

    // Start index in above array for each column (#domain_width + 1 entries).
    repeated int32 col_starts = 5 [packed = true];

    // Feature descriptors for num_elements feature points.
    repeated BinaryFeatureDescriptor feature_descriptors = 6;

    // Stores all the tracked ids that have been actively discarded. This
    // information is used by downstream consumers to avoid misjudging
    // tracking continuity.
    // NOTE(review): unlike the other repeated scalar fields in this message,
    // this one is not declared [packed = true]; confirm whether intentional.
    repeated int32 actively_discarded_tracked_ids = 7;
  }

  // Sparse motion vectors for this frame.
  optional MotionData motion_data = 5;

  // Total number of features in our analysis.
  optional uint32 global_feature_count = 7;

  // Average of all motion vector magnitudes (without accounting for any motion
  // model), within 10th to 90th percentile (to remove outliers).
  optional float average_motion_magnitude = 8;
}

// Aggregates uncompressed TrackingData for a run of frames (e.g. to be cached
// to file per chunk).
message TrackingDataChunk {
  // TrackingData of one frame together with its frame index and timestamps.
  message Item {
    optional TrackingData tracking_data = 1;
    // Global frame index.
    optional int32 frame_idx = 2;
    // Corresponding timestamp (presumably microseconds, given the _usec
    // suffix).
    optional int64 timestamp_usec = 3;
    // Previous frame timestamp (same unit as timestamp_usec).
    optional int64 prev_timestamp_usec = 4;
  }

  // Items held by this chunk.
  repeated Item item = 1;

  // Set as marker for last chunk.
  optional bool last_chunk = 2 [default = false];

  // Set as marker for first chunk.
  optional bool first_chunk = 3 [default = false];
}

// TrackingData in compressed binary format. Obtainable via
// FlowPackager::EncodeTrackingData. Details of binary encode are below.
message BinaryTrackingData {  // TrackingContainer::header = "TRAK"
  // Binary blob as produced by FlowPackager::EncodeTrackingData.
  optional bytes data = 1;
}

// Detailed explanation of binary Tracking data encode (LITTLE ENDIAN encode!)
// TrackingData is stored in binary as a struct of the above fields and the
// compressed motion data in sparse column matrix storage format.
// (https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29)
// Specifically, TrackingData is encoded as:
// {  frame_flags        : 32 bit int    (from member)
//    domain_width       : 32 bit int    (from member)
//    domain_height      : 32 bit int    (from member)
//    frame_aspect       : 32 bit float  (from member)
//
//    background_model   : 6 * 32 bit float  (dx, dy, a, b, c, d of AffineModel)
//    scale              : 32 bit float  (scale vectors are multiplied with)
//    num_vectors        : 32 bit int    (from member num_elements)
//
//    col_start_delta    : (domain_width + 1) * 8 bit uint   (col starts delta
//                                                           encoded)
//    row_idx_size       : 32 bit int     (size of row_idx array <= num_vectors)
//    row_idx            : row_idx_size * 8 bit uint
//    vector_size        : 32 bit int     (size of vector_data)
//    vector_data        : vector_size * [8 bit | 16 bit] int
//                         (depending on FLAG_HIGH_FIDELITY_VECTORS)
// }
//
// >> Baseline encode <<
// Scale is determined such that maximum vector value (maximum across x and y)
// is mapped to highest 8 bit or 16 bit SIGNED int
// (i.e. 7 or 15 bit resolution respectively).
// Vector values are multiplied by this scale (storing float in int with
// truncation) and (dx, dy) is packed as [dy | dx] into a 16 bit or 32 bit
// word. Unpacking therefore requires dividing the vector values by scale.
//
// Column starts are delta compressed, that is, col_start_delta[i] stores
// col_starts(i) - col_starts(i - 1) from MotionData.
//
// Row indices are directly stored as 8 bit uints, that is row_idx_size ==
// num_vectors in this case.
//
//
// >> High profile encode <<
// Scale is determined as above but for maximum vector deltas (maximum across x
// and y of magnitude in difference between two adjacent vectors). Vector value
// deltas are multiplied by this scale before encoding.
//
// Encoding is more complex compared to baseline. Instead of vector value, delta
// vector values (difference in dx = ddx, difference in dy = ddy)
// are multiplied by scale and stored packed as [ddy | ddx] into a 16 bit or
// 32 bit word. Compression algorithm accounts for error accumulation, so
// unpacking should first add deltas in integer domain (for x and y separately)
// and then divide by scale to yield an approximation of the
// original vector value.
// Most importantly, not every vector value is stored, but only if the delta is
// above the FlowPackagerOptions::high_profile_reuse_threshold, in which case we
// advance to the next vector data. Otherwise the previous vector is used.
//
// The information whether to advance is stored for each vector in the
// highest bit of the row index (FlowPackagerOptions::ADVANCE_FLAG). Row
// indices are not stored directly as in the baseline profile, but as deltas
// (reset at the beginning of every column). As deltas are small it is often
// possible to store two deltas (if both are < 8) in a single byte. This is
// indicated by the second highest bit in the row index
// (FlowPackagerOptions::DOUBLE_INDEX_ENCODE). If set, row index stores
// [row_delta_1 | row_delta_2] in the lower 6 bit. Note, that the advance flag
// applies uniformly to both deltas in this case.
// Sidenote (edge case): Due to the use of the top 2 bits as flags,
// at times we cannot store the full row delta in the lower 6 bits.
// In this case the vector is duplicated (using the ADVANCE_FLAG)
// until the delta sum of duplicated vectors reaches the original delta.
// Consequently, the compressed vector field in high profile may contain a few
// vectors more than the original.
//
// Column starts are delta compressed as in baseline, but account for double
// index encodes. Therefore each column delta is reduced by the number of double
// index encodes occurring for this column. This has to be replicated on the
// decoding side: each delta needs to be increased by the number of double index
// encodes encountered during encoding.

// Stores offsets for random seek and time offsets for each frame of
// TrackingData. Stream offsets are specified relative to the end of the
// metadata blob.
// Offsets specify start of the corresponding binary encoded TrackingContainer
// (for TrackingContainerFormat) or BinaryTrackingData proto (for
// TrackingContainerProto).
message MetaData {  // TrackingContainer::header = "META"
  // NOTE(review): field number 1 is not used by any field visible in this
  // message; if it belonged to a removed field, consider `reserved 1;`.

  // Number of frames.
  // NOTE(review): presumably matches the number of track_offsets entries;
  // confirm against FlowPackager.
  optional fixed32 num_frames = 2;

  // Time offset and stream offset for one frame of TrackingData.
  message TrackOffset {
    optional fixed32 msec = 1;           // Time offset of the metadata in msec.
    optional fixed32 stream_offset = 2;  // Offset of TrackingContainer or
                                         // respectively BinaryTrackingData
                                         // in stream.
                                         // Specified w.r.t. end of the
                                         // Metadata.
  }

  // Offsets used for random seeking.
  // NOTE(review): presumably one entry per frame; confirm against
  // FlowPackager.
  repeated TrackOffset track_offsets = 3;
}

// TrackingContainer is a self-describing container format to store arbitrary
// chunks of binary data. Each container is typed via its 4 character header,
// versioned via an int, and followed by the size of the binary data and the
// actual data. Designed for clients without availability of protobuffer
// support.
// Note: This message is mainly used for documentation purposes and uses custom
// encoding as specified by FlowPackager::TrackingContainerFormatToBinary.
// Default binary size of a TrackingContainer (DO NOT CHANGE!):
// header:    4 byte +
// version:   4 byte +
// size:      4 byte +
// data       #size
// SUM:       12 + #size.
message TrackingContainer {
  // Headers referenced in this file: "TRAK" (BinaryTrackingData), "META"
  // (MetaData) and "TERM" (termination container).
  optional string header = 1;                  // 4 character header.
  optional fixed32 version = 2 [default = 1];  // Version information.
  optional fixed32 size = 3;  // Size of binary data held by container.
  optional bytes data = 4;    // Binary data encoded.

  // DO NOT alter layout of TrackingContainer.
  // Use version to extend or alter encoded binary data.
}

// Container format for clients without proto support (written via
// FlowPackager::TrackingContainerFormatToBinary and read via
// FlowPackager::TrackingContainerFormatFromBinary).
// Proto here is intermediate format for documentation and internal use.
// Stores multiple TrackingContainers of different types.
// Meta data is stored first, to facilitate random seek (via stream offset
// positions) to arbitrary binary TrackingData. Termination container signals
// end of stream.
message TrackingContainerFormat {
  // Container with TrackingContainer::header = "META".
  optional TrackingContainer meta_data = 1;   // Wraps binary meta data, via
                                              // custom encode.
  // Containers with TrackingContainer::header = "TRAK".
  repeated TrackingContainer track_data = 2;  // Wraps BinaryTrackingData.

  // Add new TrackingContainers above before end of stream indicator.
  // Zero sized termination container with TrackingContainer::header = "TERM".
  optional TrackingContainer term_data = 3;
}

// Simplified proto format of above TrackingContainerFormat. Instead of using
// self-describing TrackingContainer's, we simply use the proto wire format for
// encoding and decoding (proto format is typed and versioned via ids).
message TrackingContainerProto {
  // Stream offsets and time offsets for each frame of TrackingData.
  optional MetaData meta_data = 1;
  // Binary encoded TrackingData, each wrapped in a BinaryTrackingData message.
  repeated BinaryTrackingData track_data = 2;
}

// Options controlling compression and encoding.
message FlowPackagerOptions {
  // Tracking data is resolution independent, specified w.r.t. the domain
  // given here. Only values <= 256 are supported if binary tracking data
  // is requested to be supported (see below).
  optional int32 domain_width = 1 [default = 256];
  optional int32 domain_height = 2 [default = 192];

  // Needs to be set for calls to FlowPackager::EncodeTrackingData. If encoding
  // is not required, can be set to false in which case a higher domain_width
  // can be used.
  optional bool binary_tracking_data_support = 6 [default = true];

  // NOTE(review): presumably selects the high profile encode instead of the
  // baseline encode; confirm against FlowPackager.
  optional bool use_high_profile = 3 [default = false];

  // If set uses 16 bit encode for vector data, in BinaryTrackingData,
  // otherwise only 8 bits are used.
  optional bool high_fidelity_16bit_encode = 4 [default = true];

  // In high profile encode, re-use previously encoded vector when absolute
  // difference to current vector is below threshold.
  optional float high_profile_reuse_threshold = 5 [default = 0.5];

  // High profile encoding flags. These are bit values applied to the 8 bit row
  // index of the high profile encode.
  enum HighProfileEncoding {
    // Highest bit of the row index: stores whether to advance to the next
    // vector data.
    ADVANCE_FLAG = 0x80;
    // Second highest bit of the row index: the lower 6 bits hold two row
    // deltas instead of one.
    DOUBLE_INDEX_ENCODE = 0x40;
    // Lower 6 bits of the row index, i.e. the bits not used by the two flags
    // above.
    INDEX_MASK = 0x3F;
  }
}