// chromium/third_party/mediapipe/src/mediapipe/calculators/tensor/tensors_to_detections_calculator.proto

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The option proto for the TensorsToDetectionsCalculator.

syntax = "proto2";

package mediapipe;

import "mediapipe/framework/calculator.proto";

// Options for the TensorsToDetectionsCalculator. The message is attached to a
// node's CalculatorOptions through the `ext` extension declared below.
//
// This is a proto2 message: every singular field is `optional`, and a field
// without an explicit `[default = ...]` reads as the type's zero value
// (0, 0.0, false) when unset.
message TensorsToDetectionsCalculatorOptions {
  // Registers this message as an extension of CalculatorOptions.
  extend .mediapipe.CalculatorOptions {
    optional TensorsToDetectionsCalculatorOptions ext = 335742639;
  }

  // NOTE: the three fields below are labelled [Required] in their comments but
  // are declared `optional`, so protobuf itself does not reject a message that
  // omits them. Any enforcement has to happen in the code consuming these
  // options.

  // [Required] The number of output classes predicted by the detection model.
  optional int32 num_classes = 1;
  // [Required] The number of output boxes predicted by the detection model.
  optional int32 num_boxes = 2;
  // [Required] The number of output values per box predicted by the detection
  // model. The values contain bounding boxes, keypoints, etc.
  optional int32 num_coords = 3;

  // The offset of keypoint coordinates in the location tensor.
  // No explicit default is declared, so this reads as 0 when unset.
  optional int32 keypoint_coord_offset = 9;
  // The number of predicted keypoints. Defaults to 0.
  optional int32 num_keypoints = 10 [default = 0];
  // The dimension of each keypoint, i.e. the number of values predicted for
  // each keypoint. Defaults to 2.
  optional int32 num_values_per_keypoint = 11 [default = 2];
  // The offset of box coordinates in the location tensor. Defaults to 0.
  optional int32 box_coord_offset = 12 [default = 0];

  // Parameters for decoding SSD detection model. All four default to 0.0.
  // NOTE(review): presumably the scale factors for the predicted box center
  // (x, y) and size (w, h) -- confirm against the calculator implementation.
  optional float x_scale = 4 [default = 0.0];
  optional float y_scale = 5 [default = 0.0];
  optional float w_scale = 6 [default = 0.0];
  optional float h_scale = 7 [default = 0.0];

  // Defaults to false.
  // NOTE(review): undocumented in this schema; judging by the name, presumably
  // controls whether an exponential is applied to the predicted box size
  // during decoding -- confirm against the calculator implementation.
  optional bool apply_exponential_on_box_size = 13 [default = false];

  // Whether to reverse the order of predicted x, y from output.
  // If false, the order is [y_center, x_center, h, w], if true the order is
  // [x_center, y_center, w, h].
  // DEPRECATED. Use `box_format` instead.
  // NOTE(review): the field is deprecated only by this comment; it does not
  // carry the `[deprecated = true]` option. Consider adding it once remaining
  // readers of this field have been migrated to `box_format`.
  optional bool reverse_output_order = 14 [default = false];
  // The ids of classes that should be ignored during decoding the score for
  // each predicted box. Can be overridden with IGNORE_CLASSES side packet.
  // `ignore_classes` and `allow_classes` are mutually exclusive.
  // Unlike `allow_classes`, this field is not declared `[packed = true]`.
  repeated int32 ignore_classes = 8;
  // The ids of classes that should be allowed during decoding the score for
  // each predicted box. `ignore_classes` and `allow_classes` are mutually
  // exclusive.
  // NOTE: the mutual exclusion is a documented contract only; the two fields
  // are independent repeated fields, not members of a oneof.
  repeated int32 allow_classes = 21 [packed = true];

  // Defaults to false.
  // NOTE(review): undocumented in this schema; presumably whether a sigmoid is
  // applied to the raw scores -- confirm against the calculator
  // implementation.
  optional bool sigmoid_score = 15 [default = false];
  // No explicit default is declared, so this reads as 0.0 when unset.
  // NOTE(review): undocumented in this schema; presumably a bound used to clip
  // raw scores -- confirm against the calculator implementation.
  optional float score_clipping_thresh = 16;

  // Whether the detection coordinates from the input tensors should be flipped
  // vertically (along the y-direction). This is useful, for example, when the
  // input tensors represent detections defined with a coordinate system where
  // the origin is at the top-left corner, whereas the desired detection
  // representation has a bottom-left origin (e.g., in OpenGL).
  optional bool flip_vertically = 18 [default = false];

  // Score threshold for preserving decoded detections.
  // No explicit default is declared, so this reads as 0.0 when unset.
  optional float min_score_thresh = 19;

  // The maximum number of the detection results to return. If < 0, all
  // available results will be returned. Defaults to -1 (return everything).
  // For the detection models that have built-in non max suppression op, the
  // output detections are the top-scored results. Otherwise, the output
  // detections are the first N results that have higher scores than
  // `min_score_thresh`.
  optional int32 max_results = 20 [default = -1];

  // The maximum number of classes per detection. Defaults to 1.
  optional int32 max_classes_per_detection = 25 [default = 1];

  // The custom model output tensor mapping.
  // The indices of the "detections" tensor and the "scores" tensor are always
  // required. If the model outputs an "anchors" tensor, `anchors_tensor_index`
  // must be specified. If the model outputs both "classes" tensor and "number
  // of detections" tensors, `classes_tensor_index` and
  // `num_detections_tensor_index` must be set.
  // As with the other fields, these requirements are not expressed in the
  // schema: every index below is `optional` with no explicit default.
  message TensorMapping {
    // Index of the "detections" tensor.
    optional int32 detections_tensor_index = 1;
    // Index of the "classes" tensor.
    optional int32 classes_tensor_index = 2;
    // Index of the "scores" tensor.
    optional int32 scores_tensor_index = 3;
    // Index of the "number of detections" tensor.
    optional int32 num_detections_tensor_index = 4;
    // Index of the "anchors" tensor.
    optional int32 anchors_tensor_index = 5;
  }
  // Optional custom tensor mapping; see TensorMapping above.
  optional TensorMapping tensor_mapping = 22;

  // Represents the bounding box by using the combination of boundaries,
  // {ymin, xmin, ymax, xmax}.
  // The default order is {ymin, xmin, ymax, xmax}, i.e. the field defaults
  // below are 0, 1, 2 and 3 respectively.
  message BoxBoundariesIndices {
    optional int32 ymin = 1 [default = 0];
    optional int32 xmin = 2 [default = 1];
    optional int32 ymax = 3 [default = 2];
    optional int32 xmax = 4 [default = 3];
  }
  // How the box coordinates are indexed. The oneof currently has a single
  // member, `box_boundaries_indices`.
  oneof box_indices {
    BoxBoundariesIndices box_boundaries_indices = 23;
  }

  // Tells the calculator how to convert the detector output to bounding boxes.
  // Replaces `reverse_output_order` to support more bbox output formats.
  // As with `reverse_output_order`, this also informs calculator the order
  // of keypoint predictions.
  enum BoxFormat {
    // if UNSPECIFIED, the calculator assumes YXHW
    UNSPECIFIED = 0;
    // bbox [y_center, x_center, height, width], keypoint [y, x]
    YXHW = 1;
    // bbox [x_center, y_center, width, height], keypoint [x, y]
    XYWH = 2;
    // bbox [xmin, ymin, xmax, ymax], keypoint [x, y]
    XYXY = 3;
  }
  // Output box format; see BoxFormat above. Defaults to UNSPECIFIED.
  optional BoxFormat box_format = 24 [default = UNSPECIFIED];
}