// chromium/third_party/coremltools/mlmodel/format/NonMaximumSuppression.proto

// Copyright (c) 2018, Apple Inc. All rights reserved.
//
// Use of this source code is governed by a BSD-3-clause license that can be
// found in LICENSE.txt or at https://opensource.org/licenses/BSD-3-Clause

// This schema is written against the proto3 language version.
syntax = "proto3";
// Generate code against the lite protobuf runtime (smaller footprint; no
// descriptor/reflection support in the generated classes).
option optimize_for = LITE_RUNTIME;

// Public import: any file importing this one also sees the definitions from
// DataStructures.proto. StringVector and Int64Vector, used by the ClassLabels
// oneof below, are presumably declared there -- confirm against that file.
import public "DataStructures.proto";

// All definitions in this file live in the CoreML.Specification package.
package CoreML.Specification;

/*
 * Non-maximum suppression of axis-aligned bounding boxes.
 *
 * This is used primarily for object detectors that tend to produce multiple
 * boxes around a single object.  This is a byproduct of the detector's
 * robustness to spatial translation. If there are two or more bounding boxes
 * that are very similar to one another, the algorithm should return only a
 * single representative.
 *
 * Similarity between two bounding boxes is measured by intersection-over-union
 * (IOU), the ratio of the area of intersection to the area of the union.
 * Here is an example where the areas can be calculated by hand by counting
 * glyphs::
 *
 *     +-------+                            +-------+
 *     |       |                            |       |
 *     |    +------+          +--+          |       +---+
 *     |    |  |   |          |  |          |           |
 *     +-------+   |          +--+          +----+      |
 *          |      |                             |      |
 *          +------+                             +------+
 *                        Intersection         Union
 *      IOU: 0.16      =       12       /       73
 *
 * All IOU scores are fractions between 0.0 (fully disjoint) and 1.0 (perfect
 * overlap). The standard algorithm (PickTop) is defined as follows:
 *
 *  1. Sort boxes by descending order of confidence
 *  2. Take the top one and mark it as keep
 *  3. Suppress (mark it as discard) all boxes within a fixed IOU radius of the
 *     keep box
 *  4. Go to 2 and repeat on the subset of boxes not already kept or discarded
 *  5. When all boxes are processed, output only the ones marked as keep
 *
 * Before the algorithm, boxes that fall below the confidence threshold are
 * discarded.
 */
message NonMaximumSuppression {
  // Suppression methods:
  /*
   * Pick the bounding box of the top confidence, suppress all within a radius.
   * The radius is expressed as an IOU threshold (see iouThreshold).
   */
  message PickTop {
    /*
     * Suppression is only done among predictions with the same label
     * (argmax of the confidence).
     */
    bool perClass = 1;
  }

  /*
   * Choose which underlying suppression method to use. PickTop is the only
   * method defined in this message.
   */
  oneof SuppressionMethod {
    PickTop pickTop = 1;
  }

  /*
   * Optional class label mapping, given either as a StringVector or as an
   * Int64Vector. Being a oneof, at most one of the two can be set.
   */
  oneof ClassLabels {
    StringVector stringClassLabels = 100;
    Int64Vector int64ClassLabels = 101;
  }

  /*
   * This defines the radius of suppression. A box is considered to be within
   * the radius of another box if their IOU score is greater than this value.
   * Consequently, a threshold of 1.0 means there will be no suppression based
   * on IOU.
   *
   * Can be overridden at run-time via iouThresholdInputFeatureName.
   */
  double iouThreshold = 110;

  /*
   * Remove bounding boxes below this threshold.  The algorithm run-time is
   * proportional to the square of the number of incoming bounding boxes
   * (O(N^2)). This threshold is a way to reduce N to make the algorithm
   * faster. The confidence threshold can be any non-negative value. Negative
   * confidences are not allowed, since if the output shape is specified to be
   * larger than boxes after suppression, the unused boxes are filled with
   * zero confidence. If the prediction is handled by Core Vision, it is also
   * important that confidences are defined with the following semantics:
   *
   *   1. Confidences should be between 0 and 1
   *   2. The sum of the confidences for a prediction should not exceed 1, but
   *      is allowed to be less than 1
   *   3. The sum of the confidences will be interpreted as the confidence of
   *      any object (e.g. if the confidences for two classes are 0.2 and 0.4,
   *      it means there is a 60% (0.2 + 0.4) confidence that an object is
   *      present)
   *
   * Can be overridden at run-time via confidenceThresholdInputFeatureName.
   */
  double confidenceThreshold = 111;

  /*
   * Set the name of the confidence input.
   *
   * The input should be a multi-array of type double and shape N x C. N is
   * the number of boxes and C the number of classes. Each row describes the
   * confidences of each object category being present at that particular
   * location. Confidences should be nonnegative, where 0.0 means the highest
   * certainty the object is not present.
   *
   * Specifying shape is optional.
   */
  string confidenceInputFeatureName = 200;

  /*
   * Set the name of the coordinates input.
   *
   * The input should be a multi-array of type double and shape N x 4. The
   * rows correspond to the rows of the confidence matrix. The four values
   * describe (in order):
   *
   *  - x (center location of the box along the horizontal axis)
   *  - y (center location of the box along the vertical axis)
   *  - width (size of box along the horizontal axis)
   *  - height (size of box along the vertical axis)
   *
   * Specifying shape is optional.
   */
  string coordinatesInputFeatureName = 201;

  /*
   * The iouThreshold can be optionally overridden by specifying this string
   * and providing a corresponding input of type double. This allows changing
   * the value of the parameter during run-time.
   *
   * The input should be a scalar double between 0.0 and 1.0. Setting it to 1.0
   * means there will be no suppression based on IOU.
   */
  string iouThresholdInputFeatureName = 202;

  /*
   * The confidenceThreshold can be optionally overridden by specifying this
   * string and providing a corresponding input. This allows changing the
   * value of the parameter during run-time, which can aid setting it just
   * right for a particular use case.
   *
   * The input should be a scalar double with nonnegative value.
   */
  string confidenceThresholdInputFeatureName = 203;

  /*
   * Set the name of the confidence output. The output will be the same type
   * and shape as the corresponding input. The only difference is that the
   * number of rows may have been reduced.
   *
   * Specifying shape is optional. One reason to specify shape is to limit
   * the number of output boxes. This can be done in several ways:
   *
   * Fixed shape:
   * The output can be pinned to a fixed set of boxes. If this number is larger
   * than the number of boxes that would have been returned, the output is
   * padded with zeros for both confidence and coordinates. Specifying a fixed
   * shape can be done by setting either shape (deprecated) or allowedShapes set
   * to fixedsize.
   *
   * Min/max:
   * It is also possible to set both a minimum and a maximum. The same
   * zero-padding as for fixed shape is applied when necessary. Setting min/max
   * is done by defining two allowedShapes, where the first dimension uses a
   * rangeofsizes defining lowerbound and upperbound.
   */
  string confidenceOutputFeatureName = 210;

  /*
   * Set the name of the coordinates output. The output will be the same type
   * and shape as the corresponding input. The only difference is that the
   * number of rows may have been reduced.
   *
   * Specifying shape is optional. See confidence output for a more detailed
   * description. Note that to achieve either fixed shape output or a
   * constrained range of boxes, only one of confidence or coordinates needs
   * to set a shape. Both shapes are allowed to be defined, but in such case
   * they have to be consistent along dimension 0.
   */
  string coordinatesOutputFeatureName = 211;
}