// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <utility>
#include <vector>
#include "absl/log/absl_check.h"
#include "absl/log/absl_log.h"
#include "mediapipe/calculators/tflite/ssd_anchors_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/object_detection/anchor.pb.h"
#include "mediapipe/framework/port/ret_check.h"
namespace mediapipe {
namespace {
// Per-level parameters used when generating multi-scale grid anchors.
struct MultiScaleAnchorInfo {
  // Pyramid level this entry describes. Zero-initialized so that a
  // default-constructed instance never carries an indeterminate value.
  int32_t level = 0;
  // Aspect ratios applied to every grid cell of this level.
  std::vector<float> aspect_ratios;
  // Scale multipliers applied to the base anchor size.
  std::vector<float> scales;
  // Base anchor size as (height, width).
  std::pair<float, float> base_anchor_size;
  // Distance between neighbouring anchor centers as (y, x).
  std::pair<float, float> anchor_stride;
};
// Size of a feature map grid, in cells.
struct FeatureMapDim {
  // Both members are zero-initialized so that a default-constructed value is
  // an empty grid instead of holding indeterminate values.
  int height = 0;
  int width = 0;
};
// Returns the anchor scale for the layer at `stride_index`, linearly
// interpolated between `min_scale` (first layer) and `max_scale` (last layer).
// With a single layer the midpoint of the two bounds is returned.
float CalculateScale(float min_scale, float max_scale, int stride_index,
                     int num_strides) {
  if (num_strides == 1) {
    return (min_scale + max_scale) * 0.5f;
  }
  // The interpolation is intentionally carried out in double precision and
  // narrowed to float only once, at the very end.
  const double scale_range = static_cast<double>(max_scale - min_scale);
  const double last_index = static_cast<double>(num_strides - 1.0f);
  const double interpolated_offset = scale_range * stride_index / last_index;
  return static_cast<float>(min_scale + interpolated_offset);
}
// Returns how many layers anchors are generated for: the number of pyramid
// levels in [min_level, max_level] when multi-scale generation is enabled,
// otherwise the configured `num_layers`.
int GetNumLayers(const SsdAnchorsCalculatorOptions& options) {
  return options.multiscale_anchor_generation()
             ? (options.max_level() - options.min_level() + 1)
             : options.num_layers();
}
// Returns the feature map grid size for the layer at `index`.
//
// Explicitly configured feature map shapes take precedence. Otherwise the
// size is derived from the input size and the layer's stride, rounding up.
FeatureMapDim GetFeatureMapDimensions(
    const SsdAnchorsCalculatorOptions& options, int index) {
  FeatureMapDim dims;
  if (options.feature_map_height_size() > 0) {
    dims.height = options.feature_map_height(index);
    dims.width = options.feature_map_width(index);
    return dims;
  }
  const int layer_stride = options.strides(index);
  const float rows = std::ceil(1.0f * options.input_size_height() / layer_stride);
  const float cols = std::ceil(1.0f * options.input_size_width() / layer_stride);
  dims.height = static_cast<int>(rows);
  dims.width = static_cast<int>(cols);
  return dims;
}
// Returns the (y, x) offset of the first anchor center for a pyramid level.
//
// An axis is offset by half a stride when the input size along that axis is
// divisible by 2^level, or is exactly 1; otherwise its offset is zero.
//
// Although there is a stride for both x and y, a single value is used for the
// offset calculation. See
// tensorflow_models/object_detection/anchor_generators/multiscale_grid_anchor_generator.py;l=121
std::pair<float, float> GetMultiScaleAnchorOffset(
    const SsdAnchorsCalculatorOptions& options, const float stride,
    const int level) {
  const int level_divisor = std::pow(2, level);
  const float half_stride = static_cast<float>(stride / 2.0);

  float y_offset = 0.0f;
  float x_offset = 0.0f;
  if (options.input_size_height() % level_divisor == 0 ||
      options.input_size_height() == 1) {
    y_offset = half_stride;
  }
  if (options.input_size_width() % level_divisor == 0 ||
      options.input_size_width() == 1) {
    x_offset = half_stride;
  }
  return std::pair<float, float>(y_offset, x_offset);
}
// Rescales `anchor` in place: heights and y coordinates are divided by
// `input_height`, widths and x coordinates by `input_width`.
void NormalizeAnchor(const int input_height, const int input_width,
                     Anchor* anchor) {
  const float height_divisor = static_cast<float>(input_height);
  const float width_divisor = static_cast<float>(input_width);
  anchor->set_h(anchor->h() / height_divisor);
  anchor->set_w(anchor->w() / width_divisor);
  anchor->set_y_center(anchor->y_center() / height_divisor);
  anchor->set_x_center(anchor->x_center() / width_divisor);
}
// Builds the anchor for grid cell (`y_center`, `x_center`).
//
// The box size is the base anchor size multiplied by `scale`, with the height
// divided and the width multiplied by sqrt(`aspect_ratio`). The center is the
// cell index times the stride plus the offset. All pair arguments are ordered
// (y/height, x/width).
Anchor CalculateAnchorBox(const int y_center, const int x_center,
                          const float scale, const float aspect_ratio,
                          const std::pair<float, float> base_anchor_size,
                          // y-height first
                          const std::pair<float, float> anchor_stride,
                          const std::pair<float, float> anchor_offset) {
  const float sqrt_ratio = std::sqrt(aspect_ratio);
  const float box_height = scale * base_anchor_size.first / sqrt_ratio;
  const float box_width = scale * sqrt_ratio * base_anchor_size.second;
  const float center_y = y_center * anchor_stride.first + anchor_offset.first;
  const float center_x = x_center * anchor_stride.second + anchor_offset.second;

  Anchor box;
  box.set_h(box_height);
  box.set_w(box_width);
  box.set_y_center(center_y);
  box.set_x_center(center_x);
  return box;
}
} // namespace
// Produces the anchors used by an SSD object detection model.
//
// Output side packet (index 0):
//   ANCHORS: A std::vector<Anchor>. The model generates predictions as
//            offsets relative to these anchors.
//
// The anchors are either copied from `fixed_anchors` in the options, or
// generated from the remaining option fields. All work happens in Open().
//
// Usage example:
// node {
//   calculator: "SsdAnchorsCalculator"
//   output_side_packet: "anchors"
//   options {
//     [mediapipe.SsdAnchorsCalculatorOptions.ext] {
//       num_layers: 6
//       min_scale: 0.2
//       max_scale: 0.95
//       input_size_height: 300
//       input_size_width: 300
//       anchor_offset_x: 0.5
//       anchor_offset_y: 0.5
//       strides: 16
//       strides: 32
//       strides: 64
//       strides: 128
//       strides: 256
//       strides: 512
//       aspect_ratios: 1.0
//       aspect_ratios: 2.0
//       aspect_ratios: 0.5
//       aspect_ratios: 3.0
//       aspect_ratios: 0.3333
//       reduce_boxes_in_lowest_layer: true
//     }
//   }
// }
class SsdAnchorsCalculator : public CalculatorBase {
 public:
  // Declares the single output side packet holding the anchor list.
  static absl::Status GetContract(CalculatorContract* cc) {
    cc->OutputSidePackets().Index(0).Set<std::vector<Anchor>>();
    return absl::OkStatus();
  }

  // Builds the anchor list and publishes it as the output side packet.
  // Fails if fixed anchors are combined with anchor-generation fields, or if
  // anchor generation itself reports an error.
  absl::Status Open(CalculatorContext* cc) override {
    cc->SetOffset(TimestampDiff(0));

    const auto& options = cc->Options<SsdAnchorsCalculatorOptions>();
    auto anchors = absl::make_unique<std::vector<Anchor>>();

    if (options.fixed_anchors().empty()) {
      MP_RETURN_IF_ERROR(GenerateAnchors(anchors.get(), options));
    } else {
      // Fixed anchors and generation parameters are mutually exclusive.
      const bool generation_fields_set =
          options.has_input_size_height() || options.has_input_size_width() ||
          options.has_min_scale() || options.has_max_scale() ||
          options.has_num_layers() || options.multiscale_anchor_generation();
      if (generation_fields_set) {
        return absl::InvalidArgumentError(
            "Fixed anchors are provided, but fields are set for generating "
            "anchors. When fixed anchors are set, fields for generating "
            "anchors must not be set.");
      }
      anchors->assign(options.fixed_anchors().begin(),
                      options.fixed_anchors().end());
    }

    cc->OutputSidePackets().Index(0).Set(Adopt(anchors.release()));
    return absl::OkStatus();
  }

  // Nothing to do per packet; the anchors are emitted once from Open().
  absl::Status Process(CalculatorContext* cc) override {
    return absl::OkStatus();
  }

 private:
  // Appends generated anchors to `anchors` according to `options`.
  static absl::Status GenerateAnchors(
      std::vector<Anchor>* anchors, const SsdAnchorsCalculatorOptions& options);

  // Appends multi-scale grid anchors to `anchors` according to `options`.
  static absl::Status GenerateMultiScaleAnchors(
      std::vector<Anchor>* anchors, const SsdAnchorsCalculatorOptions& options);
};
// Registers the calculator with the framework; graph configs refer to it by
// name, as in `calculator: "SsdAnchorsCalculator"`.
REGISTER_CALCULATOR(SsdAnchorsCalculator);
// Generates grid anchors on the fly corresponding to multiple CNN layers as
// described in:
// "Focal Loss for Dense Object Detection" (https://arxiv.org/abs/1708.02002)
// T.-Y. Lin, P. Goyal, R. Girshick, K. He, P. Dollar
//
// For every pyramid level in [min_level, max_level] the anchors are appended
// to `anchors` in row-major grid order; within a cell the order is aspect
// ratio first, then scale. The stride of a level is 2^level and its base
// anchor size is the stride times `anchor_scale`. When
// `normalize_coordinates` is set, each anchor is divided by the input size.
//
// Always returns OkStatus.
absl::Status SsdAnchorsCalculator::GenerateMultiScaleAnchors(
    std::vector<Anchor>* anchors, const SsdAnchorsCalculatorOptions& options) {
  const int scales_per_octave = options.scales_per_octave();
  const bool normalize = options.normalize_coordinates();

  // `layer_index` is the zero-based position of `level` inside the pyramid;
  // it selects the matching feature map / stride entry.
  int layer_index = 0;
  for (int level = options.min_level(); level <= options.max_level();
       ++level, ++layer_index) {
    MultiScaleAnchorInfo info;
    info.level = level;
    info.aspect_ratios.assign(options.aspect_ratios().begin(),
                              options.aspect_ratios().end());
    // Scales are spread evenly (in log2 space) over one octave.
    for (int octave_step = 0; octave_step < scales_per_octave; ++octave_step) {
      info.scales.push_back(static_cast<float>(
          std::pow(2.0, static_cast<double>(octave_step) /
                            static_cast<double>(scales_per_octave))));
    }
    const float level_stride = std::pow(2.0, level);
    info.anchor_stride = std::make_pair(level_stride, level_stride);
    info.base_anchor_size =
        std::make_pair(level_stride * options.anchor_scale(),
                       level_stride * options.anchor_scale());

    const FeatureMapDim dimensions =
        GetFeatureMapDimensions(options, layer_index);
    // Nothing to emit for this level; skip before computing the offset so the
    // offset helper is only invoked when at least one anchor is produced.
    if (dimensions.height <= 0 || dimensions.width <= 0 ||
        info.aspect_ratios.empty() || info.scales.empty()) {
      continue;
    }

    // The offset depends only on the level, so compute it once per level
    // rather than once per anchor.
    const std::pair<float, float> anchor_offset = GetMultiScaleAnchorOffset(
        options, info.anchor_stride.first, info.level);

    for (int y = 0; y < dimensions.height; ++y) {
      for (int x = 0; x < dimensions.width; ++x) {
        // Loop over every combination of aspect ratio and scale.
        for (const float aspect_ratio : info.aspect_ratios) {
          for (const float scale : info.scales) {
            Anchor anchor = CalculateAnchorBox(
                /*y_center=*/y, /*x_center=*/x, scale, aspect_ratio,
                info.base_anchor_size,
                /*anchor_stride=*/info.anchor_stride,
                /*anchor_offset=*/anchor_offset);
            if (normalize) {
              NormalizeAnchor(options.input_size_height(),
                              options.input_size_width(), &anchor);
            }
            anchors->push_back(std::move(anchor));
          }
        }
      }
    }
  }
  return absl::OkStatus();
}
// Validates `options` and appends the generated anchors to `anchors`.
//
// When `multiscale_anchor_generation` is set, generation is delegated to
// GenerateMultiScaleAnchors(). Otherwise anchors are laid out on one grid per
// layer; consecutive layers that share the same stride are merged so that
// their anchors are emitted together on the grid of the first such layer.
// Anchor centers are expressed as a fraction of the feature map extent.
//
// Returns InvalidArgumentError when:
//   * neither feature map shapes nor strides are provided,
//   * the number of feature map heights differs from the number of layers or
//     from the number of feature map widths,
//   * only strides are provided and their count differs from the number of
//     layers,
//   * grid generation is requested with fewer strides than layers (strides
//     are needed to derive the per-layer scales and to group layers).
absl::Status SsdAnchorsCalculator::GenerateAnchors(
    std::vector<Anchor>* anchors, const SsdAnchorsCalculatorOptions& options) {
  // Verify the options.
  if (!options.feature_map_height_size() && !options.strides_size()) {
    return absl::InvalidArgumentError(
        "Both feature map shape and strides are missing. Must provide either "
        "one.");
  }
  const int num_layers = GetNumLayers(options);
  // Invalid configurations are reported as errors instead of aborting the
  // process, since this function already returns a status.
  if (options.feature_map_height_size()) {
    if (options.strides_size()) {
      ABSL_LOG(ERROR) << "Found feature map shapes. Strides will be ignored.";
    }
    if (options.feature_map_height_size() != num_layers) {
      return absl::InvalidArgumentError(
          "The number of feature map heights must match the number of "
          "layers.");
    }
    if (options.feature_map_height_size() !=
        options.feature_map_width_size()) {
      return absl::InvalidArgumentError(
          "The number of feature map heights must match the number of "
          "feature map widths.");
    }
  } else if (options.strides_size() != num_layers) {
    return absl::InvalidArgumentError(
        "The number of strides must match the number of layers.");
  }

  if (options.multiscale_anchor_generation()) {
    return GenerateMultiScaleAnchors(anchors, options);
  }

  // The loop below advances by walking the strides list. With fewer strides
  // than layers it would never make progress, so reject that up front.
  if (options.strides_size() < options.num_layers()) {
    return absl::InvalidArgumentError(
        "Grid anchor generation requires one stride per layer, even when "
        "feature map shapes are provided.");
  }

  const int num_strides = options.strides_size();
  int layer_id = 0;
  while (layer_id < options.num_layers()) {
    std::vector<float> aspect_ratios;
    std::vector<float> scales;

    // For same strides, we merge the anchors in the same order.
    int last_same_stride_layer = layer_id;
    while (last_same_stride_layer < num_strides &&
           options.strides(last_same_stride_layer) ==
               options.strides(layer_id)) {
      const float scale =
          CalculateScale(options.min_scale(), options.max_scale(),
                         last_same_stride_layer, num_strides);
      if (last_same_stride_layer == 0 &&
          options.reduce_boxes_in_lowest_layer()) {
        // For first layer, it can be specified to use predefined anchors.
        aspect_ratios.push_back(1.0f);
        aspect_ratios.push_back(2.0f);
        aspect_ratios.push_back(0.5f);
        scales.push_back(0.1f);
        scales.push_back(scale);
        scales.push_back(scale);
      } else {
        for (int aspect_ratio_id = 0;
             aspect_ratio_id < options.aspect_ratios_size();
             ++aspect_ratio_id) {
          aspect_ratios.push_back(options.aspect_ratios(aspect_ratio_id));
          scales.push_back(scale);
        }
        if (options.interpolated_scale_aspect_ratio() > 0.0) {
          // Extra anchor whose scale is the geometric mean of this layer's
          // scale and the next one's (1.0 past the last layer).
          const float scale_next =
              last_same_stride_layer == num_strides - 1
                  ? 1.0f
                  : CalculateScale(options.min_scale(), options.max_scale(),
                                   last_same_stride_layer + 1, num_strides);
          scales.push_back(std::sqrt(scale * scale_next));
          aspect_ratios.push_back(options.interpolated_scale_aspect_ratio());
        }
      }
      last_same_stride_layer++;
    }

    // Convert each (scale, aspect ratio) pair into a height and a width.
    std::vector<float> anchor_height;
    std::vector<float> anchor_width;
    anchor_height.reserve(aspect_ratios.size());
    anchor_width.reserve(aspect_ratios.size());
    for (size_t i = 0; i < aspect_ratios.size(); ++i) {
      const float ratio_sqrts = std::sqrt(aspect_ratios[i]);
      anchor_height.push_back(scales[i] / ratio_sqrts);
      anchor_width.push_back(scales[i] * ratio_sqrts);
    }

    const FeatureMapDim feature_map =
        GetFeatureMapDimensions(options, layer_id);

    for (int y = 0; y < feature_map.height; ++y) {
      const float y_center =
          (y + options.anchor_offset_y()) * 1.0f / feature_map.height;
      for (int x = 0; x < feature_map.width; ++x) {
        const float x_center =
            (x + options.anchor_offset_x()) * 1.0f / feature_map.width;
        for (size_t anchor_id = 0; anchor_id < anchor_height.size();
             ++anchor_id) {
          Anchor new_anchor;
          new_anchor.set_x_center(x_center);
          new_anchor.set_y_center(y_center);
          if (options.fixed_anchor_size()) {
            new_anchor.set_w(1.0f);
            new_anchor.set_h(1.0f);
          } else {
            new_anchor.set_w(anchor_width[anchor_id]);
            new_anchor.set_h(anchor_height[anchor_id]);
          }
          anchors->push_back(std::move(new_anchor));
        }
      }
    }
    // Continue with the first layer that has a different stride.
    layer_id = last_same_stride_layer;
  }
  return absl::OkStatus();
}
} // namespace mediapipe