// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mediapipe/util/tracking/box_detector.h"
#include <vector>
#include "absl/log/absl_check.h"
#include "absl/log/absl_log.h"
#include "absl/memory/memory.h"
#include "mediapipe/framework/port/opencv_calib3d_inc.h"
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
#include "mediapipe/framework/port/opencv_video_inc.h"
#include "mediapipe/util/tracking/box_detector.pb.h"
#include "mediapipe/util/tracking/box_tracker.h"
#include "mediapipe/util/tracking/measure_time.h"
namespace mediapipe {
namespace {
void ScaleBox(float scale_x, float scale_y, TimedBoxProto *box) {
box->set_left(box->left() * scale_x);
box->set_right(box->right() * scale_x);
box->set_top(box->top() * scale_y);
box->set_bottom(box->bottom() * scale_y);
if (box->has_quad()) {
for (int c = 0; c < 4; ++c) {
(*box->mutable_quad()->mutable_vertices())[2 * c] *= scale_x;
(*box->mutable_quad()->mutable_vertices())[2 * c + 1] *= scale_y;
}
}
}
cv::Mat ConvertDescriptorsToMat(const std::vector<std::string> &descriptors) {
ABSL_CHECK(!descriptors.empty()) << "empty descriptors.";
const int descriptors_dims = descriptors[0].size();
ABSL_CHECK_GT(descriptors_dims, 0);
cv::Mat mat(descriptors.size(), descriptors_dims, CV_8U);
for (int j = 0; j < descriptors.size(); ++j) {
memcpy(mat.row(j).data, descriptors[j].data(), descriptors_dims);
}
return mat;
}
cv::Mat GetDescriptorsWithIndices(const cv::Mat &frame_descriptors,
const std::vector<int> &indices) {
ABSL_CHECK_GT(frame_descriptors.rows, 0);
const int num_inlier_descriptors = indices.size();
ABSL_CHECK_GT(num_inlier_descriptors, 0);
const int descriptors_dims = frame_descriptors.cols;
ABSL_CHECK_GT(descriptors_dims, 0);
cv::Mat mat(num_inlier_descriptors, descriptors_dims, CV_32F);
for (int j = 0; j < indices.size(); ++j) {
frame_descriptors.row(indices[j]).copyTo(mat.row(j));
}
return mat;
}
} // namespace
// Using OpenCV brute force matcher along with cross validate match to conduct
// the query.
class BoxDetectorOpencvBfImpl : public BoxDetectorInterface {
public:
explicit BoxDetectorOpencvBfImpl(const BoxDetectorOptions &options);
private:
std::vector<FeatureCorrespondence> MatchFeatureDescriptors(
const std::vector<Vector2_f> &features, const cv::Mat &descriptors,
int box_idx) override;
cv::BFMatcher bf_matcher_;
};
std::unique_ptr<BoxDetectorInterface> BoxDetectorInterface::Create(
const BoxDetectorOptions &options) {
if (options.index_type() == BoxDetectorOptions::OPENCV_BF) {
return absl::make_unique<BoxDetectorOpencvBfImpl>(options);
} else {
ABSL_LOG(FATAL) << "index type undefined.";
}
}
BoxDetectorInterface::BoxDetectorInterface(const BoxDetectorOptions &options)
: options_(options) {}
void BoxDetectorInterface::DetectAndAddBoxFromFeatures(
const std::vector<Vector2_f> &features, const cv::Mat &descriptors,
const TimedBoxProtoList &tracked_boxes, int64_t timestamp_msec,
float scale_x, float scale_y, TimedBoxProtoList *detected_boxes) {
absl::MutexLock lock_access(&access_to_index_);
image_scale_ = std::min(scale_x, scale_y);
image_aspect_ = scale_x / scale_y;
int size_before_add = box_id_to_idx_.size();
std::vector<bool> tracked(size_before_add, false);
for (const auto &box : tracked_boxes.box()) {
if (!box.reacquisition()) {
continue;
}
const absl::flat_hash_map<int, int>::iterator iter =
box_id_to_idx_.find(box.id());
if (iter == box_id_to_idx_.end()) {
// De-normalize the input box to image scale
TimedBoxProto scaled_box = box;
ScaleBox(scale_x, scale_y, &scaled_box);
AddBoxFeaturesToIndex(features, descriptors, scaled_box,
/*transform_features_for_pnp*/ true);
} else {
int box_idx = iter->second;
tracked[box_idx] = true;
float center_x = 0.0f;
float center_y = 0.0f;
if (!box.has_quad()) {
center_x = (box.left() + box.right()) * 0.5f;
center_y = (box.top() + box.bottom()) * 0.5f;
} else {
for (int c = 0; c < 4; ++c) {
center_x += box.quad().vertices(c * 2);
center_y += box.quad().vertices(c * 2 + 1);
}
center_x /= 4;
center_y /= 4;
}
if (center_x < 0.0f || center_x > 1.0f || center_y < 0.0f ||
center_y > 1.0f) {
has_been_out_of_fov_[box_idx] = true;
}
}
}
for (int idx = 0; idx < size_before_add; ++idx) {
if ((options_.detect_every_n_frame() > 0 &&
cnt_detect_called_ % options_.detect_every_n_frame() == 0) ||
!tracked[idx] ||
(options_.detect_out_of_fov() && has_been_out_of_fov_[idx])) {
TimedBoxProtoList det = DetectBox(features, descriptors, idx);
if (det.box_size() > 0) {
det.mutable_box(0)->set_time_msec(timestamp_msec);
// Convert the result box to normalized space.
ScaleBox(1.0f / scale_x, 1.0f / scale_y, det.mutable_box(0));
*detected_boxes->add_box() = det.box(0);
has_been_out_of_fov_[idx] = false;
}
}
}
// reset timer after detect or add action.
cnt_detect_called_ = 1;
}
void BoxDetectorInterface::DetectAndAddBox(
const TrackingData &tracking_data, const TimedBoxProtoList &tracked_boxes,
int64_t timestamp_msec, TimedBoxProtoList *detected_boxes) {
std::vector<Vector2_f> features_from_tracking_data;
std::vector<std::string> descriptors_from_tracking_data;
FeatureAndDescriptorFromTrackingData(tracking_data,
&features_from_tracking_data,
&descriptors_from_tracking_data);
if (features_from_tracking_data.empty() ||
descriptors_from_tracking_data.empty()) {
ABSL_LOG(WARNING)
<< "Detection skipped due to empty features or descriptors.";
return;
}
cv::Mat frame_descriptors =
ConvertDescriptorsToMat(descriptors_from_tracking_data);
float scale_x, scale_y;
ScaleFromAspect(tracking_data.frame_aspect(), /*invert*/ false, &scale_x,
&scale_y);
DetectAndAddBoxFromFeatures(features_from_tracking_data, frame_descriptors,
tracked_boxes, timestamp_msec, scale_x, scale_y,
detected_boxes);
}
bool BoxDetectorInterface::CheckDetectAndAddBox(
const TimedBoxProtoList &tracked_boxes) {
bool need_add = false;
int cnt_tracked = 0;
for (const auto &box : tracked_boxes.box()) {
if (!box.reacquisition()) {
continue;
}
const absl::flat_hash_map<int, int>::iterator iter =
box_id_to_idx_.find(box.id());
if (iter == box_id_to_idx_.end()) {
need_add = true;
break;
} else {
++cnt_tracked;
}
}
// When new boxes being added for reacquisition, we need to run redetection.
if (need_add) {
return true;
}
const bool is_periodical_check_on = options_.detect_every_n_frame() > 0;
const bool need_periodical_check =
is_periodical_check_on &&
(cnt_detect_called_ % options_.detect_every_n_frame() == 0);
// When configured to do periodical check, and when need to run the periodical
// check, we run redetection.
if (need_periodical_check) {
return true;
}
const bool any_reacquisition_box_missing =
!box_id_to_idx_.empty() && (cnt_tracked < box_id_to_idx_.size());
// When NOT configured to use periodical check, we run redetection EVERY frame
// when any reacquisition box is missing. Note this path of redetection
// including re-run feature extraction and is expensive, might cause graph
// throttling on low end devices.
if (!is_periodical_check_on && any_reacquisition_box_missing) {
return true;
}
// Other cases, increment the cnt_detect_called_ number and return false to
// not run redetection.
++cnt_detect_called_;
return false;
}
void BoxDetectorInterface::DetectAndAddBox(
const cv::Mat &image, const TimedBoxProtoList &tracked_boxes,
int64_t timestamp_msec, TimedBoxProtoList *detected_boxes) {
// Determine if we need execute feature extraction.
if (!CheckDetectAndAddBox(tracked_boxes)) {
return;
}
const auto &image_query_settings = options_.image_query_settings();
cv::Mat grayscale;
if (image.channels() == 3) {
cv::cvtColor(image, grayscale, cv::COLOR_BGR2GRAY);
} else if (image.channels() == 4) {
cv::cvtColor(image, grayscale, cv::COLOR_RGBA2GRAY);
} else {
grayscale = image;
}
cv::Mat resize_image;
const int longer_edge = std::max(grayscale.cols, grayscale.rows);
const float longer_edge_scaled = image_query_settings.pyramid_bottom_size();
if (longer_edge <= longer_edge_scaled) {
resize_image = grayscale;
} else {
float resize_scale = longer_edge_scaled / longer_edge;
cv::resize(
grayscale, resize_image,
cv::Size(resize_scale * grayscale.cols, resize_scale * grayscale.rows),
cv::INTER_AREA);
}
// Use cv::ORB feature extractor for now since it provides better quality of
// detection results compared with manually constructing pyramid and then use
// OrbFeatureDescriptor.
// TODO: Tune OrbFeatureDescriptor to hit similar quality.
if (!orb_extractor_) {
orb_extractor_ =
cv::ORB::create(image_query_settings.max_features(),
image_query_settings.pyramid_scale_factor(),
image_query_settings.max_pyramid_levels());
}
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
orb_extractor_->detect(resize_image, keypoints);
orb_extractor_->compute(resize_image, keypoints, descriptors);
ABSL_CHECK_EQ(keypoints.size(), descriptors.rows);
float inv_scale = 1.0f / std::max(resize_image.cols, resize_image.rows);
std::vector<Vector2_f> v_keypoints(keypoints.size());
for (int j = 0; j < keypoints.size(); ++j) {
v_keypoints[j] =
Vector2_f(keypoints[j].pt.x * inv_scale, keypoints[j].pt.y * inv_scale);
}
float scale_x = resize_image.cols * inv_scale;
float scale_y = resize_image.rows * inv_scale;
DetectAndAddBoxFromFeatures(v_keypoints, descriptors, tracked_boxes,
timestamp_msec, scale_x, scale_y, detected_boxes);
}
TimedBoxProtoList BoxDetectorInterface::DetectBox(
const std::vector<Vector2_f> &features, const cv::Mat &descriptors,
int box_idx) {
return FindBoxesFromFeatureCorrespondence(
MatchFeatureDescriptors(features, descriptors, box_idx), box_idx);
}
TimedBoxProtoList BoxDetectorInterface::FindBoxesFromFeatureCorrespondence(
const std::vector<FeatureCorrespondence> &matches, int box_idx) {
int max_corr = -1;
int max_corr_frame = 0;
for (int j = 0; j < matches.size(); ++j) {
int num_corr = matches[j].points_frame.size();
if (num_corr > max_corr) {
max_corr = num_corr;
max_corr_frame = j;
}
}
TimedBoxProtoList result_list;
constexpr int kMinNumCorrespondence = 10;
if (max_corr < kMinNumCorrespondence) {
return result_list;
}
const TimedBoxProto &ori_box = frame_box_[box_idx][max_corr_frame];
if (!ori_box.has_quad()) {
cv::Mat similarity =
cv::estimateRigidTransform(matches[max_corr_frame].points_index,
matches[max_corr_frame].points_frame, false);
if (similarity.cols == 0 || similarity.rows == 0) return result_list;
float similarity_scale =
std::hypot(similarity.at<double>(0, 0), similarity.at<double>(1, 0));
float similarity_theta =
std::atan2(similarity.at<double>(1, 0), similarity.at<double>(0, 0));
auto *new_box_ptr = result_list.add_box();
float box_center_x = 0.5f * (ori_box.left() + ori_box.right());
float box_center_y = 0.5f * (ori_box.top() + ori_box.bottom());
float new_center_x = similarity.at<double>(0, 0) * box_center_x +
similarity.at<double>(0, 1) * box_center_y +
similarity.at<double>(0, 2);
float new_center_y = similarity.at<double>(1, 0) * box_center_x +
similarity.at<double>(1, 1) * box_center_y +
similarity.at<double>(1, 2);
new_box_ptr->set_left((ori_box.left() - box_center_x) * similarity_scale +
new_center_x);
new_box_ptr->set_right((ori_box.right() - box_center_x) * similarity_scale +
new_center_x);
new_box_ptr->set_top((ori_box.top() - box_center_y) * similarity_scale +
new_center_y);
new_box_ptr->set_bottom(
(ori_box.bottom() - box_center_y) * similarity_scale + new_center_y);
new_box_ptr->set_rotation(ori_box.rotation() + similarity_theta);
new_box_ptr->set_id(box_idx_to_id_[box_idx]);
new_box_ptr->set_reacquisition(true);
return result_list;
} else {
return FindQuadFromFeatureCorrespondence(matches[max_corr_frame], ori_box,
image_aspect_);
}
}
TimedBoxProtoList BoxDetectorInterface::FindQuadFromFeatureCorrespondence(
const FeatureCorrespondence &matches, const TimedBoxProto &box_proto,
float frame_aspect) {
TimedBoxProtoList result_list;
if (matches.points_frame.size() != matches.points_index.size()) {
ABSL_LOG(ERROR) << matches.points_frame.size() << " vs "
<< matches.points_index.size()
<< ". Correpondence size doesn't match.";
return result_list;
}
int matches_size = matches.points_frame.size();
if (matches_size < options_.min_num_correspondence()) {
return result_list;
}
constexpr int kRansacMaxIterations = 100;
cv::Mat inliers_set;
cv::Mat homography = cv::findHomography(
matches.points_index, matches.points_frame, cv::FM_RANSAC,
options_.ransac_reprojection_threshold(), inliers_set,
kRansacMaxIterations);
// Check if the orientation is preserved, otherwise quad will be flipped.
double det = homography.at<double>(0, 0) * homography.at<double>(1, 1) -
homography.at<double>(0, 1) * homography.at<double>(1, 0);
if (det < 0) {
return result_list;
}
double persp = homography.at<double>(2, 0) * homography.at<double>(2, 0) +
homography.at<double>(2, 1) * homography.at<double>(2, 1);
if (persp > options_.max_perspective_factor()) {
return result_list;
}
std::vector<cv::Point2f> frame_corners;
if (frame_aspect > 0.0f && box_proto.has_aspect_ratio() &&
box_proto.aspect_ratio() > 0.0f) {
float box_scale_x, box_scale_y;
ScaleFromAspect(box_proto.aspect_ratio(), /*invert*/ false, &box_scale_x,
&box_scale_y);
const float box_half_x = box_scale_x * 0.5;
const float box_half_y = box_scale_y * 0.5;
float frame_scale_x, frame_scale_y;
ScaleFromAspect(frame_aspect, /*invert*/ false, &frame_scale_x,
&frame_scale_y);
const float frame_half_x = frame_scale_x * 0.5;
const float frame_half_y = frame_scale_y * 0.5;
std::vector<cv::Point3f> vectors_3d;
vectors_3d.reserve(matches_size);
std::vector<cv::Point2f> vectors_2d;
vectors_2d.reserve(matches_size);
for (int j = 0; j < matches_size; ++j) {
if (inliers_set.at<uchar>(j)) {
vectors_3d.emplace_back(matches.points_index[j].x - box_half_x,
matches.points_index[j].y - box_half_y, 0.0f);
vectors_2d.emplace_back(matches.points_frame[j].x - frame_half_x,
matches.points_frame[j].y - frame_half_y);
}
}
constexpr int kMinCorrespondences = 4;
if (vectors_3d.size() < kMinCorrespondences) {
return result_list;
}
// TODO: Use camera intrinsic if provided.
cv::Mat rvec, tvec;
cv::solvePnP(vectors_3d, vectors_2d, cv::Mat::eye(3, 3, CV_64F),
cv::Mat::zeros(1, 5, CV_64FC1), rvec, tvec);
std::vector<cv::Point3f> template_corners{
cv::Point3f(-box_half_x, -box_half_y, 0),
cv::Point3f(-box_half_x, box_half_y, 0),
cv::Point3f(box_half_x, box_half_y, 0),
cv::Point3f(box_half_x, -box_half_y, 0)};
cv::projectPoints(template_corners, rvec, tvec, cv::Mat::eye(3, 3, CV_64F),
cv::Mat::zeros(1, 5, CV_64FC1), frame_corners);
for (int j = 0; j < 4; ++j) {
frame_corners[j].x += frame_half_x;
frame_corners[j].y += frame_half_y;
}
} else {
std::vector<cv::Point2f> template_corners{
cv::Point2f(box_proto.quad().vertices(0), box_proto.quad().vertices(1)),
cv::Point2f(box_proto.quad().vertices(2), box_proto.quad().vertices(3)),
cv::Point2f(box_proto.quad().vertices(4), box_proto.quad().vertices(5)),
cv::Point2f(box_proto.quad().vertices(6),
box_proto.quad().vertices(7))};
cv::perspectiveTransform(template_corners, frame_corners, homography);
}
auto *new_box_ptr = result_list.add_box();
float min_x = std::numeric_limits<float>::max();
float max_x = std::numeric_limits<float>::lowest();
float min_y = std::numeric_limits<float>::max();
float max_y = std::numeric_limits<float>::lowest();
for (int c = 0; c < 4; ++c) {
new_box_ptr->mutable_quad()->add_vertices(frame_corners[c].x);
new_box_ptr->mutable_quad()->add_vertices(frame_corners[c].y);
min_x = std::min(min_x, frame_corners[c].x);
max_x = std::max(max_x, frame_corners[c].x);
min_y = std::min(min_y, frame_corners[c].y);
max_y = std::max(max_y, frame_corners[c].y);
}
new_box_ptr->set_left(min_x);
new_box_ptr->set_right(max_x);
new_box_ptr->set_top(min_y);
new_box_ptr->set_bottom(max_y);
new_box_ptr->set_rotation(0.0f);
new_box_ptr->set_id(box_proto.id());
new_box_ptr->set_reacquisition(true);
if (box_proto.has_aspect_ratio()) {
new_box_ptr->set_aspect_ratio(box_proto.aspect_ratio());
}
return result_list;
}
std::vector<int> BoxDetectorInterface::GetFeatureIndexWithinBox(
const std::vector<Vector2_f> &features, const TimedBoxProto &box) {
std::vector<int> insider_idx;
if (features.empty()) return insider_idx;
MotionBoxState box_state;
if (!box.has_quad()) {
box_state.set_pos_x(box.left());
box_state.set_pos_y(box.top());
box_state.set_width(box.right() - box.left());
box_state.set_height(box.bottom() - box.top());
box_state.set_rotation(box.rotation());
} else {
auto *state_quad_ptr = box_state.mutable_quad();
for (int c = 0; c < 8; ++c) {
state_quad_ptr->add_vertices(box.quad().vertices(c));
}
}
const Vector2_f box_scaling(1.0f, 1.0f);
constexpr float kScaleFactorForBoxEnlarging = 0.1f;
constexpr int kMinNumFeatures = 60;
GetFeatureIndicesWithinBox(
features, box_state, box_scaling,
/*max_enlarge_size=*/image_scale_ * kScaleFactorForBoxEnlarging,
/*min_num_features=*/kMinNumFeatures, &insider_idx);
return insider_idx;
}
void BoxDetectorInterface::AddBoxFeaturesToIndex(
const std::vector<Vector2_f> &features, const cv::Mat &descriptors,
const TimedBoxProto &box, bool transform_features_for_pnp) {
std::vector<int> insider_idx = GetFeatureIndexWithinBox(features, box);
if (!insider_idx.empty()) {
const absl::flat_hash_map<int, int>::iterator iter =
box_id_to_idx_.find(box.id());
int box_idx;
if (iter == box_id_to_idx_.end()) {
box_idx = box_id_to_idx_.size();
box_id_to_idx_[box.id()] = box_idx;
box_idx_to_id_.push_back(box.id());
frame_box_.resize(box_id_to_idx_.size());
feature_to_frame_.resize(box_id_to_idx_.size());
feature_keypoints_.resize(box_id_to_idx_.size());
feature_descriptors_.resize(box_id_to_idx_.size());
has_been_out_of_fov_.push_back(false);
} else {
box_idx = iter->second;
has_been_out_of_fov_[box_idx] = false;
}
// Create a frame
int frame_id = frame_box_[box_idx].size();
frame_box_[box_idx].push_back(box);
cv::Mat box_descriptors =
GetDescriptorsWithIndices(descriptors, insider_idx);
if (feature_descriptors_[box_idx].rows == 0) {
feature_descriptors_[box_idx] = box_descriptors;
} else {
cv::vconcat(feature_descriptors_[box_idx], box_descriptors,
feature_descriptors_[box_idx]);
}
if (box.has_aspect_ratio() && transform_features_for_pnp) {
// TODO: Dynamically switching between pnp and homography
// detection is not supported. The detector can only perform detection in
// one mode in its lifetime.
float scale_x, scale_y;
ScaleFromAspect(box.aspect_ratio(), /*invert*/ false, &scale_x, &scale_y);
std::vector<cv::Point2f> corners_template{
cv::Point2f(0.0f, 0.0f), cv::Point2f(0.0f, scale_y),
cv::Point2f(scale_x, scale_y), cv::Point2f(scale_x, 0.0f)};
std::vector<cv::Point2f> corners_frame(4);
for (int j = 0; j < 4; ++j) {
corners_frame[j].x = box.quad().vertices(j * 2);
corners_frame[j].y = box.quad().vertices(j * 2 + 1);
}
cv::Mat h_transform = cv::findHomography(corners_frame, corners_template);
std::vector<cv::Point2f> features_frame, features_template;
for (int j = 0; j < insider_idx.size(); ++j) {
features_frame.emplace_back(features[insider_idx[j]].x(),
features[insider_idx[j]].y());
}
cv::perspectiveTransform(features_frame, features_template, h_transform);
for (int j = 0; j < features_template.size(); ++j) {
feature_keypoints_[box_idx].emplace_back(features_template[j].x,
features_template[j].y);
}
} else {
for (int j = 0; j < insider_idx.size(); ++j) {
feature_keypoints_[box_idx].emplace_back(features[insider_idx[j]]);
}
}
for (int j = 0; j < insider_idx.size(); ++j) {
feature_to_frame_[box_idx].push_back(frame_id);
}
}
}
void BoxDetectorInterface::CancelBoxDetection(int box_id) {
absl::MutexLock lock_access(&access_to_index_);
const absl::flat_hash_map<int, int>::iterator iter =
box_id_to_idx_.find(box_id);
if (iter == box_id_to_idx_.end()) {
return;
} else {
const int erase_idx = iter->second;
frame_box_.erase(frame_box_.begin() + erase_idx);
feature_to_frame_.erase(feature_to_frame_.begin() + erase_idx);
feature_keypoints_.erase(feature_keypoints_.begin() + erase_idx);
feature_descriptors_.erase(feature_descriptors_.begin() + erase_idx);
has_been_out_of_fov_.erase(has_been_out_of_fov_.begin() + erase_idx);
box_idx_to_id_.erase(box_idx_to_id_.begin() + erase_idx);
box_id_to_idx_.erase(iter);
for (int j = erase_idx; j < box_idx_to_id_.size(); ++j) {
box_id_to_idx_[box_idx_to_id_[j]] = j;
}
}
}
BoxDetectorIndex BoxDetectorInterface::ObtainBoxDetectorIndex() const {
absl::MutexLock lock_access(&access_to_index_);
BoxDetectorIndex index;
for (int j = 0; j < frame_box_.size(); ++j) {
BoxDetectorIndex::BoxEntry *box_ptr = index.add_box_entry();
for (int i = 0; i < frame_box_[j].size(); ++i) {
BoxDetectorIndex::BoxEntry::FrameEntry *frame_ptr =
box_ptr->add_frame_entry();
*(frame_ptr->mutable_box()) = frame_box_[j][i];
}
for (int k = 0; k < feature_to_frame_[j].size(); ++k) {
BoxDetectorIndex::BoxEntry::FrameEntry *frame_ptr =
box_ptr->mutable_frame_entry(feature_to_frame_[j][k]);
frame_ptr->add_keypoints(feature_keypoints_[j][k].x());
frame_ptr->add_keypoints(feature_keypoints_[j][k].y());
frame_ptr->add_descriptors()->set_data(
static_cast<void *>(feature_descriptors_[j].row(k).data),
feature_descriptors_[j].cols * sizeof(float));
}
}
return index;
}
void BoxDetectorInterface::AddBoxDetectorIndex(const BoxDetectorIndex &index) {
absl::MutexLock lock_access(&access_to_index_);
for (int j = 0; j < index.box_entry_size(); ++j) {
const auto &box_entry = index.box_entry(j);
for (int i = 0; i < box_entry.frame_entry_size(); ++i) {
const auto &frame_entry = box_entry.frame_entry(i);
// If the box to be added already exists in the index, skip.
if (box_id_to_idx_.find(frame_entry.box().id()) != box_id_to_idx_.end()) {
continue;
}
ABSL_CHECK_EQ(frame_entry.keypoints_size(),
frame_entry.descriptors_size() * 2);
const int num_features = frame_entry.descriptors_size();
ABSL_CHECK_GT(num_features, 0);
std::vector<Vector2_f> features(num_features);
const int descriptors_dims = frame_entry.descriptors(0).data().size();
ABSL_CHECK_GT(descriptors_dims, 0);
cv::Mat descriptors_mat(num_features, descriptors_dims / sizeof(float),
CV_32F);
for (int k = 0; k < num_features; ++k) {
features[k] = Vector2_f(frame_entry.keypoints(2 * k),
frame_entry.keypoints(2 * k + 1));
memcpy(descriptors_mat.row(k).data,
frame_entry.descriptors(k).data().data(), descriptors_dims);
}
AddBoxFeaturesToIndex(features, descriptors_mat, frame_entry.box());
}
}
}
BoxDetectorOpencvBfImpl::BoxDetectorOpencvBfImpl(
const BoxDetectorOptions &options)
: BoxDetectorInterface(options), bf_matcher_(cv::NORM_L2, true) {}
std::vector<FeatureCorrespondence>
BoxDetectorOpencvBfImpl::MatchFeatureDescriptors(
const std::vector<Vector2_f> &features, const cv::Mat &descriptors,
int box_idx) {
ABSL_CHECK_EQ(features.size(), descriptors.rows);
std::vector<FeatureCorrespondence> correspondence_result(
frame_box_[box_idx].size());
if (features.empty() || descriptors.rows == 0 || descriptors.cols == 0) {
return correspondence_result;
}
int knn = 1;
std::vector<std::vector<cv::DMatch>> matches;
bf_matcher_.knnMatch(descriptors, feature_descriptors_[box_idx], matches,
knn);
// Hamming distance threshold for best match distance. This max distance
// filtering rejects some of false matches which has not been rejected by
// cross match validation. And the value is determined emprically.
for (const auto &match_pair : matches) {
if (match_pair.size() < knn) continue;
const cv::DMatch &best_match = match_pair[0];
if (best_match.distance > options_.max_match_distance()) continue;
int match_idx = feature_to_frame_[box_idx][best_match.trainIdx];
correspondence_result[match_idx].points_frame.push_back(cv::Point2f(
features[best_match.queryIdx].x(), features[best_match.queryIdx].y()));
correspondence_result[match_idx].points_index.push_back(
cv::Point2f(feature_keypoints_[box_idx][best_match.trainIdx].x(),
feature_keypoints_[box_idx][best_match.trainIdx].y()));
}
return correspondence_result;
}
} // namespace mediapipe