chromium/third_party/mediapipe/src/mediapipe/modules/objectron/calculators/tensors_to_objects_calculator.cc

// Copyright 2020 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

#include "Eigen/Dense"
#include "absl/log/absl_check.h"
#include "absl/log/absl_log.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_format.h"
#include "absl/types/span.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/deps/file_path.h"
#include "mediapipe/framework/formats/tensor.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/modules/objectron/calculators/annotation_data.pb.h"
#include "mediapipe/modules/objectron/calculators/belief_decoder_config.pb.h"
#include "mediapipe/modules/objectron/calculators/decoder.h"
#include "mediapipe/modules/objectron/calculators/tensor_util.h"
#include "mediapipe/modules/objectron/calculators/tensors_to_objects_calculator.pb.h"

namespace {
// Tags of the calculator's input and output streams.
constexpr char kInputStreamTag[] = "TENSORS";
constexpr char kOutputStreamTag[] = "ANNOTATIONS";

// Counter backing the ids handed to detected objects. It is file-local and
// shared by every caller of GetNextObjectId() in this translation unit; the
// first id handed out is 1.
// NOTE(review): plain int with no synchronization - confirm that calculators
// using it are never processed concurrently.
int object_id = 0;

// Advances the counter by one and returns the new value.
inline int GetNextObjectId() {
  object_id += 1;
  return object_id;
}
}  // namespace

namespace mediapipe {

// Convert result Tensors from deep pursuit 3d model into FrameAnnotation.
//
// The first two tensors of the input vector are consumed: index 0 is used as
// the prediction heatmap and index 1 as the offset map. They are decoded into
// bounding-box keypoints, lifted to 3D, re-projected to 2D, and every
// resulting annotation receives an object id.
//
// Input:
//  TENSORS - Vector of Tensor of type kFloat32.
// Output:
//  ANNOTATIONS - Result FrameAnnotation, emitted at the input timestamp.
//
// Usage example:
// node {
//   calculator: "TensorsToObjectsCalculator"
//   input_stream: "TENSORS:tensors"
//   output_stream: "ANNOTATIONS:annotations"
// }
class TensorsToObjectsCalculator : public CalculatorBase {
 public:
  // Declares the packet types of the TENSORS input and ANNOTATIONS output.
  static absl::Status GetContract(CalculatorContract* cc);

  // Loads the options, fills the projection matrix and creates the decoder.
  absl::Status Open(CalculatorContext* cc) override;
  // Converts the tensors of the current input packet into a FrameAnnotation.
  // Does nothing when the TENSORS input is empty at this timestamp.
  absl::Status Process(CalculatorContext* cc) override;
  // No-op; always returns OK.
  absl::Status Close(CalculatorContext* cc) override;

 private:
  // Runs the decode -> lift to 3D -> project to 2D -> assign id pipeline on
  // the input tensors and stores the result in `output_objects`.
  absl::Status ProcessCPU(CalculatorContext* cc,
                          FrameAnnotation* output_objects);
  // Copies the calculator options from the graph into the members below and
  // verifies that num_values_per_keypoint is 2.
  absl::Status LoadOptions(CalculatorContext* cc);
  // Takes point_3d in FrameAnnotation, projects to 2D, and overwrite the
  // point_2d field with the projection.
  void Project3DTo2D(bool portrait, FrameAnnotation* annotation) const;
  // Increment and assign object ID for each detected object.
  // In a single MediaPipe session, the IDs are unique.
  // Also assign timestamp for the FrameAnnotation to be the input packet
  // timestamp.
  void AssignObjectIdAndTimestamp(int64_t timestamp_us,
                                  FrameAnnotation* annotation);

  // Values of the num_classes / num_keypoints options, cached by
  // LoadOptions().
  int num_classes_ = 0;
  int num_keypoints_ = 0;

  // Calculator options as specified in the graph.
  ::mediapipe::TensorsToObjectsCalculatorOptions options_;
  // Decoder built in Open() from options_.decoder_config().
  std::unique_ptr<Decoder> decoder_;
  // Hard-coded 4x4 projection matrix, filled in Open(); used both for lifting
  // 2D keypoints to 3D and for projecting them back to 2D.
  Eigen::Matrix<float, 4, 4, Eigen::RowMajor> projection_matrix_;
};
REGISTER_CALCULATOR(TensorsToObjectsCalculator);

// Declares the stream types used by the calculator: the TENSORS input carries
// std::vector<mediapipe::Tensor> and the ANNOTATIONS output carries
// FrameAnnotation. Fails when the node has no tagged input or no tagged
// output at all.
absl::Status TensorsToObjectsCalculator::GetContract(CalculatorContract* cc) {
  RET_CHECK(!cc->Inputs().GetTags().empty());
  RET_CHECK(!cc->Outputs().GetTags().empty());

  auto& input_types = cc->Inputs();
  if (input_types.HasTag(kInputStreamTag)) {
    input_types.Tag(kInputStreamTag).Set<std::vector<mediapipe::Tensor>>();
  }

  auto& output_types = cc->Outputs();
  if (output_types.HasTag(kOutputStreamTag)) {
    output_types.Tag(kOutputStreamTag).Set<FrameAnnotation>();
  }

  return absl::OkStatus();
}

// Prepares the calculator for processing: reads the graph options, fills the
// fixed projection matrix and instantiates the belief decoder.
absl::Status TensorsToObjectsCalculator::Open(CalculatorContext* cc) {
  MP_RETURN_IF_ERROR(LoadOptions(cc));

  // Hard-coded row-major 4x4 projection matrix.
  // clang-format off
  projection_matrix_ <<
      1.5731,   0,       0, 0,
      0,   2.0975,       0, 0,
      0,        0, -1.0002, -0.2,
      0,        0,      -1, 0;
  // clang-format on

  // The decoder is configured from the decoder_config sub-message of the
  // calculator options.
  const BeliefDecoderConfig decoder_config(options_.decoder_config());
  decoder_ = absl::make_unique<Decoder>(decoder_config);

  return absl::OkStatus();
}

// Turns the tensors of the current input packet into a FrameAnnotation and
// sends it on the ANNOTATIONS stream with the input timestamp. Timestamps
// without a TENSORS packet are skipped silently.
absl::Status TensorsToObjectsCalculator::Process(CalculatorContext* cc) {
  const bool has_tensors = !cc->Inputs().Tag(kInputStreamTag).IsEmpty();
  if (!has_tensors) {
    return absl::OkStatus();
  }

  std::unique_ptr<FrameAnnotation> frame_annotation =
      absl::make_unique<FrameAnnotation>();
  MP_RETURN_IF_ERROR(ProcessCPU(cc, frame_annotation.get()));

  // Without an ANNOTATIONS output there is nothing to emit; the annotation is
  // simply dropped when the unique_ptr goes out of scope.
  if (!cc->Outputs().HasTag(kOutputStreamTag)) {
    return absl::OkStatus();
  }

  // Add() receives the raw pointer released from the unique_ptr.
  cc->Outputs()
      .Tag(kOutputStreamTag)
      .Add(frame_annotation.release(), cc->InputTimestamp());
  return absl::OkStatus();
}

// Decodes the input tensors into `output_objects`.
//
// Steps:
//   1. Convert tensor 0 (prediction heatmap) and tensor 1 (offset map) to
//      cv::Mat.
//   2. Decode bounding-box keypoints from the two maps.
//   3. Lift the 2D keypoints to 3D using the projection matrix.
//   4. Re-project the 3D points to 2D, overwriting point_2d.
//   5. Assign object ids and the input timestamp (in microseconds).
//
// Returns an error if fewer than two input tensors are provided or if the
// 2D-to-3D lifting fails.
absl::Status TensorsToObjectsCalculator::ProcessCPU(
    CalculatorContext* cc, FrameAnnotation* output_objects) {
  const auto& input_tensors =
      cc->Inputs().Tag(kInputStreamTag).Get<std::vector<mediapipe::Tensor>>();

  // Both tensors are indexed unconditionally below; reject short inputs with
  // an error status instead of reading out of bounds.
  RET_CHECK_GE(input_tensors.size(), 2)
      << "Expected at least 2 input tensors (prediction heatmap and offset "
         "map), got "
      << input_tensors.size();

  cv::Mat prediction_heatmap = ConvertTensorToCvMat(input_tensors[0]);
  cv::Mat offsetmap = ConvertTensorToCvMat(input_tensors[1]);

  *output_objects =
      decoder_->DecodeBoundingBoxKeypoints(prediction_heatmap, offsetmap);
  auto status = decoder_->Lift2DTo3D(projection_matrix_, /*portrait=*/true,
                                     output_objects);
  if (!status.ok()) {
    ABSL_LOG(ERROR) << status;
    return status;
  }
  Project3DTo2D(/*portrait=*/true, output_objects);
  AssignObjectIdAndTimestamp(cc->InputTimestamp().Microseconds(),
                             output_objects);

  return absl::OkStatus();
}

// Nothing to tear down explicitly; always reports success.
absl::Status TensorsToObjectsCalculator::Close(CalculatorContext* /*cc*/) {
  return absl::OkStatus();
}

// Reads the calculator options specified in the graph and caches the values
// used by this calculator.
//
// Returns an error status (rather than aborting the process) when the options
// request an unsupported number of values per keypoint, so that Open() can
// propagate the failure to the graph.
absl::Status TensorsToObjectsCalculator::LoadOptions(CalculatorContext* cc) {
  // Get calculator options specified in the graph.
  options_ = cc->Options<::mediapipe::TensorsToObjectsCalculatorOptions>();

  num_classes_ = options_.num_classes();
  num_keypoints_ = options_.num_keypoints();

  // Currently only support 2D when num_values_per_keypoint equals to 2.
  RET_CHECK_EQ(options_.num_values_per_keypoint(), 2)
      << "Only num_values_per_keypoint == 2 (2D keypoints) is supported.";

  return absl::OkStatus();
}

void TensorsToObjectsCalculator::Project3DTo2D(
    bool portrait, FrameAnnotation* annotation) const {
  for (auto& ann : *annotation->mutable_annotations()) {
    for (auto& key_point : *ann.mutable_keypoints()) {
      Eigen::Vector4f point3d;
      point3d << key_point.point_3d().x(), key_point.point_3d().y(),
          key_point.point_3d().z(), 1.0f;
      Eigen::Vector4f point3d_projection = projection_matrix_ * point3d;
      float u, v;
      const float inv_w = 1.0f / point3d_projection(3);
      if (portrait) {
        u = (point3d_projection(1) * inv_w + 1.0f) * 0.5f;
        v = (point3d_projection(0) * inv_w + 1.0f) * 0.5f;
      } else {
        u = (point3d_projection(0) * inv_w + 1.0f) * 0.5f;
        v = (1.0f - point3d_projection(1) * inv_w) * 0.5f;
      }
      key_point.mutable_point_2d()->set_x(u);
      key_point.mutable_point_2d()->set_y(v);
    }
  }
}

void TensorsToObjectsCalculator::AssignObjectIdAndTimestamp(
    int64_t timestamp_us, FrameAnnotation* annotation) {
  for (auto& ann : *annotation->mutable_annotations()) {
    ann.set_object_id(GetNextObjectId());
  }
  annotation->set_timestamp(timestamp_us);
}

}  // namespace mediapipe