chromium/third_party/mediapipe/src/mediapipe/calculators/video/motion_analysis_calculator.cc

// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cmath>
#include <fstream>
#include <memory>
#include <string>

#include "absl/log/absl_check.h"
#include "absl/log/absl_log.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "mediapipe/calculators/video/motion_analysis_calculator.pb.h"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/formats/image_frame.h"
#include "mediapipe/framework/formats/image_frame_opencv.h"
#include "mediapipe/framework/formats/video_stream_header.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/ret_check.h"
#include "mediapipe/framework/port/status.h"
#include "mediapipe/util/tracking/camera_motion.h"
#include "mediapipe/util/tracking/camera_motion.pb.h"
#include "mediapipe/util/tracking/frame_selection.pb.h"
#include "mediapipe/util/tracking/motion_analysis.h"
#include "mediapipe/util/tracking/motion_estimation.h"
#include "mediapipe/util/tracking/motion_models.h"
#include "mediapipe/util/tracking/region_flow.pb.h"

namespace mediapipe {

constexpr char kDownsampleTag[] = "DOWNSAMPLE";
constexpr char kCsvFileTag[] = "CSV_FILE";
constexpr char kGrayVideoOutTag[] = "GRAY_VIDEO_OUT";
constexpr char kVideoOutTag[] = "VIDEO_OUT";
constexpr char kDenseFgTag[] = "DENSE_FG";
constexpr char kVizTag[] = "VIZ";
constexpr char kSaliencyTag[] = "SALIENCY";
constexpr char kCameraTag[] = "CAMERA";
constexpr char kFlowTag[] = "FLOW";
constexpr char kSelectionTag[] = "SELECTION";
constexpr char kVideoTag[] = "VIDEO";

using mediapipe::AffineAdapter;
using mediapipe::CameraMotion;
using mediapipe::FrameSelectionResult;
using mediapipe::Homography;
using mediapipe::HomographyAdapter;
using mediapipe::LinearSimilarityModel;
using mediapipe::MixtureHomography;
using mediapipe::MixtureRowWeights;
using mediapipe::MotionAnalysis;
using mediapipe::ProjectViaFit;
using mediapipe::RegionFlowComputationOptions;
using mediapipe::RegionFlowFeatureList;
using mediapipe::SalientPointFrame;
using mediapipe::TranslationModel;

constexpr char kOptionsTag[] = "OPTIONS";

// A calculator that performs motion analysis on an incoming video stream.
//
// Input streams (at least one is required):
//   VIDEO:     The input video stream (ImageFrame, sRGB, sRGBA or GRAY8).
//   SELECTION: Optional input stream to perform analysis only on selected
//              frames. If present, needs to contain camera motion
//              and features.
//
// Input side packets:
//   CSV_FILE:  Read motion models as homographies from a CSV file. The
//              homographies are expected to be defined in the frame domain
//              (un-normalized), stored as 9 floats per row.
//              Specify the number of homographies per frame via the option
//              meta_models_per_frame. For values > 1, MixtureHomographies
//              are created; for a value of 1, a single Homography is used.
//   DOWNSAMPLE: Optional downsampling factor, passed via input side packet,
//               overriding the value in the graph settings.
// Output streams (all are optional):
//   FLOW:      Sparse feature tracks in form of proto RegionFlowFeatureList.
//   CAMERA:    Camera motion as proto CameraMotion describing the per frame-
//              pair motion. Has VideoHeader from input video.
//   SALIENCY:  Foreground saliency (objects moving different from the
//              background) as proto SalientPointFrame.
//   VIZ:       Visualization stream as ImageFrame, sRGB, visualizing
//              features and saliency (set via
//              analysis_options().visualization_options()).
//   DENSE_FG:  Dense foreground stream, describing per-pixel foregroundness
//              as a confidence between 0 (background) and 255 (foreground).
//              Output is ImageFrame (GRAY8).
//   VIDEO_OUT: Optional output stream when SELECTION is used. Output is the
//              input VIDEO at the selected frames. Requires VIDEO to be
//              present.
//   GRAY_VIDEO_OUT: Optional output stream for downsampled, grayscale video.
//                   Requires VIDEO to be present and SELECTION to not be used.
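//
// Example graph config (a minimal sketch; the stream names after the tags
// are placeholders):
//   node {
//     calculator: "MotionAnalysisCalculator"
//     input_stream: "VIDEO:input_video"
//     output_stream: "CAMERA:camera_motion"
//     output_stream: "FLOW:region_flow"
//   }
//
// Example CSV_FILE contents for meta_models_per_frame == 1 (one 9-float
// homography per row, shown here as the identity):
//   1,0,0,0,1,0,0,0,1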
class MotionAnalysisCalculator : public CalculatorBase {
  // TODO: Activate once leakr approval is ready.
  // typedef com::google::android::libraries::micro::proto::Data HomographyData;

 public:
  ~MotionAnalysisCalculator() override = default;

  static absl::Status GetContract(CalculatorContract* cc);

  absl::Status Open(CalculatorContext* cc) override;
  absl::Status Process(CalculatorContext* cc) override;
  absl::Status Close(CalculatorContext* cc) override;

 private:
  // Outputs results to Outputs() if MotionAnalysis has buffered sufficient
  // results; otherwise a no-op. Set flush to true to force output of all
  // buffered data.
  void OutputMotionAnalyzedFrames(bool flush, CalculatorContext* cc);

  // Lazy init function to be called on Process.
  absl::Status InitOnProcess(InputStream* video_stream,
                             InputStream* selection_stream);

  // Parses CSV file contents to homographies.
  bool ParseModelCSV(const std::string& contents,
                     std::deque<Homography>* homographies);

  // Turns list of 9-tuple floating values into set of homographies.
  bool HomographiesFromValues(const std::vector<float>& homog_values,
                              std::deque<Homography>* homographies);

  // Appends CameraMotions and features from homographies.
  // Set append_identity to true to add an identity transform to the beginning
  // of each list *in addition* to the motions derived from homographies.
  void AppendCameraMotionsFromHomographies(
      const std::deque<Homography>& homographies, bool append_identity,
      std::deque<CameraMotion>* camera_motions,
      std::deque<RegionFlowFeatureList>* features);

  // Helper function to subtract the current metadata motion from features.
  // Used in the hybrid estimation case.
  void SubtractMetaMotion(const CameraMotion& meta_motion,
                          RegionFlowFeatureList* features);

  // Inverse of the above function: adds back the meta motion and replaces
  // feature locations with the originals after estimation.
  void AddMetaMotion(const CameraMotion& meta_motion,
                     const RegionFlowFeatureList& meta_features,
                     RegionFlowFeatureList* features, CameraMotion* motion);

  MotionAnalysisCalculatorOptions options_;
  int frame_width_ = -1;
  int frame_height_ = -1;
  int frame_idx_ = 0;

  // Buffers incoming video frame packets (if visualization or video output is
  // requested).
  std::vector<Packet> packet_buffer_;

  // Buffers incoming timestamps until MotionAnalysis is ready to output via
  // above OutputMotionAnalyzedFrames.
  std::vector<Timestamp> timestamp_buffer_;

  // Input indicators for each stream.
  bool selection_input_ = false;
  bool video_input_ = false;

  // Output indicators for each stream.
  bool region_flow_feature_output_ = false;
  bool camera_motion_output_ = false;
  bool saliency_output_ = false;
  bool visualize_output_ = false;
  bool dense_foreground_output_ = false;
  bool video_output_ = false;
  bool grayscale_output_ = false;
  bool csv_file_input_ = false;

  // Indicates if saliency should be computed.
  bool with_saliency_ = false;

  // Set if hybrid meta analysis - see proto for details.
  bool hybrid_meta_analysis_ = false;

  // Concatenated motions for each selected frame. Used in case hybrid
  // estimation is requested, to fall back to valid models.
  std::deque<CameraMotion> selected_motions_;

  // Normalized homographies from CSV file or metadata.
  std::deque<Homography> meta_homographies_;
  std::deque<CameraMotion> meta_motions_;
  std::deque<RegionFlowFeatureList> meta_features_;

  // Offset into the above meta_motions_ and meta_features_ when using
  // hybrid meta analysis.
  int hybrid_meta_offset_ = 0;

  std::unique_ptr<MotionAnalysis> motion_analysis_;

  std::unique_ptr<MixtureRowWeights> row_weights_;
};

REGISTER_CALCULATOR(MotionAnalysisCalculator);

absl::Status MotionAnalysisCalculator::GetContract(CalculatorContract* cc) {
  if (cc->Inputs().HasTag(kVideoTag)) {
    cc->Inputs().Tag(kVideoTag).Set<ImageFrame>();
  }

  // Optional input stream from frame selection calculator.
  if (cc->Inputs().HasTag(kSelectionTag)) {
    cc->Inputs().Tag(kSelectionTag).Set<FrameSelectionResult>();
  }

  RET_CHECK(cc->Inputs().HasTag(kVideoTag) ||
            cc->Inputs().HasTag(kSelectionTag))
      << "Either VIDEO, SELECTION must be specified.";

  if (cc->Outputs().HasTag(kFlowTag)) {
    cc->Outputs().Tag(kFlowTag).Set<RegionFlowFeatureList>();
  }

  if (cc->Outputs().HasTag(kCameraTag)) {
    cc->Outputs().Tag(kCameraTag).Set<CameraMotion>();
  }

  if (cc->Outputs().HasTag(kSaliencyTag)) {
    cc->Outputs().Tag(kSaliencyTag).Set<SalientPointFrame>();
  }

  if (cc->Outputs().HasTag(kVizTag)) {
    cc->Outputs().Tag(kVizTag).Set<ImageFrame>();
  }

  if (cc->Outputs().HasTag(kDenseFgTag)) {
    cc->Outputs().Tag(kDenseFgTag).Set<ImageFrame>();
  }

  if (cc->Outputs().HasTag(kVideoOutTag)) {
    cc->Outputs().Tag(kVideoOutTag).Set<ImageFrame>();
  }

  if (cc->Outputs().HasTag(kGrayVideoOutTag)) {
    // We only output grayscale video if we're actually performing full region-
    // flow analysis on the video.
    RET_CHECK(cc->Inputs().HasTag(kVideoTag) &&
              !cc->Inputs().HasTag(kSelectionTag));
    cc->Outputs().Tag(kGrayVideoOutTag).Set<ImageFrame>();
  }

  if (cc->InputSidePackets().HasTag(kCsvFileTag)) {
    cc->InputSidePackets().Tag(kCsvFileTag).Set<std::string>();
  }
  if (cc->InputSidePackets().HasTag(kDownsampleTag)) {
    cc->InputSidePackets().Tag(kDownsampleTag).Set<float>();
  }

  if (cc->InputSidePackets().HasTag(kOptionsTag)) {
    cc->InputSidePackets().Tag(kOptionsTag).Set<CalculatorOptions>();
  }

  return absl::OkStatus();
}

absl::Status MotionAnalysisCalculator::Open(CalculatorContext* cc) {
  options_ =
      tool::RetrieveOptions(cc->Options<MotionAnalysisCalculatorOptions>(),
                            cc->InputSidePackets(), kOptionsTag);

  video_input_ = cc->Inputs().HasTag(kVideoTag);
  selection_input_ = cc->Inputs().HasTag(kSelectionTag);
  region_flow_feature_output_ = cc->Outputs().HasTag(kFlowTag);
  camera_motion_output_ = cc->Outputs().HasTag(kCameraTag);
  saliency_output_ = cc->Outputs().HasTag(kSaliencyTag);
  visualize_output_ = cc->Outputs().HasTag(kVizTag);
  dense_foreground_output_ = cc->Outputs().HasTag(kDenseFgTag);
  video_output_ = cc->Outputs().HasTag(kVideoOutTag);
  grayscale_output_ = cc->Outputs().HasTag(kGrayVideoOutTag);
  csv_file_input_ = cc->InputSidePackets().HasTag(kCsvFileTag);
  hybrid_meta_analysis_ = options_.meta_analysis() ==
                          MotionAnalysisCalculatorOptions::META_ANALYSIS_HYBRID;

  if (video_output_) {
    RET_CHECK(selection_input_) << "VIDEO_OUT requires SELECTION input";
  }

  if (selection_input_) {
    switch (options_.selection_analysis()) {
      case MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION:
        RET_CHECK(!visualize_output_)
            << "Visualization not supported for NO_ANALYSIS_USE_SELECTION";
        RET_CHECK(!dense_foreground_output_)
            << "Dense foreground not supported for NO_ANALYSIS_USE_SELECTION";
        RET_CHECK(!saliency_output_)
            << "Saliency output not supported for NO_ANALYSIS_USE_SELECTION";
        break;

      case MotionAnalysisCalculatorOptions::ANALYSIS_RECOMPUTE:
      case MotionAnalysisCalculatorOptions::ANALYSIS_WITH_SEED:
        RET_CHECK(video_input_) << "Need video input for feature tracking.";
        break;

      case MotionAnalysisCalculatorOptions::ANALYSIS_FROM_FEATURES:
        // Nothing to add here.
        break;
    }
  }

  if (visualize_output_ || dense_foreground_output_ || video_output_) {
    RET_CHECK(video_input_) << "Video input required.";
  }

  if (csv_file_input_) {
    RET_CHECK(!selection_input_)
        << "Can not use selection input with csv input.";
    if (!hybrid_meta_analysis_) {
      RET_CHECK(!saliency_output_ && !visualize_output_ &&
                !dense_foreground_output_ && !grayscale_output_)
          << "CSV file and meta input only supports flow and camera motion "
          << "output when using metadata only.";
    }
  }

  if (csv_file_input_) {
    // Read from file and parse.
    const std::string filename =
        cc->InputSidePackets().Tag(kCsvFileTag).Get<std::string>();

    std::string file_contents;
    std::ifstream input_file(filename, std::ios::in);
    RET_CHECK(input_file.is_open()) << "Could not open CSV file: " << filename;
    input_file.seekg(0, std::ios::end);
    const int file_length = input_file.tellg();
    file_contents.resize(file_length);
    input_file.seekg(0, std::ios::beg);
    input_file.read(&file_contents[0], file_length);
    input_file.close();

    RET_CHECK(ParseModelCSV(file_contents, &meta_homographies_))
        << "Could not parse CSV file";
  }

  // Get video header from video or selection input if present.
  const VideoHeader* video_header = nullptr;
  if (video_input_ && !cc->Inputs().Tag(kVideoTag).Header().IsEmpty()) {
    video_header = &(cc->Inputs().Tag(kVideoTag).Header().Get<VideoHeader>());
  } else if (selection_input_ &&
             !cc->Inputs().Tag(kSelectionTag).Header().IsEmpty()) {
    video_header =
        &(cc->Inputs().Tag(kSelectionTag).Header().Get<VideoHeader>());
  } else {
    ABSL_LOG(WARNING) << "No input video header found. Downstream calculators "
                         "expecting video headers are likely to fail.";
  }

  with_saliency_ = options_.analysis_options().compute_motion_saliency();
  // Force computation of saliency if requested as output.
  if (cc->Outputs().HasTag(kSaliencyTag)) {
    with_saliency_ = true;
    if (!options_.analysis_options().compute_motion_saliency()) {
      ABSL_LOG(WARNING) << "Enable saliency computation. Set "
                        << "compute_motion_saliency to true to silence this "
                        << "warning.";
      options_.mutable_analysis_options()->set_compute_motion_saliency(true);
    }
  }

  if (options_.bypass_mode()) {
    cc->SetOffset(TimestampDiff(0));
  }

  if (cc->InputSidePackets().HasTag(kDownsampleTag)) {
    options_.mutable_analysis_options()
        ->mutable_flow_options()
        ->set_downsample_factor(
            cc->InputSidePackets().Tag(kDownsampleTag).Get<float>());
  }

  // If no video header is provided, just return and initialize on the first
  // Process() call.
  if (video_header == nullptr) {
    return absl::OkStatus();
  }

  ////////////// EARLY RETURN; ONLY HEADER OUTPUT SHOULD GO HERE ///////////////

  if (visualize_output_) {
    cc->Outputs().Tag(kVizTag).SetHeader(Adopt(new VideoHeader(*video_header)));
  }

  if (video_output_) {
    cc->Outputs()
        .Tag(kVideoOutTag)
        .SetHeader(Adopt(new VideoHeader(*video_header)));
  }

  if (cc->Outputs().HasTag(kDenseFgTag)) {
    std::unique_ptr<VideoHeader> foreground_header(
        new VideoHeader(*video_header));
    foreground_header->format = ImageFormat::GRAY8;
    cc->Outputs()
        .Tag(kDenseFgTag)
        .SetHeader(Adopt(foreground_header.release()));
  }

  if (cc->Outputs().HasTag(kCameraTag)) {
    cc->Outputs()
        .Tag(kCameraTag)
        .SetHeader(Adopt(new VideoHeader(*video_header)));
  }

  if (cc->Outputs().HasTag(kSaliencyTag)) {
    cc->Outputs()
        .Tag(kSaliencyTag)
        .SetHeader(Adopt(new VideoHeader(*video_header)));
  }

  return absl::OkStatus();
}

absl::Status MotionAnalysisCalculator::Process(CalculatorContext* cc) {
  if (options_.bypass_mode()) {
    return absl::OkStatus();
  }

  InputStream* video_stream =
      video_input_ ? &(cc->Inputs().Tag(kVideoTag)) : nullptr;
  InputStream* selection_stream =
      selection_input_ ? &(cc->Inputs().Tag(kSelectionTag)) : nullptr;

  // Checked on Open.
  ABSL_CHECK(video_stream || selection_stream);

  // Lazy init.
  if (frame_width_ < 0 || frame_height_ < 0) {
    MP_RETURN_IF_ERROR(InitOnProcess(video_stream, selection_stream));
  }

  const Timestamp timestamp = cc->InputTimestamp();
  if (csv_file_input_ && !hybrid_meta_analysis_) {
    if (camera_motion_output_) {
      RET_CHECK(!meta_motions_.empty()) << "Insufficient metadata.";

      CameraMotion output_motion = meta_motions_.front();
      meta_motions_.pop_front();
      output_motion.set_timestamp_usec(timestamp.Value());
      cc->Outputs()
          .Tag(kCameraTag)
          .Add(new CameraMotion(output_motion), timestamp);
    }

    if (region_flow_feature_output_) {
      RET_CHECK(!meta_features_.empty()) << "Insufficient frames in CSV file";
      RegionFlowFeatureList output_features = meta_features_.front();
      meta_features_.pop_front();

      output_features.set_timestamp_usec(timestamp.Value());
      cc->Outputs().Tag(kFlowTag).Add(
          new RegionFlowFeatureList(output_features), timestamp);
    }

    ++frame_idx_;
    return absl::OkStatus();
  }

  if (motion_analysis_ == nullptr) {
    // Lazily create MotionAnalysis here; it is not needed in the
    // metadata-only path, which returns above.
    motion_analysis_.reset(new MotionAnalysis(options_.analysis_options(),
                                              frame_width_, frame_height_));
  }

  std::unique_ptr<FrameSelectionResult> frame_selection_result;
  // Always use frame if selection is not activated.
  bool use_frame = !selection_input_;
  if (selection_input_) {
    ABSL_CHECK(selection_stream);

    // Consume the frame selection result for this timestamp, if present.
    if (!selection_stream->Value().IsEmpty()) {
      MP_ASSIGN_OR_RETURN(
          frame_selection_result,
          selection_stream->Value().ConsumeOrCopy<FrameSelectionResult>());
      use_frame = true;

      // Make sure both features and camera motion are present.
      RET_CHECK(frame_selection_result->has_camera_motion() &&
                frame_selection_result->has_features())
          << "Frame selection input error at: " << timestamp
          << " both camera motion and features need to be "
             "present in FrameSelectionResult. "
          << frame_selection_result->has_camera_motion() << " , "
          << frame_selection_result->has_features();
    }
  }

  if (selection_input_ && use_frame &&
      options_.selection_analysis() ==
          MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION) {
    // Output concatenated results, nothing to compute here.
    if (camera_motion_output_) {
      cc->Outputs()
          .Tag(kCameraTag)
          .Add(frame_selection_result->release_camera_motion(), timestamp);
    }
    if (region_flow_feature_output_) {
      cc->Outputs().Tag(kFlowTag).Add(
          frame_selection_result->release_features(), timestamp);
    }

    if (video_output_) {
      cc->Outputs().Tag(kVideoOutTag).AddPacket(video_stream->Value());
    }

    return absl::OkStatus();
  }

  if (use_frame) {
    if (!selection_input_) {
      const cv::Mat input_view =
          formats::MatView(&video_stream->Get<ImageFrame>());
      if (hybrid_meta_analysis_) {
        // Seed with meta homography.
        RET_CHECK(hybrid_meta_offset_ < static_cast<int>(meta_motions_.size()))
            << "Not enough metadata received for hybrid meta analysis";
        Homography initial_transform =
            meta_motions_[hybrid_meta_offset_].homography();
        std::function<void(RegionFlowFeatureList*)> subtract_helper = std::bind(
            &MotionAnalysisCalculator::SubtractMetaMotion, this,
            meta_motions_[hybrid_meta_offset_], std::placeholders::_1);

        // Keep the original features around before modification.
        motion_analysis_->AddFrameGeneric(
            input_view, timestamp.Value(), initial_transform, nullptr, nullptr,
            &subtract_helper, &meta_features_[hybrid_meta_offset_]);
        ++hybrid_meta_offset_;
      } else {
        motion_analysis_->AddFrame(input_view, timestamp.Value());
      }
    } else {
      selected_motions_.push_back(frame_selection_result->camera_motion());
      switch (options_.selection_analysis()) {
        case MotionAnalysisCalculatorOptions::NO_ANALYSIS_USE_SELECTION:
          return mediapipe::UnknownErrorBuilder(MEDIAPIPE_LOC)
                 << "Should not reach this point!";

        case MotionAnalysisCalculatorOptions::ANALYSIS_FROM_FEATURES:
          motion_analysis_->AddFeatures(frame_selection_result->features());
          break;

        case MotionAnalysisCalculatorOptions::ANALYSIS_RECOMPUTE: {
          const cv::Mat input_view =
              formats::MatView(&video_stream->Get<ImageFrame>());
          motion_analysis_->AddFrame(input_view, timestamp.Value());
          break;
        }

        case MotionAnalysisCalculatorOptions::ANALYSIS_WITH_SEED: {
          Homography homography;
          CameraMotionToHomography(frame_selection_result->camera_motion(),
                                   &homography);
          const cv::Mat input_view =
              formats::MatView(&video_stream->Get<ImageFrame>());
          motion_analysis_->AddFrameGeneric(input_view, timestamp.Value(),
                                            homography, &homography);
          break;
        }
      }
    }

    timestamp_buffer_.push_back(timestamp);
    ++frame_idx_;

    VLOG_EVERY_N(1, 100) << "Analyzed frame " << frame_idx_;

    // Buffer input frames if visualization or video output is requested.
    if (visualize_output_ || video_output_) {
      packet_buffer_.push_back(video_stream->Value());
    }

    // If requested, output the downsampled grayscale frames.
    if (grayscale_output_) {
      cv::Mat grayscale_mat = motion_analysis_->GetGrayscaleFrameFromResults();
      std::unique_ptr<ImageFrame> grayscale_image(new ImageFrame(
          ImageFormat::GRAY8, grayscale_mat.cols, grayscale_mat.rows));
      cv::Mat image_frame_mat = formats::MatView(grayscale_image.get());
      grayscale_mat.copyTo(image_frame_mat);

      cc->Outputs()
          .Tag(kGrayVideoOutTag)
          .Add(grayscale_image.release(), timestamp);
    }

    // Output any other results that are ready.
    OutputMotionAnalyzedFrames(false, cc);
  }

  return absl::OkStatus();
}

absl::Status MotionAnalysisCalculator::Close(CalculatorContext* cc) {
  // Guard against empty videos.
  if (motion_analysis_) {
    OutputMotionAnalyzedFrames(true, cc);
  }
  if (csv_file_input_) {
    if (!meta_motions_.empty()) {
      ABSL_LOG(ERROR) << "More motions than frames. Unexpected! Remainder: "
                      << meta_motions_.size();
    }
  }
  return absl::OkStatus();
}

void MotionAnalysisCalculator::OutputMotionAnalyzedFrames(
    bool flush, CalculatorContext* cc) {
  std::vector<std::unique_ptr<RegionFlowFeatureList>> features;
  std::vector<std::unique_ptr<CameraMotion>> camera_motions;
  std::vector<std::unique_ptr<SalientPointFrame>> saliency;

  const int buffer_size = timestamp_buffer_.size();
  const int num_results = motion_analysis_->GetResults(
      flush, &features, &camera_motions, with_saliency_ ? &saliency : nullptr);

  ABSL_CHECK_LE(num_results, buffer_size);

  if (num_results == 0) {
    return;
  }

  for (int k = 0; k < num_results; ++k) {
    // Region flow features and camera motion for this frame.
    auto& feature_list = features[k];
    auto& camera_motion = camera_motions[k];
    const Timestamp timestamp = timestamp_buffer_[k];

    if (selection_input_ && options_.hybrid_selection_camera()) {
      if (camera_motion->type() > selected_motions_.front().type()) {
        // Composited type is more stable.
        camera_motion->Swap(&selected_motions_.front());
      }
      selected_motions_.pop_front();
    }

    if (hybrid_meta_analysis_) {
      AddMetaMotion(meta_motions_.front(), meta_features_.front(),
                    feature_list.get(), camera_motion.get());
      meta_motions_.pop_front();
      meta_features_.pop_front();
    }

    // Video frame for visualization.
    std::unique_ptr<ImageFrame> visualization_frame;
    cv::Mat visualization;
    if (visualize_output_) {
      // Initialize visualization frame with original frame.
      visualization_frame.reset(new ImageFrame());
      // Copy with a 16-byte row alignment boundary.
      visualization_frame->CopyFrom(packet_buffer_[k].Get<ImageFrame>(), 16);
      visualization = formats::MatView(visualization_frame.get());

      motion_analysis_->RenderResults(
          *feature_list, *camera_motion,
          with_saliency_ ? saliency[k].get() : nullptr, &visualization);

      cc->Outputs().Tag(kVizTag).Add(visualization_frame.release(), timestamp);
    }

    // Output dense foreground mask.
    if (dense_foreground_output_) {
      std::unique_ptr<ImageFrame> foreground_frame(
          new ImageFrame(ImageFormat::GRAY8, frame_width_, frame_height_));
      cv::Mat foreground = formats::MatView(foreground_frame.get());
      motion_analysis_->ComputeDenseForeground(*feature_list, *camera_motion,
                                               &foreground);
      cc->Outputs().Tag(kDenseFgTag).Add(foreground_frame.release(), timestamp);
    }

    // Output flow features if requested.
    if (region_flow_feature_output_) {
      cc->Outputs().Tag(kFlowTag).Add(feature_list.release(), timestamp);
    }

    // Output camera motion.
    if (camera_motion_output_) {
      cc->Outputs().Tag(kCameraTag).Add(camera_motion.release(), timestamp);
    }

    if (video_output_) {
      cc->Outputs().Tag(kVideoOutTag).AddPacket(packet_buffer_[k]);
    }

    // Output saliency.
    if (saliency_output_) {
      cc->Outputs().Tag(kSaliencyTag).Add(saliency[k].release(), timestamp);
    }
  }

  if (hybrid_meta_analysis_) {
    hybrid_meta_offset_ -= num_results;
    ABSL_CHECK_GE(hybrid_meta_offset_, 0);
  }

  timestamp_buffer_.erase(timestamp_buffer_.begin(),
                          timestamp_buffer_.begin() + num_results);

  if (visualize_output_ || video_output_) {
    packet_buffer_.erase(packet_buffer_.begin(),
                         packet_buffer_.begin() + num_results);
  }
}

absl::Status MotionAnalysisCalculator::InitOnProcess(
    InputStream* video_stream, InputStream* selection_stream) {
  if (video_stream) {
    frame_width_ = video_stream->Get<ImageFrame>().Width();
    frame_height_ = video_stream->Get<ImageFrame>().Height();

    // Ensure image options are set correctly.
    auto* region_options =
        options_.mutable_analysis_options()->mutable_flow_options();

    // Use two possible formats to account for different channel orders.
    RegionFlowComputationOptions::ImageFormat image_format;
    RegionFlowComputationOptions::ImageFormat image_format2;
    switch (video_stream->Get<ImageFrame>().Format()) {
      case ImageFormat::GRAY8:
        image_format = image_format2 =
            RegionFlowComputationOptions::FORMAT_GRAYSCALE;
        break;

      case ImageFormat::SRGB:
        image_format = RegionFlowComputationOptions::FORMAT_RGB;
        image_format2 = RegionFlowComputationOptions::FORMAT_BGR;
        break;

      case ImageFormat::SRGBA:
        image_format = RegionFlowComputationOptions::FORMAT_RGBA;
        image_format2 = RegionFlowComputationOptions::FORMAT_BGRA;
        break;

      default:
        RET_CHECK(false) << "Unsupported image format.";
    }
    if (region_options->image_format() != image_format &&
        region_options->image_format() != image_format2) {
      ABSL_LOG(WARNING) << "Requested image format in RegionFlowComputation "
                        << "does not match video stream format. Overriding.";
      region_options->set_image_format(image_format);
    }

    // Account for downsampling mode DOWNSAMPLE_TO_INPUT_SIZE. In this case we
    // are handed already-downsampled frames, but the resulting CameraMotion
    // should be computed at the higher resolution specified by the downsample
    // factor.
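    // For example (illustrative values): with downsample_factor == 2.0 and
    // 640x360 input frames, the reported CameraMotion domain is 1280x720.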
    if (region_options->downsample_mode() ==
        RegionFlowComputationOptions::DOWNSAMPLE_TO_INPUT_SIZE) {
      const float scale = region_options->downsample_factor();
      frame_width_ = static_cast<int>(std::round(frame_width_ * scale));
      frame_height_ = static_cast<int>(std::round(frame_height_ * scale));
    }
  } else if (selection_stream) {
    const auto& camera_motion =
        selection_stream->Get<FrameSelectionResult>().camera_motion();
    frame_width_ = camera_motion.frame_width();
    frame_height_ = camera_motion.frame_height();
  } else {
    ABSL_LOG(FATAL) << "Either VIDEO or SELECTION stream need to be specified.";
  }

  // Filled by CSV file parsing.
  if (!meta_homographies_.empty()) {
    ABSL_CHECK(csv_file_input_);
    AppendCameraMotionsFromHomographies(meta_homographies_,
                                        true,  // append identity.
                                        &meta_motions_, &meta_features_);
    meta_homographies_.clear();
  }

  // Filter initialized IRLS weights before use in hybrid mode.
  if (hybrid_meta_analysis_) {
    auto* motion_options =
        options_.mutable_analysis_options()->mutable_motion_options();
    motion_options->set_filter_initialized_irls_weights(true);
  }

  return absl::OkStatus();
}

bool MotionAnalysisCalculator::ParseModelCSV(
    const std::string& contents, std::deque<Homography>* homographies) {
  std::vector<absl::string_view> values =
      absl::StrSplit(contents, absl::ByAnyChar(",\n"));

  // Trim off any trailing empty entries (e.g., from a final newline).
  while (!values.empty() && values.back().empty()) {
    values.pop_back();
  }

  // Convert to float.
  std::vector<float> homog_values;
  homog_values.reserve(values.size());

  for (const auto& value : values) {
    double value_64f;
    if (!absl::SimpleAtod(value, &value_64f)) {
      ABSL_LOG(ERROR) << "Expected a double value, got: " << value;
      return false;
    }

    homog_values.push_back(static_cast<float>(value_64f));
  }

  return HomographiesFromValues(homog_values, homographies);
}

bool MotionAnalysisCalculator::HomographiesFromValues(
    const std::vector<float>& homog_values,
    std::deque<Homography>* homographies) {
  ABSL_CHECK(homographies);

  // A homography is defined by 9 values.
  constexpr int kHomographyValues = 9;
  if (homog_values.size() % kHomographyValues != 0) {
    ABSL_LOG(ERROR) << "Contents not a multiple of " << kHomographyValues;
    return false;
  }

  for (int k = 0; k < homog_values.size(); k += kHomographyValues) {
    std::vector<double> h_vals(kHomographyValues);
    for (int l = 0; l < kHomographyValues; ++l) {
      h_vals[l] = homog_values[k + l];
    }

    // Normalize last entry to 1.
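    // E.g., (2, 0, 0, 0, 2, 0, 0, 0, 2) is scaled to the identity
    // (1, 0, 0, 0, 1, 0, 0, 0, 1).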
    if (h_vals[kHomographyValues - 1] == 0) {
      ABSL_LOG(ERROR) << "Degenerate homography, last entry is zero";
      return false;
    }

    const double scale = 1.0 / h_vals[kHomographyValues - 1];
    for (int l = 0; l < kHomographyValues; ++l) {
      h_vals[l] *= scale;
    }

    Homography h = HomographyAdapter::FromDoublePointer(h_vals.data(), false);
    homographies->push_back(h);
  }

  if (homographies->size() % options_.meta_models_per_frame() != 0) {
    ABSL_LOG(ERROR) << "Total homographies not a multiple of specified models "
                    << "per frame.";
    return false;
  }

  return true;
}

void MotionAnalysisCalculator::SubtractMetaMotion(
    const CameraMotion& meta_motion, RegionFlowFeatureList* features) {
  if (meta_motion.mixture_homography().model_size() > 0) {
    ABSL_CHECK(row_weights_ != nullptr);
    RegionFlowFeatureListViaTransform(meta_motion.mixture_homography(),
                                      features, -1.0f,
                                      1.0f,  // subtract transformed.
                                      true,  // replace feature loc.
                                      row_weights_.get());
  } else {
    RegionFlowFeatureListViaTransform(meta_motion.homography(), features, -1.0f,
                                      1.0f,   // subtract transformed.
                                      true);  // replace feature loc.
  }

  // Clamp transformed features to domain and handle outliers.
  const float domain_diam =
      std::hypot(features->frame_width(), features->frame_height());
  const float motion_mag = meta_motion.average_magnitude();
  // Same IRLS fraction as used by MODEL_MIXTURE_HOMOGRAPHY scaling in
  // MotionEstimation.
  const float irls_fraction = options_.analysis_options()
                                  .motion_options()
                                  .irls_mixture_fraction_scale() *
                              options_.analysis_options()
                                  .motion_options()
                                  .irls_motion_magnitude_fraction();
  float err_scale = std::max(1.0f, motion_mag * irls_fraction);

  const float max_err =
      options_.meta_outlier_domain_ratio() * domain_diam * err_scale;
  const float max_err_sq = max_err * max_err;
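  // Illustrative example: a 1280x720 frame gives domain_diam ~= 1469; with a
  // meta_outlier_domain_ratio of 0.01 (hypothetical value) and err_scale == 1,
  // features with residual flow above ~14.7 pixels are marked as outliers.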

  for (auto& feature : *features->mutable_feature()) {
    feature.set_x(
        std::max(0.0f, std::min(features->frame_width() - 1.0f, feature.x())));
    feature.set_y(
        std::max(0.0f, std::min(features->frame_height() - 1.0f, feature.y())));
    // Label anything with large residual motion as an outlier.
    if (FeatureFlow(feature).Norm2() > max_err_sq) {
      feature.set_irls_weight(0.0f);
    }
  }
}

void MotionAnalysisCalculator::AddMetaMotion(
    const CameraMotion& meta_motion, const RegionFlowFeatureList& meta_features,
    RegionFlowFeatureList* features, CameraMotion* motion) {
  // Restore the original feature locations.
  ABSL_CHECK_EQ(meta_features.feature_size(), features->feature_size());
  for (int k = 0; k < meta_features.feature_size(); ++k) {
    auto feature = features->mutable_feature(k);
    const auto& meta_feature = meta_features.feature(k);
    feature->set_x(meta_feature.x());
    feature->set_y(meta_feature.y());
    feature->set_dx(meta_feature.dx());
    feature->set_dy(meta_feature.dy());
  }

  // Composite camera motion.
  *motion = ComposeCameraMotion(*motion, meta_motion);
  // Restore type from metadata, i.e. do not declare motions as invalid.
  motion->set_type(meta_motion.type());
  motion->set_match_frame(-1);
}

void MotionAnalysisCalculator::AppendCameraMotionsFromHomographies(
    const std::deque<Homography>& homographies, bool append_identity,
    std::deque<CameraMotion>* camera_motions,
    std::deque<RegionFlowFeatureList>* features) {
  ABSL_CHECK(camera_motions);
  ABSL_CHECK(features);

  CameraMotion identity;
  identity.set_frame_width(frame_width_);
  identity.set_frame_height(frame_height_);

  *identity.mutable_translation() = TranslationModel();
  *identity.mutable_linear_similarity() = LinearSimilarityModel();
  *identity.mutable_homography() = Homography();
  identity.set_type(CameraMotion::VALID);
  identity.set_match_frame(0);

  RegionFlowFeatureList empty_list;
  empty_list.set_long_tracks(true);
  empty_list.set_match_frame(-1);
  empty_list.set_frame_width(frame_width_);
  empty_list.set_frame_height(frame_height_);

  if (append_identity) {
    camera_motions->push_back(identity);
    features->push_back(empty_list);
  }

  const int models_per_frame = options_.meta_models_per_frame();
  ABSL_CHECK_GT(models_per_frame, 0)
      << "At least one model per frame is needed";
  ABSL_CHECK_EQ(0, homographies.size() % models_per_frame);
  const int num_frames = homographies.size() / models_per_frame;

  // Heuristic sigma, similar to what we use for rolling shutter removal.
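  // E.g., models_per_frame == 4 gives mixture_sigma == 0.25, i.e. a quarter
  // of the frame height once scaled below.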
  const float mixture_sigma = 1.0f / models_per_frame;

  if (row_weights_ == nullptr) {
    row_weights_.reset(new MixtureRowWeights(frame_height_,
                                             frame_height_ / 10,  // 10% margin
                                             mixture_sigma * frame_height_,
                                             1.0f, models_per_frame));
  }

  for (int f = 0; f < num_frames; ++f) {
    MixtureHomography mix_homog;
    const int model_start = f * models_per_frame;

    for (int k = 0; k < models_per_frame; ++k) {
      const Homography& homog = homographies[model_start + k];
      *mix_homog.add_model() = ModelInvert(homog);
    }

    CameraMotion c = identity;
    c.set_match_frame(-1);

    if (mix_homog.model_size() > 1) {
      *c.mutable_mixture_homography() = mix_homog;
      c.set_mixture_row_sigma(mixture_sigma);

      for (int k = 0; k < models_per_frame; ++k) {
        c.add_mixture_inlier_coverage(1.0f);
      }
      *c.add_mixture_homography_spectrum() = mix_homog;
      c.set_rolling_shutter_motion_index(0);

      *c.mutable_homography() = ProjectViaFit<Homography>(
          mix_homog, frame_width_, frame_height_, row_weights_.get());
    } else {
      // Guaranteed to exist because we check above that models_per_frame > 0.
      *c.mutable_homography() = mix_homog.model(0);
    }

    // Project remaining motions down.
    *c.mutable_linear_similarity() = ProjectViaFit<LinearSimilarityModel>(
        c.homography(), frame_width_, frame_height_);
    *c.mutable_translation() = ProjectViaFit<TranslationModel>(
        c.homography(), frame_width_, frame_height_);

    c.set_average_magnitude(
        std::hypot(c.translation().dx(), c.translation().dy()));

    camera_motions->push_back(c);
    features->push_back(empty_list);
  }
}

}  // namespace mediapipe