chromium/chromecast/media/cma/backend/video/av_sync_video.cc

// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chromecast/media/cma/backend/video/av_sync_video.h"

#include <cmath>

#include "base/functional/bind.h"
#include "base/logging.h"
#include "base/time/time.h"
#include "chromecast/base/statistics/weighted_moving_linear_regression.h"
#include "chromecast/media/audio/rate_adjuster.h"
#include "chromecast/media/cma/backend/audio_decoder_for_mixer.h"
#include "chromecast/media/cma/backend/media_pipeline_backend_for_mixer.h"
#include "chromecast/media/cma/backend/video_decoder_for_mixer.h"

namespace chromecast {
namespace media {

namespace {

// Window used both for the video PTS linear regression and for the audio rate
// adjuster's regression.
constexpr base::TimeDelta kLinearRegressionWindow = base::Seconds(20);

// Period of the repeating timer that drives UpkeepAvSync().
constexpr base::TimeDelta kAvSyncUpkeepInterval = base::Milliseconds(16);

// If the audio timestamp differs from the timestamp predicted from the video
// PTS by more than this, a hard correction is performed instead of a gradual
// rate adjustment.
constexpr base::TimeDelta kMaxAptsError = base::Milliseconds(50);

// Number of video PTS samples that must be collected before any A/V sync is
// attempted.
constexpr int kMinVideoPtsSamples = 60;

// Largest tolerated difference between the measured video PTS slope and the
// slope expected for the current playback rate. A measurement further off than
// this is treated as invalid: the video regression is flushed and sample
// collection starts over.
constexpr double kExpectedSlopeVariance = 0.005;

// Content whose measured frame rate is below this is not A/V synced. Such a
// low frame rate indicates audio-centric content carrying a dummy video
// stream.
constexpr int kAvSyncFpsThreshold = 10;

// Upper bound on the correction rate applied for the absolute sync offset.
constexpr double kMaxOffsetCorrection = 2.5e-4;

// Largest A/V sync offset that is left uncorrected. The audio playback rate is
// still matched to the video (so the offset does not grow) even when the
// offset is below this value.
// NOTE(review): presumably in microseconds, like the errors fed to the rate
// adjuster -- confirm against RateAdjuster::Config.
constexpr int64_t kMaxIgnoredOffset = 500;

}  // namespace

// Factory for the AvSync interface: builds the video-aware implementation
// defined in this file, bound to |backend|.
std::unique_ptr<AvSync> AvSync::Create(
    MediaPipelineBackendForMixer* const backend) {
  std::unique_ptr<AvSync> av_sync = std::make_unique<AvSyncVideo>(backend);
  return av_sync;
}

// Stores |backend| for later access to the decoders and the monotonic clock.
// |backend| must not be null (checked in debug builds).
AvSyncVideo::AvSyncVideo(MediaPipelineBackendForMixer* const backend)
    : backend_(backend) {
  DCHECK(backend);
}

// Defaulted out of line; members clean up through their own destructors.
AvSyncVideo::~AvSyncVideo() = default;

// Called when playback starts. Records the playback start timestamp, resets
// the media playback rate and the audio clock rate to 1.0, pushes the playback
// rate to the video decoder, and starts the periodic A/V sync upkeep.
//
// |timestamp| is the playback start time: the upkeep code ignores audio/video
// PTS samples whose timestamps are not later than it. |pts| is not used here.
// NOTE(review): assumes |timestamp| is on the same clock as
// backend_->MonotonicClockNow() (which NotifyResume() uses) -- confirm against
// the caller.
void AvSyncVideo::NotifyStart(int64_t timestamp, int64_t pts) {
  LOG(INFO) << __func__;
  // Honor the start time supplied by the caller instead of sampling the clock
  // here; otherwise samples stamped between "now" and the requested start time
  // would be accepted into the regressions.
  playback_start_timestamp_ = timestamp;
  current_media_playback_rate_ = 1.0;
  current_audio_clock_rate_ = 1.0;
  backend_->video_decoder()->SetPlaybackRate(current_media_playback_rate_);

  StartAvSync();
}

// Called when playback stops: logs the call and halts the A/V sync upkeep.
void AvSyncVideo::NotifyStop() {
  LOG(INFO) << __func__;
  StopAvSync();
}

// Called when playback is paused: halts the A/V sync upkeep and moves the
// playback start timestamp to INT64_MAX.
void AvSyncVideo::NotifyPause() {
  LOG(INFO) << __func__;
  StopAvSync();
  // With the start timestamp at INT64_MAX, no sample timestamp can compare
  // greater than it, so the "newer than playback start" checks in the upkeep
  // code reject everything until NotifyStart()/NotifyResume() sets it again.
  playback_start_timestamp_ = INT64_MAX;
}

// Called when playback resumes. The current monotonic clock reading becomes
// the new playback start timestamp (so only samples stamped after it are
// accepted), and the periodic A/V sync upkeep is restarted.
void AvSyncVideo::NotifyResume() {
  LOG(INFO) << __func__;
  const auto resume_time = backend_->MonotonicClockNow();
  playback_start_timestamp_ = resume_time;
  StartAvSync();
}

// Applies a new media playback |rate|: stores it, forwards it to the video
// decoder, and discards the audio and video sync state gathered so far.
// Both decoders are expected to exist (checked in debug builds).
void AvSyncVideo::NotifyPlaybackRateChange(float rate) {
  DCHECK(backend_->video_decoder());
  DCHECK(backend_->audio_decoder());

  current_media_playback_rate_ = rate;
  auto* const video_decoder = backend_->video_decoder();
  video_decoder->SetPlaybackRate(current_media_playback_rate_);

  // Start both sides from scratch for the new rate: a fresh audio rate
  // adjuster and an empty video PTS regression.
  FlushAudioPts();
  FlushVideoPts();

  LOG(INFO) << __func__
            << " current_media_playback_rate_=" << current_media_playback_rate_
            << " current_audio_clock_rate_=" << current_audio_clock_rate_;
}

// Resets the audio and video sync state and starts the timer that calls
// UpkeepAvSync() every kAvSyncUpkeepInterval.
void AvSyncVideo::StartAvSync() {
  FlushAudioPts();
  FlushVideoPts();

  upkeep_av_sync_timer_.Start(FROM_HERE, kAvSyncUpkeepInterval, this,
                              &AvSyncVideo::UpkeepAvSync);
}

// Stops the upkeep timer and destroys the audio rate adjuster. A new adjuster
// is created by the next FlushAudioPts() call.
void AvSyncVideo::StopAvSync() {
  upkeep_av_sync_timer_.Stop();
  audio_rate_adjuster_.reset();
}

// Periodic A/V sync step, invoked by |upkeep_av_sync_timer_|.
//
// Each call:
//  1. Bails out unless both decoders exist and VptsUpkeep() reports that the
//     video PTS data is usable.
//  2. Reads a new timestamped audio PTS; stale samples (not newer than the
//     playback start, or with the same PTS value as the previous one) are
//     ignored.
//  3. Computes the timestamp at which that audio PTS should play according to
//     the video PTS data (the regression, or the latest video sample if the
//     regression slope is implausible).
//  4. Performs a hard correction if the audio is too far off or its timestamp
//     moved backward; otherwise feeds the error to the audio rate adjuster.
void AvSyncVideo::UpkeepAvSync() {
  if (!backend_->video_decoder() || !backend_->audio_decoder()) {
    return;
  }

  if (!VptsUpkeep()) {
    return;
  }

  int64_t new_raw_apts = 0;
  int64_t new_apts_timestamp = 0;
  if (!backend_->audio_decoder()->GetTimestampedPts(&new_apts_timestamp,
                                                    &new_raw_apts) ||
      new_apts_timestamp <= playback_start_timestamp_ ||
      new_raw_apts == last_apts_value_) {
    return;
  }
  last_apts_value_ = new_raw_apts;

  DCHECK(video_pts_);

  int64_t desired_apts_timestamp = 0;
  double vpts_slope = 0.0;
  double error = 0.0;
  if (video_pts_->EstimateSlope(&vpts_slope, &error) &&
      std::abs(vpts_slope - 1.0 / current_media_playback_rate_) >
          kExpectedSlopeVariance) {
    // VPTS slope is bad. This can be because the video is actually playing out
    // at the wrong rate (eg when video playback can't keep up and is too slow),
    // or could be due to bad VPTS data (eg after resume, from old timestamps
    // before pause). We assume the most recent VPTS sample is OK (so far this
    // has always been true) and check if we need to do a hard correction to
    // account for cases where the video is actually playing at the wrong rate
    // before flushing the VPTS regression.
    LOG(ERROR) << "Calculated bad vpts_slope " << vpts_slope
               << " corresponding to playback rate =~ " << (1.0 / vpts_slope)
               << ". Expected playback rate = " << current_media_playback_rate_;

    // Regression samples store the video PTS in |x| and its timestamp in |y|
    // (see the AddSample() call in VptsUpkeep()). Extrapolate from the latest
    // sample using the expected playback rate.
    int64_t last_vpts = video_pts_->samples().back().x;
    int64_t last_vpts_timestamp = video_pts_->samples().back().y;
    desired_apts_timestamp =
        last_vpts_timestamp +
        (new_raw_apts - last_vpts) / current_media_playback_rate_;
    // After this, |video_pts_| is null until the next VptsUpkeep() call.
    FlushVideoPts();
  } else if (!video_pts_->EstimateY(new_raw_apts, &desired_apts_timestamp,
                                    &error)) {
    LOG(INFO) << "Failed to estimate desired APTS timestamp";
    return;
  }

  // If error is positive, the audio is playing later than it should be.
  int64_t apts_timestamp_error = new_apts_timestamp - desired_apts_timestamp;
  if (std::abs(apts_timestamp_error) > kMaxAptsError.InMicroseconds() ||
      new_apts_timestamp < last_apts_timestamp_) {
    if (new_apts_timestamp < last_apts_timestamp_) {
      LOG(INFO) << "Audio timestamp moved backward";
    }
    LOG(INFO) << "Hard correction; APTS = " << new_raw_apts
              << ", ts = " << new_apts_timestamp
              << ", desired = " << desired_apts_timestamp
              << ", error = " << apts_timestamp_error;
    // HardCorrection() flushes the audio state, which resets
    // |last_apts_timestamp_|, so it is intentionally not updated on this path.
    HardCorrection(new_raw_apts, desired_apts_timestamp);
    return;
  }
  // Remember the timestamp of this accepted sample so that the "moved
  // backward" check above has something to compare the next sample against.
  last_apts_timestamp_ = new_apts_timestamp;

  if (video_pts_) {
    // Only do audio rate upkeep if the VPTS data was OK (ie, no bad slope).
    DCHECK(audio_rate_adjuster_);
    audio_rate_adjuster_->AddError(apts_timestamp_error, new_apts_timestamp);
  }
}

// Collects the latest video PTS sample and reports whether there is enough
// video data to run A/V sync.
//
// Returns true only when at least kMinVideoPtsSamples samples have been
// gathered and the measured frame rate is at least kAvSyncFpsThreshold.
bool AvSyncVideo::VptsUpkeep() {
  // The regression is created lazily; FlushVideoPts() destroys it.
  if (!video_pts_) {
    video_pts_ = std::make_unique<WeightedMovingLinearRegression>(
        kLinearRegressionWindow.InMicroseconds());
  }

  int64_t vpts = 0;
  int64_t vpts_timestamp = 0;
  const bool have_pts =
      backend_->video_decoder()->GetCurrentPts(&vpts_timestamp, &vpts);
  // Accept only samples stamped after playback (re)started and whose PTS value
  // differs from the previously seen one.
  const bool is_new_sample = have_pts &&
                             vpts_timestamp > playback_start_timestamp_ &&
                             vpts != last_vpts_value_;
  if (is_new_sample) {
    // x = video PTS, y = timestamp at which it was observed, weight 1.
    video_pts_->AddSample(vpts, vpts_timestamp, 1.0);
    last_vpts_value_ = vpts;
  }

  const bool has_enough_samples =
      !(video_pts_->num_samples() < kMinVideoPtsSamples);
  return has_enough_samples && GetVideoFrameRate() >= kAvSyncFpsThreshold;
}

// Estimates the video frame rate, rounded to the nearest integer, from the
// samples currently held by |video_pts_|: the number of intervals between
// samples divided by the PTS span between the first and last sample.
// NOTE(review): the 1000000 factor presumes video PTS values are in
// microseconds, making the result frames per second -- confirm.
//
// Requires |video_pts_| to hold at least two samples. Returns 0 when the PTS
// span is not positive, since no meaningful rate can be derived in that case.
int AvSyncVideo::GetVideoFrameRate() {
  DCHECK(video_pts_);
  DCHECK_GE(video_pts_->num_samples(), 2u);
  const int64_t duration =
      video_pts_->samples().back().x - video_pts_->samples().front().x;
  if (duration <= 0) {
    // Avoid dividing by zero (converting the resulting infinity/NaN to int is
    // undefined) and avoid reporting a negative rate.
    return 0;
  }
  // N samples delimit N - 1 frame intervals.
  const double intervals =
      static_cast<double>(video_pts_->num_samples() - 1);
  return static_cast<int>(std::round(intervals * 1000000 / duration));
}

// Performs a hard A/V sync correction: passes |apts| and
// |desired_apts_timestamp| to the audio decoder's RestartPlaybackAt() and then
// resets the audio sync state, since data gathered before the restart no
// longer applies.
void AvSyncVideo::HardCorrection(int64_t apts, int64_t desired_apts_timestamp) {
  backend_->audio_decoder()->RestartPlaybackAt(apts, desired_apts_timestamp);
  FlushAudioPts();
}

// Callback bound into the audio rate adjuster (see FlushAudioPts()). Requests
// |desired_clock_rate| from the audio decoder, remembers the rate the decoder
// reports back, and logs the change together with the current VPTS slope when
// one can be estimated.
//
// |error_slope| and |current_error| are only used for logging.
// Returns the rate reported by the audio decoder.
double AvSyncVideo::ChangeAudioRate(double desired_clock_rate,
                                    double error_slope,
                                    double current_error) {
  const double applied_rate =
      backend_->audio_decoder()->SetAvSyncPlaybackRate(desired_clock_rate);
  current_audio_clock_rate_ = applied_rate;
  LOG(INFO) << "Update audio clock rate to " << applied_rate << "; wanted "
            << desired_clock_rate << ", error slope = " << error_slope
            << ", smoothed error = " << current_error;

  double slope;
  double slope_error;
  const bool have_slope = video_pts_->EstimateSlope(&slope, &slope_error);
  if (have_slope) {
    LOG(INFO) << "VPTS slope = " << slope << "; playback rate =~ "
              << (1.0 / slope);
  }
  return applied_rate;
}

void AvSyncVideo::FlushAudioPts() {
  last_apts_timestamp_ = INT64_MIN;
  // Don't reset last_apts_value_, since we still want to ignore that value for
  // the new linear regression since it may be invalid.

  RateAdjuster::Config config;
  config.linear_regression_window = kLinearRegressionWindow;
  config.max_ignored_current_error = kMaxIgnoredOffset;
  config.max_current_error_correction = kMaxOffsetCorrection;
  // Only change the clock rate if the desired rate is > 30 ppm different.
  // Reasoning: 30 ppm means that leaving the clock rate unchanged will add at
  // most 30 microseconds of additional error before the next clock rate check.
  config.min_rate_change = 3.0e-5;
  audio_rate_adjuster_ = std::make_unique<RateAdjuster>(
      config,
      base::BindRepeating(&AvSyncVideo::ChangeAudioRate,
                          base::Unretained(this)),
      current_audio_clock_rate_);
}

// Discards the video PTS linear regression. VptsUpkeep() creates a new, empty
// one on its next call.
void AvSyncVideo::FlushVideoPts() {
  video_pts_.reset();
  // Don't reset last_vpts_value_, since we still want to ignore that value for
  // the new linear regression since it may be invalid.
}

}  // namespace media
}  // namespace chromecast