chromium/media/gpu/v4l2/mt21/mt21_decompressor.cc

// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <sched.h>
#include <stdlib.h>

#include "base/bits.h"
#include "media/gpu/v4l2/mt21/mt21_decompressor.h"
#include "media/gpu/v4l2/mt21/mt21_util.h"
#include "third_party/libyuv/include/libyuv/planar_functions.h"

namespace media {

namespace {

// Decompresses one slice of an MT21 plane into |dest|.
//
// T selects the subblock flavour (MT21YSubblock or MT21UVSubblock at the call
// sites in this file). The slice covers |width| * |height| bytes beginning at
// |start_offset|; it is walked in kMT21BlockSize steps and each block is
// handed to BinSubblocks<T>, which fills two bins:
//   bin 0: subblocks that are decompressed below, and
//   bin 1: high-entropy passthrough subblocks that are copied verbatim.
// |symbol_cache| is only consulted by the scalar decompression path.
template <class T>
void MT21ToMM21(const uint8_t* src,
                const uint8_t* footer,
                uint8_t* dest,
                size_t start_offset,
                size_t width,
                size_t height,
                const GolombRiceTableEntry* symbol_cache) {
  uint8_t scratch[kMT21ScratchMemorySize] __attribute__((aligned(16)));
  std::vector<T> subblock_bins[2];
  std::vector<T>& compressed = subblock_bins[0];
  std::vector<T>& passthrough = subblock_bins[1];

  // Sort every block of the slice into the two bins.
  const size_t slice_bytes = width * height;
  size_t block_offset = 0;
  while (block_offset < slice_bytes) {
    BinSubblocks<T>(src, footer, dest, start_offset + block_offset,
                    subblock_bins);
    block_offset += kMT21BlockSize;
  }

  // Passthrough subblocks need no decoding: copy them straight across.
  for (T& raw : passthrough) {
    memcpy(raw.dest, raw.src, raw.len);
  }

  // Decode kNumOutputLanes subblocks at a time while a full group remains.
  size_t next = 0;
  while (next + kNumOutputLanes - 1 < compressed.size()) {
    VectorDecompressSubblockHelper<T>(compressed, next, scratch);
    next += kNumOutputLanes;
  }

  // Whatever did not fill a complete group is decoded one subblock at a time.
  while (next < compressed.size()) {
    DecompressSubblockHelper<T>(compressed[next], symbol_cache);
    ++next;
  }
}

// Runs one job end to end: decompresses the job's slice into |pivot| and then
// detiles that slice from |pivot| into the job's destination buffer.
//
// |job.offset| is applied to both the pivot and the destination, so the slice
// lands at the same position in each buffer. |job.width| is used as the source
// stride, destination stride and row width of the detile step. Chroma slices
// use half the luma tile height.
void DecompressAndDetile(const MT21DecompressionJob& job,
                         uint8_t* pivot,
                         const GolombRiceTableEntry* symbol_cache) {
  // Stage 1: MT21 -> tiled data in the pivot buffer.
  if (!job.is_chroma) {
    MT21ToMM21<MT21YSubblock>(job.src, job.footer, pivot, job.offset, job.width,
                              job.height, symbol_cache);
  } else {
    MT21ToMM21<MT21UVSubblock>(job.src, job.footer, pivot, job.offset,
                               job.width, job.height, symbol_cache);
  }

  // Stage 2: tiled pivot data -> linear rows in the destination.
  const auto tile_height =
      job.is_chroma ? kMT21TileHeight / 2 : kMT21TileHeight;
  const uint8_t* tiled_slice = pivot + job.offset;
  uint8_t* linear_slice = job.dest + job.offset;
  libyuv::DetilePlane(tiled_slice, job.width, linear_slice, job.width,
                      job.width, job.height, tile_height);
}

// MT8173 has 2 Cortex A72s and 2 Cortex A53s
//
// These are the numbers of worker threads started for the little-core and
// big-core groups respectively. kNumLittleThreads also doubles as the index
// of the first big-core CPU when the MT21Decompressor constructor builds the
// affinity masks (little cores are CPUs 0 and 1, big cores are CPUs 2 and 3).
constexpr size_t kNumLittleThreads = 2;
constexpr size_t kNumBigThreads = 2;

// Body of a decompression worker thread.
//
// The thread first restricts itself to the CPUs in |mask|, then repeatedly
// blocks on |job->wakeup_event|. Each wakeup either processes |job| (and
// signals |job->done_event| afterwards) or, when |should_shutdown| has been
// set, ends the thread without touching the job.
//
// Note: the result of sched_setaffinity() is not inspected, so a failure to
// apply the affinity mask goes unreported.
void MT21WorkerEntry(cpu_set_t mask,
                     std::atomic_bool& should_shutdown,
                     const GolombRiceTableEntry* symbol_cache,
                     uint8_t* pivot,
                     scoped_refptr<MT21DecompressionJob> job) {
  // A pid of 0 applies the mask to the calling thread.
  sched_setaffinity(0, sizeof(mask), &mask);

  for (;;) {
    job->wakeup_event.Wait();

    // The shutdown flag is checked only after a wakeup, so the owner must
    // set it before signalling the event.
    if (should_shutdown) {
      return;
    }

    DecompressAndDetile(*job, pivot, symbol_cache);
    job->done_event.Signal();
  }
}

}  // namespace

// Describes one slice of a plane for a worker thread to decompress and detile.
//
// |src| and |footer| are forwarded to the MT21 decompression routine, |dest|
// receives the detiled output, and |offset| is added to both the intermediate
// and destination pointers when the job runs. |width| and |height| give the
// extent of the slice (|width| is also used as the row stride when detiling).
// |is_chroma| selects the UV subblock type and half-height tiles.
//
// Both events are auto-reset and start unsignaled: |wakeup_event| is signaled
// by the owner to start (or shut down) the worker, and |done_event| is
// signaled by the worker once the job has been processed.
MT21DecompressionJob::MT21DecompressionJob(const uint8_t* src,
                                           const uint8_t* footer,
                                           size_t offset,
                                           uint8_t* dest,
                                           size_t width,
                                           size_t height,
                                           bool is_chroma)
    : wakeup_event(base::WaitableEvent::ResetPolicy::AUTOMATIC,
                   base::WaitableEvent::InitialState::NOT_SIGNALED),
      done_event(base::WaitableEvent::ResetPolicy::AUTOMATIC,
                 base::WaitableEvent::InitialState::NOT_SIGNALED) {
  // Parameters shadow the members of the same name, hence the explicit
  // this-> on every assignment.
  this->src = src;
  this->footer = footer;
  this->offset = offset;
  this->dest = dest;
  this->width = width;
  this->height = height;
  this->is_chroma = is_chroma;
}

// Sets up everything needed to convert frames of |resolution|:
//   * the Golomb-Rice symbol cache shared by all workers,
//   * the tile-aligned frame size,
//   * one pivot (intermediate) buffer per core group, each large enough for a
//     full aligned plane, and
//   * kNumBigThreads + kNumLittleThreads worker threads, each bound to a job
//     whose slice geometry is fixed here and whose buffers are filled in per
//     frame by MT21ToNV12().
// The workers start immediately but stay blocked on their job's wakeup event.
MT21Decompressor::MT21Decompressor(gfx::Size resolution) {
  symbol_cache_ = new GolombRiceTableEntry[kGolombRiceCacheSize];
  PopulateGolombRiceCache(symbol_cache_);

  // Round the frame up to a whole number of MT21 tiles in each dimension.
  aligned_resolution_ = gfx::Size(
      base::bits::AlignUp(static_cast<size_t>(resolution.width()),
                          kMT21TileWidth),
      base::bits::AlignUp(static_cast<size_t>(resolution.height()),
                          kMT21TileHeight));

  // Builds an affinity mask covering CPUs [first_cpu, first_cpu + count).
  const auto make_cpu_mask = [](size_t first_cpu, size_t count) {
    cpu_set_t cpus;
    CPU_ZERO(&cpus);
    for (size_t cpu = first_cpu; cpu < first_cpu + count; ++cpu) {
      CPU_SET(cpu, &cpus);
    }
    return cpus;
  };

  // Creates |count| placeholder jobs, records them in |jobs| and starts one
  // worker thread per job in |threads|. All of them share |cpus| and |pivot|.
  const auto start_workers = [this](const cpu_set_t& cpus, size_t count,
                                    bool is_chroma, uint8_t* pivot, auto& jobs,
                                    auto& threads) {
    for (size_t n = 0; n < count; ++n) {
      auto job = base::MakeRefCounted<MT21DecompressionJob>(
          nullptr, nullptr, 0, nullptr, 0, 0, is_chroma);
      jobs.push_back(job);
      threads.emplace_back(MT21WorkerEntry, cpus, std::ref(should_shutdown_),
                           symbol_cache_, pivot, job);
    }
  };

  // Big cores are CPUs 2 and 3, while the little cores are 0 and 1.
  const cpu_set_t big_cpus = make_cpu_mask(kNumLittleThreads, kNumBigThreads);
  big_core_pivot_ =
      static_cast<uint8_t*>(aligned_alloc(16, aligned_resolution_.GetArea()));
  start_workers(big_cpus, kNumBigThreads, /*is_chroma=*/false, big_core_pivot_,
                big_core_jobs_, big_core_threads_);

  const cpu_set_t little_cpus = make_cpu_mask(0, kNumLittleThreads);
  little_core_pivot_ =
      static_cast<uint8_t*>(aligned_alloc(16, aligned_resolution_.GetArea()));
  start_workers(little_cpus, kNumLittleThreads, /*is_chroma=*/true,
                little_core_pivot_, little_core_jobs_, little_core_threads_);

  // Experimental evidence shows that A53s decompress MT21 blocks at about half
  // the speed of A72s. This conveniently means that if we split the chroma
  // plane between the A53s and the luma plane between the A72s, we should
  // perfectly balance the load.
  const auto plane_width = aligned_resolution_.width();
  const auto plane_height = aligned_resolution_.height();

  // Chroma: the plane is half the luma height. The first slice takes half of
  // that, rounded up to whole chroma tiles; the second takes the remainder.
  const size_t uv_first_rows = base::bits::AlignUp(
      static_cast<size_t>(plane_height / 2 / 2), kMT21TileHeight / 2);
  MT21DecompressionJob& uv_first = *little_core_jobs_[0];
  uv_first.offset = 0;
  uv_first.width = plane_width;
  uv_first.height = uv_first_rows;
  MT21DecompressionJob& uv_second = *little_core_jobs_[1];
  uv_second.offset = uv_first_rows * plane_width;
  uv_second.width = plane_width;
  uv_second.height = plane_height / 2 - uv_first_rows;

  // Luma: same scheme, using the full plane height and full-height tiles.
  const size_t y_first_rows = base::bits::AlignUp(
      static_cast<size_t>(plane_height / 2), kMT21TileHeight);
  MT21DecompressionJob& y_first = *big_core_jobs_[0];
  y_first.offset = 0;
  y_first.width = plane_width;
  y_first.height = y_first_rows;
  MT21DecompressionJob& y_second = *big_core_jobs_[1];
  y_second.offset = y_first_rows * plane_width;
  y_second.width = plane_width;
  y_second.height = plane_height - y_first_rows;
}

// Stops and joins every worker thread, then releases the buffers owned by the
// decompressor.
//
// The shutdown flag is raised before any wakeup event is signaled because the
// workers only look at the flag after being woken.
MT21Decompressor::~MT21Decompressor() {
  should_shutdown_ = true;
  for (auto& job : little_core_jobs_) {
    job->wakeup_event.Signal();
  }
  for (auto& job : big_core_jobs_) {
    job->wakeup_event.Signal();
  }
  for (auto& thread : little_core_threads_) {
    thread.join();
  }
  for (auto& thread : big_core_threads_) {
    thread.join();
  }

  // The pivot buffers were obtained from aligned_alloc() in the constructor,
  // so they must be returned with free(); releasing them with delete is
  // undefined behaviour.
  free(little_core_pivot_);
  free(big_core_pivot_);

  // The symbol cache was allocated with new[], so it needs the array form of
  // delete.
  delete[] symbol_cache_;
}

// Converts one MT21 frame to NV12, blocking until all workers have finished.
//
// |src_y| / |src_uv| are the MT21 luma and chroma buffers of sizes
// |y_buf_size| / |uv_buf_size|; the position of each buffer's footer is
// derived from the aligned plane area, the buffer size and the per-plane
// footer alignment. |dest_y| / |dest_uv| receive the output planes.
//
// The chroma plane is handed to the little-core jobs and the luma plane to the
// big-core jobs; slice geometry was fixed in the constructor, so only the
// buffer pointers are updated here.
void MT21Decompressor::MT21ToNV12(const uint8_t* src_y,
                                  const uint8_t* src_uv,
                                  const size_t y_buf_size,
                                  const size_t uv_buf_size,
                                  uint8_t* dest_y,
                                  uint8_t* dest_uv) {
  const uint8_t* y_footer =
      src_y + ComputeFooterOffset(aligned_resolution_.GetArea(), y_buf_size,
                                  kMT21YFooterAlignment);
  const uint8_t* uv_footer =
      src_uv + ComputeFooterOffset(aligned_resolution_.GetArea() / 2,
                                   uv_buf_size, kMT21UVFooterAlignment);

  // Points every job in |jobs| at this frame's buffers and wakes its worker.
  const auto dispatch = [](auto& jobs, const uint8_t* src,
                           const uint8_t* footer, uint8_t* dest) {
    for (auto& job : jobs) {
      job->src = src;
      job->footer = footer;
      job->dest = dest;
      job->wakeup_event.Signal();
    }
  };

  // Blocks until every job in |jobs| has reported completion.
  const auto wait_for_all = [](auto& jobs) {
    for (auto& job : jobs) {
      job->done_event.Wait();
    }
  };

  // Little cores take the chroma plane, big cores take the luma plane.
  dispatch(little_core_jobs_, src_uv, uv_footer, dest_uv);
  dispatch(big_core_jobs_, src_y, y_footer, dest_y);

  wait_for_all(little_core_jobs_);
  wait_for_all(big_core_jobs_);
}

}  // namespace media