chromium/media/gpu/v4l2/mt21/mt21_decompressor.h

// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Decompression utility for the MT21C pixel format.
//
// Note that this file and its corresponding .cc file have some very SoC
// specific code. While we would ideally like to avoid tying code so closely
// with a specific chip, this code is in the critical path for video decoding,
// and we know that we will only ever need to run this code on the MT8173. Every
// other SoC in the MT81XX line supports a pixel format called MM21, which we
// have generic support for in libyuv.
//
// We may some day decide to try using MT21C on other chips in the MT81XX line,
// but we will need to change significant sections of this code to make that
// viable. Our assumptions about the relative speed of the big and little cores,
// the number of cores, the CPU IDs of the cores, the timings of the SIMD
// instructions, the availability of ARM64, etc, will all be incorrect.

#ifndef MEDIA_GPU_V4L2_MT21_MT21_DECOMPRESSOR_H_
#define MEDIA_GPU_V4L2_MT21_MT21_DECOMPRESSOR_H_

#include "build/build_config.h"

#if !defined(ARCH_CPU_ARM_FAMILY)
#error "MT21Decompressor is only intended to run on MT8173 (ARM)"
#endif

#if !(defined(COMPILER_GCC) || defined(__clang__))
#error "MT21Decompressor is only intended to be built with GCC or Clang"
#endif

#include <stdint.h>

#include <atomic>
#include <memory>
#include <thread>
#include <vector>

#include "base/memory/raw_ptr.h"
#include "base/memory/ref_counted.h"
#include "base/memory/scoped_refptr.h"
#include "base/synchronization/lock.h"
#include "base/synchronization/waitable_event.h"
#include "ui/gfx/geometry/size.h"

namespace media {

struct GolombRiceTableEntry;

// Ref-counted description of a single decompression job. The fields mirror the
// constructor arguments one-to-one, plus two events used for signalling.
//
// NOTE(review): the field meanings below are inferred from their names; the
// authoritative semantics live in the corresponding .cc file.
struct MT21DecompressionJob : public base::RefCounted<MT21DecompressionJob> {
  MT21DecompressionJob(const uint8_t* src,
                       const uint8_t* footer,
                       size_t offset,
                       uint8_t* dest,
                       size_t width,
                       size_t height,
                       bool is_chroma);
  // Presumably the start of the compressed plane data -- confirm in the .cc.
  const uint8_t* src;
  // Presumably the per-plane metadata footer needed for decompression.
  const uint8_t* footer;
  // Presumably this job's starting offset within the plane; units are not
  // visible from this header -- TODO confirm.
  size_t offset;
  // Output buffer. Explicitly opted out of raw_ptr<> via RAW_PTR_EXCLUSION.
  RAW_PTR_EXCLUSION uint8_t* dest;
  // Presumably the dimensions of the region this job covers -- TODO confirm.
  size_t width;
  size_t height;
  // Presumably true when the job targets the UV plane rather than the Y plane.
  bool is_chroma;
  // NOTE(review): looks like |wakeup_event| is signalled to start the job and
  // |done_event| is signalled on completion -- confirm against the .cc.
  base::WaitableEvent wakeup_event;
  base::WaitableEvent done_event;

 private:
  // The destructor is private and base::RefCounted is a friend, so instances
  // can only be destroyed through the reference-counting machinery.
  friend class base::RefCounted<MT21DecompressionJob>;
  ~MT21DecompressionJob() = default;
};

// We considered making this an ImageProcessorBackend, but it turns out we need
// access to the raw V4L2 buffer. MT21C planes have a "secret" footer containing
// metadata necessary for decompression placed at the beginning of the last
// page in the buffer. This extra data is totally unknown to Chrome abstractions
// like VideoFrame, which just assume a plane's size is determined by stride and
// height.
// Converts MT21C-compressed frames to NV12 using dedicated worker threads.
//
// Instances hold OS threads and an atomic flag, so they are neither copyable
// nor assignable.
class MT21Decompressor {
 public:
  // |resolution| is the frame size this decompressor will be used for.
  // NOTE(review): presumably the visible/coded size before alignment, given
  // that an |aligned_resolution_| member is kept -- confirm in the .cc.
  //
  // Marked explicit so that a gfx::Size never silently converts into a
  // decompressor.
  explicit MT21Decompressor(gfx::Size resolution);

  MT21Decompressor(const MT21Decompressor&) = delete;
  MT21Decompressor& operator=(const MT21Decompressor&) = delete;

  ~MT21Decompressor();

  // Decompresses one frame. |src_y| / |src_uv| are the compressed source
  // planes, |y_buf_size| / |uv_buf_size| are the sizes of the buffers holding
  // them, and |dest_y| / |dest_uv| receive the output planes.
  // NOTE(review): buffer sizes are presumably in bytes and include the
  // trailing footer page -- confirm against the .cc and the caller.
  void MT21ToNV12(const uint8_t* src_y,
                  const uint8_t* src_uv,
                  const size_t y_buf_size,
                  const size_t uv_buf_size,
                  uint8_t* dest_y,
                  uint8_t* dest_uv);

 private:
  // We divide the frame horizontally 4 times and distribute the job among
  // the 4 CPU cores in the MT8173. Two of these cores are little cores, so we
  // want to divide the task unevenly and make sure the smaller 2 tasks end up
  // scheduled on the smaller cores. In order to accomplish this, we circumvent
  // Chrome's threading system entirely and use raw operating system threads, so
  // we can use sched_setaffinity().
  //
  // One alternative that was considered was breaking the decompression up into
  // a bunch of little atomic tasks and using a threadpool, and just letting
  // the OS scheduler figure out the division of labor. This approach has the
  // significant drawback however of not only introducing more overhead, but
  // more importantly, having potentially very poor memory locality.
  //
  // Note that we also keep threads alive and waiting between runs of the
  // decompression routine. Experimental evidence has indicated that the
  // overhead of start and join syscalls substantially lengthens decompression
  // times, so we just use userspace semaphores for synchronization instead.

  // NOTE(review): presumably set on destruction to ask the worker threads to
  // exit -- confirm in the .cc.
  std::atomic_bool should_shutdown_ = false;

  // Worker threads and their job descriptors for the big cores.
  // NOTE(review): the role of the pivot pointer is not visible from this
  // header; see the .cc.
  std::vector<std::thread> big_core_threads_;
  std::vector<scoped_refptr<MT21DecompressionJob>> big_core_jobs_;
  raw_ptr<uint8_t> big_core_pivot_;

  // Same as above, for the little cores.
  std::vector<std::thread> little_core_threads_;
  std::vector<scoped_refptr<MT21DecompressionJob>> little_core_jobs_;
  raw_ptr<uint8_t> little_core_pivot_;

  // NOTE(review): presumably the constructor's |resolution| rounded up to the
  // alignment the format requires -- confirm in the .cc.
  gfx::Size aligned_resolution_;

  // Lookup table of GolombRiceTableEntry.
  // NOTE(review): allocation and lifetime are handled in the .cc; confirm who
  // frees it.
  raw_ptr<GolombRiceTableEntry> symbol_cache_;
};

}  // namespace media

#endif  // MEDIA_GPU_V4L2_MT21_MT21_DECOMPRESSOR_H_