chromium/third_party/libaom/source/libaom/av1/encoder/ethread.c

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <stdbool.h>

#include "aom_util/aom_pthread.h"

#include "av1/common/warped_motion.h"
#include "av1/common/thread_common.h"

#include "av1/encoder/allintra_vis.h"
#include "av1/encoder/bitstream.h"
#include "av1/encoder/enc_enums.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodeframe_utils.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/encoder_alloc.h"
#include "av1/encoder/ethread.h"
#if !CONFIG_REALTIME_ONLY
#include "av1/encoder/firstpass.h"
#endif
#include "av1/encoder/global_motion.h"
#include "av1/encoder/global_motion_facade.h"
#include "av1/encoder/intra_mode_search_utils.h"
#include "av1/encoder/picklpf.h"
#include "av1/encoder/rdopt.h"
#include "aom_dsp/aom_dsp_common.h"
#include "av1/encoder/temporal_filter.h"
#include "av1/encoder/tpl_model.h"

static inline void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {}

static inline void update_delta_lf_for_row_mt(AV1_COMP *cpi) {}

void av1_row_mt_sync_read_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
                                int c) {}

void av1_row_mt_sync_write_dummy(AV1EncRowMultiThreadSync *row_mt_sync, int r,
                                 int c, int cols) {}

void av1_row_mt_sync_read(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c) {}

void av1_row_mt_sync_write(AV1EncRowMultiThreadSync *row_mt_sync, int r, int c,
                           int cols) {}
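// Note: the two barriers above follow the same scheme as
// av1_tpl_row_mt_sync_read() and av1_tpl_row_mt_sync_write() later in this
// file: a read blocks until the previous row's num_finished_cols counter is
// far enough ahead, and a write publishes progress and signals any waiters.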

// Allocate memory for row synchronization
static void row_mt_sync_mem_alloc(AV1EncRowMultiThreadSync *row_mt_sync,
                                  AV1_COMMON *cm, int rows) {}

// Deallocate row based multi-threading synchronization related mutex and data
void av1_row_mt_sync_mem_dealloc(AV1EncRowMultiThreadSync *row_mt_sync) {}

static inline int get_sb_rows_in_frame(AV1_COMMON *cm) {}

static void row_mt_mem_alloc(AV1_COMP *cpi, int max_rows, int max_cols,
                             int alloc_row_ctx) {}

void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {}

static inline void assign_tile_to_thread(int *thread_id_to_tile_id,
                                         int num_tiles, int num_workers) {}
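// Tiles are handed out round-robin, in the same way assign_thread_to_dir()
// below distributes directions across workers. A minimal sketch of that
// assignment (illustrative only; the real body may differ in detail):
static inline void assign_tile_to_thread_sketch(int *thread_id_to_tile_id,
                                                int num_tiles,
                                                int num_workers) {
  int tile_id = 0;
  for (int i = 0; i < num_workers; i++) {
    thread_id_to_tile_id[i] = tile_id++;
    if (tile_id == num_tiles) tile_id = 0;
  }
}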

static inline int get_next_job(TileDataEnc *const tile_data,
                               int *current_mi_row, int mib_size) {}
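// A minimal sketch of the row-dispatch logic get_next_job() implements
// (illustrative; the field usage follows the initialization in
// av1_fp_encode_tiles_row_mt() and the num_threads_working decrement in
// fp_enc_row_mt_worker_hook()). Callers hold the enc_row_mt mutex, so the
// counters can be updated without further locking.
static inline int get_next_job_sketch(TileDataEnc *const tile_data,
                                      int *current_mi_row, int mib_size) {
  AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
  const int mi_row_end = tile_data->tile_info.mi_row_end;

  if (row_mt_sync->next_mi_row < mi_row_end) {
    // Claim the next row of the tile and advance by one unit row.
    *current_mi_row = row_mt_sync->next_mi_row;
    row_mt_sync->num_threads_working++;
    row_mt_sync->next_mi_row += mib_size;
    return 1;
  }
  return 0;
}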

static inline void switch_tile_and_get_next_job(
    AV1_COMMON *const cm, TileDataEnc *const tile_data, int *cur_tile_id,
    int *current_mi_row, int *end_of_frame, int is_firstpass,
    const BLOCK_SIZE fp_block_size) {}

#if !CONFIG_REALTIME_ONLY
static void set_firstpass_encode_done(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
  const int tile_cols = cm->tiles.cols;
  const int tile_rows = cm->tiles.rows;
  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
  const int unit_height = mi_size_high[fp_block_size];

  // In case of multithreading of firstpass encode, due to top-right
  // dependency, the worker on a firstpass row waits for the completion of the
  // firstpass processing of the top and top-right fp_blocks. Hence, in case a
  // thread (main/worker) encounters an error, update the firstpass processing
  // of every row in the frame to indicate that it is complete in order to avoid
  // dependent workers waiting indefinitely.
  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
      TileDataEnc *const tile_data =
          &cpi->tile_data[tile_row * tile_cols + tile_col];
      TileInfo *tile = &tile_data->tile_info;
      AV1EncRowMultiThreadSync *const row_mt_sync = &tile_data->row_mt_sync;
      const int unit_cols_in_tile =
          av1_get_unit_cols_in_tile(tile, fp_block_size);
      for (int mi_row = tile->mi_row_start, unit_row_in_tile = 0;
           mi_row < tile->mi_row_end;
           mi_row += unit_height, unit_row_in_tile++) {
        enc_row_mt->sync_write_ptr(row_mt_sync, unit_row_in_tile,
                                   unit_cols_in_tile - 1, unit_cols_in_tile);
      }
    }
  }
}

static int fp_enc_row_mt_worker_hook(void *arg1, void *unused) {
  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
  AV1_COMP *const cpi = thread_data->cpi;
  int thread_id = thread_data->thread_id;
  AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt;
#if CONFIG_MULTITHREAD
  pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_;
#endif
  (void)unused;
  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
  xd->error_info = error_info;

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
    enc_row_mt->firstpass_mt_exit = true;
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
    set_firstpass_encode_done(cpi);
    return 0;
  }
  error_info->setjmp = 1;

  AV1_COMMON *const cm = &cpi->common;
  int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id];
  assert(cur_tile_id != -1);

  const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
  const int unit_height = mi_size_high[fp_block_size];
  int end_of_frame = 0;
  while (1) {
    int current_mi_row = -1;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
#endif
    bool firstpass_mt_exit = enc_row_mt->firstpass_mt_exit;
    if (!firstpass_mt_exit && !get_next_job(&cpi->tile_data[cur_tile_id],
                                            &current_mi_row, unit_height)) {
      // No jobs are available for the current tile. Query for the status of
      // other tiles and get the next job if available
      switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
                                   &current_mi_row, &end_of_frame, 1,
                                   fp_block_size);
    }
#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
    // When firstpass_mt_exit is set to true, other workers need not pursue any
    // further jobs.
    if (firstpass_mt_exit || end_of_frame) break;

    TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id];
    AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;
    ThreadData *td = thread_data->td;

    assert(current_mi_row != -1 &&
           current_mi_row < this_tile->tile_info.mi_row_end);

    const int unit_height_log2 = mi_size_high_log2[fp_block_size];
    av1_first_pass_row(cpi, td, this_tile, current_mi_row >> unit_height_log2,
                       fp_block_size);
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(enc_row_mt_mutex_);
#endif
    row_mt_sync->num_threads_working--;
#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(enc_row_mt_mutex_);
#endif
  }
  error_info->setjmp = 0;
  return 1;
}
#endif

static void launch_loop_filter_rows(AV1_COMMON *cm, EncWorkerData *thread_data,
                                    AV1EncRowMultiThreadInfo *enc_row_mt,
                                    int mib_size_log2) {}

static void set_encoding_done(AV1_COMP *cpi) {}

static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc,
                                    const int filter_level[2]) {}

static int enc_row_mt_worker_hook(void *arg1, void *unused) {}

static int enc_worker_hook(void *arg1, void *unused) {}

void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) {}

void av1_init_cdef_worker(AV1_COMP *cpi) {}

#if !CONFIG_REALTIME_ONLY
void av1_init_lr_mt_buffers(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
  if (lr_sync->sync_range) {
    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
      return;
    int num_lr_workers =
        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
    assert(num_lr_workers <= lr_sync->num_workers);
    lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
    lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
  }
}
#endif

#if CONFIG_MULTITHREAD
void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {}
#endif  // CONFIG_MULTITHREAD

// Computes the number of workers to be considered while allocating memory for a
// multi-threaded module under FPMT.
int av1_get_num_mod_workers_for_alloc(const PrimaryMultiThreadInfo *p_mt_info,
                                      MULTI_THREADED_MODULES mod_name) {}

void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {}

void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {}

// This function will change the state and free the mutex of the corresponding
// workers and terminate the object. The object cannot be re-used unless a
// call to reset() is made.
void av1_terminate_workers(AV1_PRIMARY *ppi) {}

// This function returns 1 if frame parallel encode is supported for
// the current configuration. Returns 0 otherwise.
static inline int is_fpmt_config(const AV1_PRIMARY *ppi,
                                 const AV1EncoderConfig *oxcf) {}

int av1_check_fpmt_config(AV1_PRIMARY *const ppi,
                          const AV1EncoderConfig *const oxcf) {}

// A large thread-count value used to compute the max num_enc_workers possible
// for each resolution. (The specific bound below is an assumption; any value
// comfortably above realistic thread counts serves the purpose.)
#define MAX_THREADS 100000

// Computes the max number of enc workers possible for each resolution.
static inline int compute_max_num_enc_workers(
    CommonModeInfoParams *const mi_params, int mib_size_log2) {}
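// The bound mirrors the wavefront arithmetic visible in
// av1_fp_compute_num_enc_workers() below: the top-right dependency lets at
// most one worker run per two SB columns, capped by the number of SB rows.
// A sketch under that assumption (illustrative only):
static inline int compute_max_num_enc_workers_sketch(
    const CommonModeInfoParams *mi_params, int mib_size_log2) {
  const int num_sb_rows = CEIL_POWER_OF_TWO(mi_params->mi_rows, mib_size_log2);
  const int num_sb_cols = CEIL_POWER_OF_TWO(mi_params->mi_cols, mib_size_log2);
  return AOMMIN((num_sb_cols + 1) >> 1, num_sb_rows);
}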

// Computes the number of frame parallel(fp) contexts to be created
// based on the number of max_enc_workers.
int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {}

// Computes the number of workers to process each of the parallel frames.
static inline int compute_num_workers_per_frame(
    const int num_workers, const int parallel_frame_count) {}

static inline void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
                                              int parallel_frame_count,
                                              int num_fpmt_workers_prepared);

// Prepare level 1 workers. This function is only called for
// parallel_frame_count > 1. This function populates the mt_info structure of
// frame level contexts appropriately by dividing the total number of available
// workers amongst the frames as level 2 workers. It also populates the hook and
// data members of level 1 workers.
static inline void prepare_fpmt_workers(AV1_PRIMARY *ppi,
                                        AV1_COMP_DATA *first_cpi_data,
                                        AVxWorkerHook hook,
                                        int parallel_frame_count) {}

// Launch level 1 workers to perform frame parallel encode.
static inline void launch_fpmt_workers(AV1_PRIMARY *ppi) {}

// Restore worker states after parallel encode.
static inline void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
                                              int parallel_frame_count,
                                              int num_fpmt_workers_prepared) {}

// Synchronize level 1 workers.
static inline void sync_fpmt_workers(AV1_PRIMARY *ppi,
                                     int frames_in_parallel_set) {}

static int get_compressed_data_hook(void *arg1, void *arg2) {}

// This function encodes the raw frame data for each frame in the parallel
// encode set, and outputs the frame bitstreams to the designated buffers.
void av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
                                  AV1_COMP_DATA *const first_cpi_data) {}

static inline void launch_workers(MultiThreadInfo *const mt_info,
                                  int num_workers) {}

static inline void sync_enc_workers(MultiThreadInfo *const mt_info,
                                    AV1_COMMON *const cm, int num_workers) {}
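// Both helpers follow the stock AVxWorker protocol from aom_util/aom_thread.h:
// worker 0 runs on the calling thread via execute(), the others are launched
// on their own threads, and sync() later joins them and surfaces errors. A
// condensed sketch of the launch half (illustrative; the real sync half also
// aggregates per-worker error_info into cm->error):
static inline void launch_workers_sketch(MultiThreadInfo *const mt_info,
                                         int num_workers) {
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &mt_info->workers[i];
    worker->had_error = 0;
    if (i == 0)
      winterface->execute(worker);  // Main thread does its share inline.
    else
      winterface->launch(worker);
  }
}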

static inline void accumulate_counters_enc_workers(AV1_COMP *cpi,
                                                   int num_workers) {}

static inline void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                       int num_workers) {}

#if !CONFIG_REALTIME_ONLY
static inline void fp_prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                          int num_workers) {
  AV1_COMMON *const cm = &cpi->common;
  MultiThreadInfo *const mt_info = &cpi->mt_info;
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &mt_info->workers[i];
    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];

    worker->hook = hook;
    worker->data1 = thread_data;
    worker->data2 = NULL;

    thread_data->thread_id = i;
    // Set the starting tile for each thread.
    thread_data->start = i;

    thread_data->cpi = cpi;
    if (i == 0) {
      thread_data->td = &cpi->td;
    } else {
      thread_data->td = thread_data->original_td;
      // Before encoding a frame, copy the thread data from cpi.
      thread_data->td->mb = cpi->td.mb;
    }
    av1_alloc_src_diff_buf(cm, &thread_data->td->mb);
  }
}
#endif

// Computes the number of workers for row multi-threading of encoding stage
static inline int compute_num_enc_row_mt_workers(const AV1_COMMON *cm,
                                                 int max_threads) {}

// Computes the number of workers for tile multi-threading of encoding stage
static inline int compute_num_enc_tile_mt_workers(const AV1_COMMON *cm,
                                                  int max_threads) {}

// Find max worker of all MT stages
int av1_get_max_num_workers(const AV1_COMP *cpi) {}

// Computes the number of workers for encoding stage (row/tile multi-threading)
static int compute_num_enc_workers(const AV1_COMP *cpi, int max_workers) {}

void av1_encode_tiles_mt(AV1_COMP *cpi) {}

// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int'
// members, so we treat it as an array, and sum over the whole length.
void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
                                 const FRAME_COUNTS *counts) {}
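// A direct rendering of the comment above (minimal sketch; assumes
// FRAME_COUNTS is safe to reinterpret as a flat array of unsigned ints, as
// the comment states):
static inline void accumulate_frame_counts_sketch(FRAME_COUNTS *acc_counts,
                                                  const FRAME_COUNTS *counts) {
  unsigned int *const acc = (unsigned int *)acc_counts;
  const unsigned int *const cnt = (const unsigned int *)counts;
  const unsigned int n = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
  for (unsigned int i = 0; i < n; i++) acc[i] += cnt[i];
}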

// Computes the maximum number of sb rows and sb_cols across tiles which are
// used to allocate memory for multi-threaded encoding with row-mt=1.
static inline void compute_max_sb_rows_cols(const AV1_COMMON *cm,
                                            int *max_sb_rows_in_tile,
                                            int *max_sb_cols_in_tile) {}

#if !CONFIG_REALTIME_ONLY
// Computes the number of workers for firstpass stage (row/tile multi-threading)
int av1_fp_compute_num_enc_workers(AV1_COMP *cpi) {
  AV1_COMMON *cm = &cpi->common;
  const int tile_cols = cm->tiles.cols;
  const int tile_rows = cm->tiles.rows;
  int total_num_threads_row_mt = 0;
  TileInfo tile_info;

  if (cpi->oxcf.max_threads <= 1) return 1;

  for (int row = 0; row < tile_rows; row++) {
    for (int col = 0; col < tile_cols; col++) {
      av1_tile_init(&tile_info, cm, row, col);
      const int num_mb_rows_in_tile =
          av1_get_unit_rows_in_tile(&tile_info, cpi->fp_block_size);
      const int num_mb_cols_in_tile =
          av1_get_unit_cols_in_tile(&tile_info, cpi->fp_block_size);
      total_num_threads_row_mt +=
          AOMMIN((num_mb_cols_in_tile + 1) >> 1, num_mb_rows_in_tile);
    }
  }
  return AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
}

// Computes the maximum number of mb_rows for row multi-threading of firstpass
// stage
static inline int fp_compute_max_mb_rows(const AV1_COMMON *cm,
                                         BLOCK_SIZE fp_block_size) {
  const int tile_rows = cm->tiles.rows;
  const int unit_height_log2 = mi_size_high_log2[fp_block_size];
  const int mib_size_log2 = cm->seq_params->mib_size_log2;
  const int num_mi_rows = cm->mi_params.mi_rows;
  const int *const row_start_sb = cm->tiles.row_start_sb;
  int max_mb_rows = 0;

  for (int row = 0; row < tile_rows; row++) {
    const int mi_row_start = row_start_sb[row] << mib_size_log2;
    const int mi_row_end =
        AOMMIN(row_start_sb[row + 1] << mib_size_log2, num_mi_rows);
    const int num_mb_rows_in_tile =
        CEIL_POWER_OF_TWO(mi_row_end - mi_row_start, unit_height_log2);
    max_mb_rows = AOMMAX(max_mb_rows, num_mb_rows_in_tile);
  }
  return max_mb_rows;
}
#endif

static void lpf_pipeline_mt_init(AV1_COMP *cpi, int num_workers) {}

void av1_encode_tiles_row_mt(AV1_COMP *cpi) {}

#if !CONFIG_REALTIME_ONLY
static void dealloc_thread_data_src_diff_buf(AV1_COMP *cpi, int num_workers) {
  for (int i = num_workers - 1; i >= 0; --i) {
    EncWorkerData *const thread_data = &cpi->mt_info.tile_thr_data[i];
    if (thread_data->td != &cpi->td)
      av1_dealloc_src_diff_buf(&thread_data->td->mb,
                               av1_num_planes(&cpi->common));
  }
}

void av1_fp_encode_tiles_row_mt(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  MultiThreadInfo *const mt_info = &cpi->mt_info;
  AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
  const int tile_cols = cm->tiles.cols;
  const int tile_rows = cm->tiles.rows;
  int *thread_id_to_tile_id = enc_row_mt->thread_id_to_tile_id;
  int num_workers = 0;
  const int max_mb_rows = fp_compute_max_mb_rows(cm, cpi->fp_block_size);
  const bool alloc_row_mt_mem = enc_row_mt->allocated_tile_cols != tile_cols ||
                                enc_row_mt->allocated_tile_rows != tile_rows ||
                                enc_row_mt->allocated_rows != max_mb_rows;
  const bool alloc_tile_data = cpi->allocated_tiles < tile_cols * tile_rows;

  assert(IMPLIES(cpi->tile_data == NULL, alloc_tile_data));
  if (alloc_tile_data) {
    av1_alloc_tile_data(cpi);
  }

  assert(IMPLIES(alloc_tile_data, alloc_row_mt_mem));
  if (alloc_row_mt_mem) {
    row_mt_mem_alloc(cpi, max_mb_rows, -1, 0);
  }

  av1_init_tile_data(cpi);

  // For pass = 1, compute the number of workers needed. For single-pass
  // encode (pass = 0), the number of workers is already computed.
  if (mt_info->num_mod_workers[MOD_FP] == 0)
    num_workers = av1_fp_compute_num_enc_workers(cpi);
  else
    num_workers = mt_info->num_mod_workers[MOD_FP];

  memset(thread_id_to_tile_id, -1,
         sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS);
  enc_row_mt->firstpass_mt_exit = false;

  for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
    for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
      int tile_index = tile_row * tile_cols + tile_col;
      TileDataEnc *const this_tile = &cpi->tile_data[tile_index];
      AV1EncRowMultiThreadSync *const row_mt_sync = &this_tile->row_mt_sync;

      // Initialize num_finished_cols to -1 for all rows.
      memset(row_mt_sync->num_finished_cols, -1,
             sizeof(*row_mt_sync->num_finished_cols) * max_mb_rows);
      row_mt_sync->next_mi_row = this_tile->tile_info.mi_row_start;
      row_mt_sync->num_threads_working = 0;

      // intraBC mode is not evaluated during first-pass encoding. Hence, no
      // additional top-right delay is required.
      row_mt_sync->intrabc_extra_top_right_sb_delay = 0;
    }
  }

  num_workers = AOMMIN(num_workers, mt_info->num_workers);
  assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
                        num_workers);
  fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
  launch_workers(&cpi->mt_info, num_workers);
  sync_enc_workers(&cpi->mt_info, cm, num_workers);
  dealloc_thread_data_src_diff_buf(cpi, num_workers);
}

void av1_tpl_row_mt_sync_read_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
                                    int r, int c) {
  (void)tpl_mt_sync;
  (void)r;
  (void)c;
}

void av1_tpl_row_mt_sync_write_dummy(AV1TplRowMultiThreadSync *tpl_mt_sync,
                                     int r, int c, int cols) {
  (void)tpl_mt_sync;
  (void)r;
  (void)c;
  (void)cols;
}

void av1_tpl_row_mt_sync_read(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
                              int c) {
#if CONFIG_MULTITHREAD
  int nsync = tpl_row_mt_sync->sync_range;

  if (r) {
    pthread_mutex_t *const mutex = &tpl_row_mt_sync->mutex_[r - 1];
    pthread_mutex_lock(mutex);

    while (c > tpl_row_mt_sync->num_finished_cols[r - 1] - nsync)
      pthread_cond_wait(&tpl_row_mt_sync->cond_[r - 1], mutex);
    pthread_mutex_unlock(mutex);
  }
#else
  (void)tpl_row_mt_sync;
  (void)r;
  (void)c;
#endif  // CONFIG_MULTITHREAD
}

void av1_tpl_row_mt_sync_write(AV1TplRowMultiThreadSync *tpl_row_mt_sync, int r,
                               int c, int cols) {
#if CONFIG_MULTITHREAD
  int nsync = tpl_row_mt_sync->sync_range;
  int cur;
  // Only signal when there are enough encoded blocks for the next row to run.
  int sig = 1;

  if (c < cols - 1) {
    cur = c;
    if (c % nsync) sig = 0;
  } else {
    cur = cols + nsync;
  }

  if (sig) {
    pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]);

    // When a thread encounters an error, num_finished_cols[r] is set to the
    // maximum column number. In that case, the AOMMAX operation here ensures
    // that num_finished_cols[r] is not overwritten with a smaller value, which
    // would leave threads waiting forever in the corresponding sync_read().
    tpl_row_mt_sync->num_finished_cols[r] =
        AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur);

    pthread_cond_signal(&tpl_row_mt_sync->cond_[r]);
    pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]);
  }
#else
  (void)tpl_row_mt_sync;
  (void)r;
  (void)c;
  (void)cols;
#endif  // CONFIG_MULTITHREAD
}

static inline void set_mode_estimation_done(AV1_COMP *cpi) {
  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
  TplParams *const tpl_data = &cpi->ppi->tpl_data;
  const BLOCK_SIZE bsize =
      convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
  const int mi_height = mi_size_high[bsize];
  AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
  const int tplb_cols_in_tile =
      ROUND_POWER_OF_TWO(mi_params->mi_cols, mi_size_wide_log2[bsize]);
  // In case of tpl row-multithreading, due to top-right dependency, the worker
  // on an mb_row waits for the completion of the tpl processing of the top and
  // top-right blocks. Hence, in case a thread (main/worker) encounters an
  // error, update that the tpl processing of every mb_row in the frame is
  // complete in order to avoid dependent workers waiting indefinitely.
  for (int mi_row = 0, tplb_row = 0; mi_row < mi_params->mi_rows;
       mi_row += mi_height, tplb_row++) {
    (*tpl_row_mt->sync_write_ptr)(&tpl_data->tpl_mt_sync, tplb_row,
                                  tplb_cols_in_tile - 1, tplb_cols_in_tile);
  }
}

// Each worker calls tpl_worker_hook() and computes the tpl data.
static int tpl_worker_hook(void *arg1, void *unused) {
  (void)unused;
  EncWorkerData *thread_data = (EncWorkerData *)arg1;
  AV1_COMP *cpi = thread_data->cpi;
  AV1_COMMON *cm = &cpi->common;
  MACROBLOCK *x = &thread_data->td->mb;
  MACROBLOCKD *xd = &x->e_mbd;
  TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
  TplBuffers *tpl_tmp_buffers = &thread_data->td->tpl_tmp_buffers;
  CommonModeInfoParams *mi_params = &cm->mi_params;
  int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;

  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  xd->error_info = error_info;
  AV1TplRowMultiThreadInfo *const tpl_row_mt = &cpi->mt_info.tpl_row_mt;
  (void)tpl_row_mt;
#if CONFIG_MULTITHREAD
  pthread_mutex_t *tpl_error_mutex_ = tpl_row_mt->mutex_;
#endif

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(tpl_error_mutex_);
    tpl_row_mt->tpl_mt_exit = true;
    pthread_mutex_unlock(tpl_error_mutex_);
#endif
    set_mode_estimation_done(cpi);
    return 0;
  }
  error_info->setjmp = 1;

  BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
  TX_SIZE tx_size = max_txsize_lookup[bsize];
  int mi_height = mi_size_high[bsize];

  av1_init_tpl_txfm_stats(tpl_txfm_stats);

  for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows;
       mi_row += num_active_workers * mi_height) {
    // Motion estimation row boundary
    av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
                          cpi->oxcf.border_in_pixels);
    xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
    xd->mb_to_bottom_edge =
        GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
    av1_mc_flow_dispenser_row(cpi, tpl_txfm_stats, tpl_tmp_buffers, x, mi_row,
                              bsize, tx_size);
  }
  error_info->setjmp = 0;
  return 1;
}

// Deallocate tpl synchronization related mutex and data.
void av1_tpl_dealloc(AV1TplRowMultiThreadSync *tpl_sync) {
  assert(tpl_sync != NULL);

#if CONFIG_MULTITHREAD
  if (tpl_sync->mutex_ != NULL) {
    for (int i = 0; i < tpl_sync->rows; ++i)
      pthread_mutex_destroy(&tpl_sync->mutex_[i]);
    aom_free(tpl_sync->mutex_);
  }
  if (tpl_sync->cond_ != NULL) {
    for (int i = 0; i < tpl_sync->rows; ++i)
      pthread_cond_destroy(&tpl_sync->cond_[i]);
    aom_free(tpl_sync->cond_);
  }
#endif  // CONFIG_MULTITHREAD

  aom_free(tpl_sync->num_finished_cols);
  // Clear the structure: the source of this call may be a resize, in which
  // case this call will be followed by an _alloc() that may fail.
  av1_zero(*tpl_sync);
}

// Allocate memory for tpl row synchronization.
static void av1_tpl_alloc(AV1TplRowMultiThreadSync *tpl_sync, AV1_COMMON *cm,
                          int mb_rows) {
  tpl_sync->rows = mb_rows;
#if CONFIG_MULTITHREAD
  {
    CHECK_MEM_ERROR(cm, tpl_sync->mutex_,
                    aom_malloc(sizeof(*tpl_sync->mutex_) * mb_rows));
    if (tpl_sync->mutex_) {
      for (int i = 0; i < mb_rows; ++i)
        pthread_mutex_init(&tpl_sync->mutex_[i], NULL);
    }

    CHECK_MEM_ERROR(cm, tpl_sync->cond_,
                    aom_malloc(sizeof(*tpl_sync->cond_) * mb_rows));
    if (tpl_sync->cond_) {
      for (int i = 0; i < mb_rows; ++i)
        pthread_cond_init(&tpl_sync->cond_[i], NULL);
    }
  }
#endif  // CONFIG_MULTITHREAD
  CHECK_MEM_ERROR(cm, tpl_sync->num_finished_cols,
                  aom_malloc(sizeof(*tpl_sync->num_finished_cols) * mb_rows));

  // Set up nsync.
  tpl_sync->sync_range = 1;
}

// Each worker is prepared by assigning the hook function and individual thread
// data.
static inline void prepare_tpl_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                       int num_workers) {
  MultiThreadInfo *mt_info = &cpi->mt_info;
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *worker = &mt_info->workers[i];
    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];

    worker->hook = hook;
    worker->data1 = thread_data;
    worker->data2 = NULL;

    thread_data->thread_id = i;
    // Set the starting tile for each thread.
    thread_data->start = i;

    thread_data->cpi = cpi;
    if (i == 0) {
      thread_data->td = &cpi->td;
    } else {
      thread_data->td = thread_data->original_td;
    }

    // Before encoding a frame, copy the thread data from cpi.
    if (thread_data->td != &cpi->td) {
      thread_data->td->mb = cpi->td.mb;
      // OBMC buffers are used only to init MS params and remain unused when
      // called from tpl, hence set the buffers to defaults.
      av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
      if (!tpl_alloc_temp_buffers(&thread_data->td->tpl_tmp_buffers,
                                  cpi->ppi->tpl_data.tpl_bsize_1d)) {
        aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
                           "Error allocating tpl data");
      }
      thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
      thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
    }
  }
}

#if CONFIG_BITRATE_ACCURACY
// Accumulate transform stats after tpl.
static void tpl_accumulate_txfm_stats(ThreadData *main_td,
                                      const MultiThreadInfo *mt_info,
                                      int num_workers) {
  TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats;
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &mt_info->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
    ThreadData *td = thread_data->td;
    if (td != main_td) {
      const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
      av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats);
    }
  }
}
#endif  // CONFIG_BITRATE_ACCURACY

// Implements multi-threading for tpl.
void av1_mc_flow_dispenser_mt(AV1_COMP *cpi) {
  AV1_COMMON *cm = &cpi->common;
  CommonModeInfoParams *mi_params = &cm->mi_params;
  MultiThreadInfo *mt_info = &cpi->mt_info;
  TplParams *tpl_data = &cpi->ppi->tpl_data;
  AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
  int mb_rows = mi_params->mb_rows;
  int num_workers =
      AOMMIN(mt_info->num_mod_workers[MOD_TPL], mt_info->num_workers);

  if (mb_rows != tpl_sync->rows) {
    av1_tpl_dealloc(tpl_sync);
    av1_tpl_alloc(tpl_sync, cm, mb_rows);
  }
  tpl_sync->num_threads_working = num_workers;
  mt_info->tpl_row_mt.tpl_mt_exit = false;

  // Initialize num_finished_cols to -1 for all MB rows.
  memset(tpl_sync->num_finished_cols, -1,
         sizeof(*tpl_sync->num_finished_cols) * mb_rows);

  prepare_tpl_workers(cpi, tpl_worker_hook, num_workers);
  launch_workers(&cpi->mt_info, num_workers);
  sync_enc_workers(&cpi->mt_info, cm, num_workers);
#if CONFIG_BITRATE_ACCURACY
  tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
#endif  // CONFIG_BITRATE_ACCURACY
  for (int i = num_workers - 1; i >= 0; i--) {
    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
    ThreadData *td = thread_data->td;
    if (td != &cpi->td) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers);
  }
}

// Deallocate memory for temporal filter multi-thread synchronization.
void av1_tf_mt_dealloc(AV1TemporalFilterSync *tf_sync) {
  assert(tf_sync != NULL);
#if CONFIG_MULTITHREAD
  if (tf_sync->mutex_ != NULL) {
    pthread_mutex_destroy(tf_sync->mutex_);
    aom_free(tf_sync->mutex_);
  }
#endif  // CONFIG_MULTITHREAD
  tf_sync->next_tf_row = 0;
}

// Checks if a job is available. If so, populates current_mb_row with the row
// to filter, advances next_tf_row, and returns 1; else returns 0.
static inline int tf_get_next_job(AV1TemporalFilterSync *tf_mt_sync,
                                  int *current_mb_row, int mb_rows) {
  int do_next_row = 0;
#if CONFIG_MULTITHREAD
  pthread_mutex_t *tf_mutex_ = tf_mt_sync->mutex_;
  pthread_mutex_lock(tf_mutex_);
#endif
  if (!tf_mt_sync->tf_mt_exit && tf_mt_sync->next_tf_row < mb_rows) {
    *current_mb_row = tf_mt_sync->next_tf_row;
    tf_mt_sync->next_tf_row++;
    do_next_row = 1;
  }
#if CONFIG_MULTITHREAD
  pthread_mutex_unlock(tf_mutex_);
#endif
  return do_next_row;
}

// Hook function for each thread in temporal filter multi-threading.
static int tf_worker_hook(void *arg1, void *unused) {
  (void)unused;
  EncWorkerData *thread_data = (EncWorkerData *)arg1;
  AV1_COMP *cpi = thread_data->cpi;
  ThreadData *td = thread_data->td;
  TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
  AV1TemporalFilterSync *tf_sync = &cpi->mt_info.tf_sync;
  const struct scale_factors *scale = &cpi->tf_ctx.sf;

#if CONFIG_MULTITHREAD
  pthread_mutex_t *tf_mutex_ = tf_sync->mutex_;
#endif
  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  xd->error_info = error_info;

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(tf_mutex_);
    tf_sync->tf_mt_exit = true;
    pthread_mutex_unlock(tf_mutex_);
#endif
    return 0;
  }
  error_info->setjmp = 1;

  const int num_planes = av1_num_planes(&cpi->common);
  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);

  MACROBLOCKD *mbd = &td->mb.e_mbd;
  uint8_t *input_buffer[MAX_MB_PLANE];
  MB_MODE_INFO **input_mb_mode_info;
  tf_save_state(mbd, &input_mb_mode_info, input_buffer, num_planes);
  tf_setup_macroblockd(mbd, &td->tf_data, scale);

  int current_mb_row = -1;

  while (tf_get_next_job(tf_sync, &current_mb_row, tf_ctx->mb_rows))
    av1_tf_do_filtering_row(cpi, td, current_mb_row);

  tf_restore_state(mbd, input_mb_mode_info, input_buffer, num_planes);

  error_info->setjmp = 0;
  return 1;
}

// Assigns temporal filter hook function and thread data to each worker.
static void prepare_tf_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                               int num_workers, int is_highbitdepth) {
  MultiThreadInfo *mt_info = &cpi->mt_info;
  mt_info->tf_sync.next_tf_row = 0;
  mt_info->tf_sync.tf_mt_exit = false;
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *worker = &mt_info->workers[i];
    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];

    worker->hook = hook;
    worker->data1 = thread_data;
    worker->data2 = NULL;

    thread_data->thread_id = i;
    // Set the starting tile for each thread.
    thread_data->start = i;

    thread_data->cpi = cpi;
    if (i == 0) {
      thread_data->td = &cpi->td;
    } else {
      thread_data->td = thread_data->original_td;
    }

    // Before encoding a frame, copy the thread data from cpi.
    if (thread_data->td != &cpi->td) {
      thread_data->td->mb = cpi->td.mb;
      // OBMC buffers are used only to init MS params and remain unused when
      // called from tf, hence set the buffers to defaults.
      av1_init_obmc_buffer(&thread_data->td->mb.obmc_buffer);
      if (!tf_alloc_and_reset_data(&thread_data->td->tf_data,
                                   cpi->tf_ctx.num_pels, is_highbitdepth)) {
        aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
                           "Error allocating temporal filter data");
      }
    }
  }
}

// Deallocate thread specific data for temporal filter.
static void tf_dealloc_thread_data(AV1_COMP *cpi, int num_workers,
                                   int is_highbitdepth) {
  MultiThreadInfo *mt_info = &cpi->mt_info;
  for (int i = num_workers - 1; i >= 0; i--) {
    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];
    ThreadData *td = thread_data->td;
    if (td != &cpi->td) tf_dealloc_data(&td->tf_data, is_highbitdepth);
  }
}

// Accumulate sse and sum after temporal filtering.
static void tf_accumulate_frame_diff(AV1_COMP *cpi, int num_workers) {
  FRAME_DIFF *total_diff = &cpi->td.tf_data.diff;
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *const worker = &cpi->mt_info.workers[i];
    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
    ThreadData *td = thread_data->td;
    FRAME_DIFF *diff = &td->tf_data.diff;
    if (td != &cpi->td) {
      total_diff->sse += diff->sse;
      total_diff->sum += diff->sum;
    }
  }
}

// Implements multi-threading for temporal filter.
void av1_tf_do_filtering_mt(AV1_COMP *cpi) {
  AV1_COMMON *cm = &cpi->common;
  MultiThreadInfo *mt_info = &cpi->mt_info;
  const int is_highbitdepth = cpi->tf_ctx.is_highbitdepth;

  int num_workers =
      AOMMIN(mt_info->num_mod_workers[MOD_TF], mt_info->num_workers);

  prepare_tf_workers(cpi, tf_worker_hook, num_workers, is_highbitdepth);
  launch_workers(mt_info, num_workers);
  sync_enc_workers(mt_info, cm, num_workers);
  tf_accumulate_frame_diff(cpi, num_workers);
  tf_dealloc_thread_data(cpi, num_workers, is_highbitdepth);
}

// Checks if a job is available in the current direction. If so, populates
// frame_idx and returns 1; else returns 0.
static inline int get_next_gm_job(AV1_COMP *cpi, int *frame_idx, int cur_dir) {
  GlobalMotionInfo *gm_info = &cpi->gm_info;
  GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info;

  int total_refs = gm_info->num_ref_frames[cur_dir];
  int8_t cur_frame_to_process = job_info->next_frame_to_process[cur_dir];

  if (cur_frame_to_process < total_refs && !job_info->early_exit[cur_dir]) {
    *frame_idx = gm_info->reference_frames[cur_dir][cur_frame_to_process].frame;
    job_info->next_frame_to_process[cur_dir] += 1;
    return 1;
  }
  return 0;
}

// Switches the current direction and calls the function get_next_gm_job() if
// the speed feature 'prune_ref_frame_for_gm_search' is not set.
static inline void switch_direction(AV1_COMP *cpi, int *frame_idx,
                                    int *cur_dir) {
  if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search) return;
  // Switch the direction and get next job
  *cur_dir = !(*cur_dir);
  get_next_gm_job(cpi, frame_idx, *(cur_dir));
}

// Hook function for each thread in global motion multi-threading.
static int gm_mt_worker_hook(void *arg1, void *unused) {
  (void)unused;

  EncWorkerData *thread_data = (EncWorkerData *)arg1;
  AV1_COMP *cpi = thread_data->cpi;
  GlobalMotionInfo *gm_info = &cpi->gm_info;
  AV1GlobalMotionSync *gm_sync = &cpi->mt_info.gm_sync;
  GlobalMotionJobInfo *job_info = &gm_sync->job_info;
  int thread_id = thread_data->thread_id;
  GlobalMotionData *gm_thread_data = &thread_data->td->gm_data;
#if CONFIG_MULTITHREAD
  pthread_mutex_t *gm_mt_mutex_ = gm_sync->mutex_;
#endif

  MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd;
  struct aom_internal_error_info *const error_info = &thread_data->error_info;
  xd->error_info = error_info;

  // The jmp_buf is valid only for the duration of the function that calls
  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
  // before it returns.
  if (setjmp(error_info->jmp)) {
    error_info->setjmp = 0;
#if CONFIG_MULTITHREAD
    pthread_mutex_lock(gm_mt_mutex_);
    gm_sync->gm_mt_exit = true;
    pthread_mutex_unlock(gm_mt_mutex_);
#endif
    return 0;
  }
  error_info->setjmp = 1;

  int cur_dir = job_info->thread_id_to_dir[thread_id];
  bool gm_mt_exit = false;
  while (1) {
    int ref_buf_idx = -1;

#if CONFIG_MULTITHREAD
    pthread_mutex_lock(gm_mt_mutex_);
#endif

    gm_mt_exit = gm_sync->gm_mt_exit;
    // Populates ref_buf_idx (the reference frame type) for which global
    // motion estimation will be done.
    if (!gm_mt_exit && !get_next_gm_job(cpi, &ref_buf_idx, cur_dir)) {
      // No jobs are available for the current direction. Switch
      // to other direction and get the next job, if available.
      switch_direction(cpi, &ref_buf_idx, &cur_dir);
    }

#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(gm_mt_mutex_);
#endif

    // When gm_mt_exit is set to true, other workers need not pursue any
    // further jobs.
    if (gm_mt_exit || ref_buf_idx == -1) break;

    // Compute global motion for the given ref_buf_idx.
    av1_compute_gm_for_valid_ref_frames(
        cpi, error_info, gm_info->ref_buf, ref_buf_idx,
        gm_thread_data->motion_models, gm_thread_data->segment_map,
        gm_info->segment_map_w, gm_info->segment_map_h);

#if CONFIG_MULTITHREAD
    pthread_mutex_lock(gm_mt_mutex_);
#endif
    // If the global motion w.r.t. the current ref frame is
    // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion
    // w.r.t. the remaining ref frames in that direction.
    if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search &&
        cpi->common.global_motion[ref_buf_idx].wmtype <= TRANSLATION)
      job_info->early_exit[cur_dir] = 1;

#if CONFIG_MULTITHREAD
    pthread_mutex_unlock(gm_mt_mutex_);
#endif
  }
  error_info->setjmp = 0;
  return 1;
}

// Assigns global motion hook function and thread data to each worker.
static inline void prepare_gm_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                      int num_workers) {
  MultiThreadInfo *mt_info = &cpi->mt_info;
  mt_info->gm_sync.gm_mt_exit = false;
  for (int i = num_workers - 1; i >= 0; i--) {
    AVxWorker *worker = &mt_info->workers[i];
    EncWorkerData *thread_data = &mt_info->tile_thr_data[i];

    worker->hook = hook;
    worker->data1 = thread_data;
    worker->data2 = NULL;

    thread_data->thread_id = i;
    // Set the starting tile for each thread.
    thread_data->start = i;

    thread_data->cpi = cpi;
    if (i == 0) {
      thread_data->td = &cpi->td;
    } else {
      thread_data->td = thread_data->original_td;
    }

    if (thread_data->td != &cpi->td)
      gm_alloc_data(cpi, &thread_data->td->gm_data);
  }
}

// Assigns available threads to past/future direction.
static inline void assign_thread_to_dir(int8_t *thread_id_to_dir,
                                        int num_workers) {
  int8_t frame_dir_idx = 0;

  for (int i = 0; i < num_workers; i++) {
    thread_id_to_dir[i] = frame_dir_idx++;
    if (frame_dir_idx == MAX_DIRECTIONS) frame_dir_idx = 0;
  }
}

// Computes number of workers for global motion multi-threading.
static inline int compute_gm_workers(const AV1_COMP *cpi) {
  int total_refs =
      cpi->gm_info.num_ref_frames[0] + cpi->gm_info.num_ref_frames[1];
  int num_gm_workers = cpi->sf.gm_sf.prune_ref_frame_for_gm_search
                           ? AOMMIN(MAX_DIRECTIONS, total_refs)
                           : total_refs;
  num_gm_workers = AOMMIN(num_gm_workers, cpi->mt_info.num_workers);
  return num_gm_workers;
}

// Frees the memory allocated for each worker in global motion multi-threading.
static inline void gm_dealloc_thread_data(AV1_COMP *cpi, int num_workers) {
  MultiThreadInfo *mt_info = &cpi->mt_info;
  for (int j = 0; j < num_workers; j++) {
    EncWorkerData *thread_data = &mt_info->tile_thr_data[j];
    ThreadData *td = thread_data->td;
    if (td != &cpi->td) gm_dealloc_data(&td->gm_data);
  }
}

// Implements multi-threading for global motion.
void av1_global_motion_estimation_mt(AV1_COMP *cpi) {
  GlobalMotionJobInfo *job_info = &cpi->mt_info.gm_sync.job_info;

  av1_zero(*job_info);

  int num_workers = compute_gm_workers(cpi);

  assign_thread_to_dir(job_info->thread_id_to_dir, num_workers);
  prepare_gm_workers(cpi, gm_mt_worker_hook, num_workers);
  launch_workers(&cpi->mt_info, num_workers);
  sync_enc_workers(&cpi->mt_info, &cpi->common, num_workers);
  gm_dealloc_thread_data(cpi, num_workers);
}
#endif  // !CONFIG_REALTIME_ONLY

static inline int get_next_job_allintra(
    AV1EncRowMultiThreadSync *const row_mt_sync, const int mi_row_end,
    int *current_mi_row, int mib_size) {}

static inline void prepare_wiener_var_workers(AV1_COMP *const cpi,
                                              AVxWorkerHook hook,
                                              const int num_workers) {}

static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) {}

static int cal_mb_wiener_var_hook(void *arg1, void *unused) {}

static void dealloc_mb_wiener_var_mt_data(AV1_COMP *cpi, int num_workers) {}

// This function is the multi-threaded version of computing the wiener
// variance.
// Note that the wiener variance is used only in allintra mode (1 pass), and
// it is computed before the frame is encoded, so there is no need to consider
// the number of tiles; instead, all available threads are allocated to the
// computation.
void av1_calc_mb_wiener_var_mt(AV1_COMP *cpi, int num_workers,
                               double *sum_rec_distortion,
                               double *sum_est_rate) {}
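// The orchestration here follows the same shape as av1_tf_do_filtering_mt()
// above: prepare the worker hooks (prepare_wiener_var_workers), launch and
// sync the workers, then release the per-thread data
// (dealloc_mb_wiener_var_mt_data).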

// Compare and order tiles based on absolute sum of tx coeffs.
static int compare_tile_order(const void *a, const void *b) {}
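// A sketch of the comparator (illustrative; assumes each PackBSTileOrder
// entry carries an abs_sum_level and a tile_idx): tiles with larger absolute
// coefficient sums are packed first, with tile_idx as a stable tie-breaker.
static int compare_tile_order_sketch(const void *a, const void *b) {
  const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
  const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;

  if (tile_a->abs_sum_level > tile_b->abs_sum_level) return -1;
  if (tile_a->abs_sum_level < tile_b->abs_sum_level) return 1;
  return (tile_a->tile_idx > tile_b->tile_idx) ? 1 : -1;
}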

// Get next tile index to be processed for pack bitstream
static inline int get_next_pack_bs_tile_idx(
    AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) {}

// Calculates bitstream chunk size based on total buffer size and tile or tile
// group size.
static inline size_t get_bs_chunk_size(int tg_or_tile_size,
                                       const int frame_or_tg_size,
                                       size_t *remain_buf_size,
                                       size_t max_buf_size, int is_last_chunk) {}

// Initializes params required for pack bitstream tile.
static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst,
                                     struct aom_write_bit_buffer *saved_wb,
                                     PackBSParams *const pack_bs_params_arr,
                                     uint8_t obu_extn_header) {}

// Worker hook function of pack bitsteam multithreading.
static int pack_bs_worker_hook(void *arg1, void *arg2) {}

// Prepares thread data and workers of pack bitsteam multithreading.
static void prepare_pack_bs_workers(AV1_COMP *const cpi,
                                    PackBSParams *const pack_bs_params,
                                    AVxWorkerHook hook, const int num_workers) {}

// Accumulates data after pack bitsteam processing.
static void accumulate_pack_bs_data(
    AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
    uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
    int *const largest_tile_id, unsigned int *max_tile_size,
    uint32_t *const obu_header_size, uint8_t **tile_data_start,
    const int num_workers) {}

void av1_write_tile_obu_mt(
    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
    unsigned int *max_tile_size, uint32_t *const obu_header_size,
    uint8_t **tile_data_start, const int num_workers) {}

// Deallocate memory for CDEF search multi-thread synchronization.
void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {}

// Updates the row and column indices of the next job to be processed.
// Also sets the end_of_frame flag when the processing of all blocks is
// complete.
static void update_next_job_info(AV1CdefSync *cdef_sync, int nvfb, int nhfb) {}
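// A sketch of the raster-order advance described above (illustrative;
// assumes AV1CdefSync tracks the current filter-block row/column as fbr/fbc
// plus an end_of_frame flag):
static void update_next_job_info_sketch(AV1CdefSync *cdef_sync, int nvfb,
                                        int nhfb) {
  cdef_sync->fbc++;  // Next 64x64 filter-block column.
  if (cdef_sync->fbc == nhfb) {
    // Row exhausted: wrap to the first column of the next row, or flag
    // completion once every filter block in the frame has been handed out.
    cdef_sync->fbc = 0;
    cdef_sync->fbr++;
    if (cdef_sync->fbr == nvfb) cdef_sync->end_of_frame = 1;
  }
}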

// Initializes cdef_sync parameters.
static inline void cdef_reset_job_info(AV1CdefSync *cdef_sync) {}

// Checks if a job is available. If so, populates the next job information
// and returns 1; else returns 0.
static inline int cdef_get_next_job(AV1CdefSync *cdef_sync,
                                    CdefSearchCtx *cdef_search_ctx,
                                    volatile int *cur_fbr,
                                    volatile int *cur_fbc,
                                    volatile int *sb_count) {}

// Hook function for each thread in CDEF search multi-threading.
static int cdef_filter_block_worker_hook(void *arg1, void *arg2) {}

// Assigns CDEF search hook function and thread data to each worker.
static void prepare_cdef_workers(AV1_COMP *cpi, AVxWorkerHook hook,
                                 int num_workers) {}

// Implements multi-threading for CDEF search.
void av1_cdef_mse_calc_frame_mt(AV1_COMP *cpi) {}

// Computes num_workers for temporal filter multi-threading.
static inline int compute_num_tf_workers(const AV1_COMP *cpi) {}

// Computes num_workers for tpl multi-threading.
static inline int compute_num_tpl_workers(AV1_COMP *cpi) {}

// Computes num_workers for loop filter multi-threading.
static inline int compute_num_lf_workers(AV1_COMP *cpi) {}

// Computes num_workers for cdef multi-threading.
static inline int compute_num_cdef_workers(AV1_COMP *cpi) {}

// Computes num_workers for loop-restoration multi-threading.
static inline int compute_num_lr_workers(AV1_COMP *cpi) {}

// Computes num_workers for pack bitstream multi-threading.
static inline int compute_num_pack_bs_workers(AV1_COMP *cpi) {}

// Computes num_workers for all intra multi-threading.
static inline int compute_num_ai_workers(AV1_COMP *cpi) {}

static int compute_num_mod_workers(AV1_COMP *cpi,
                                   MULTI_THREADED_MODULES mod_name) {}

// Computes the number of workers for each MT module in the encoder.
void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {}