#include <assert.h>
#define OPJ_SKIP_POISON
#include "opj_includes.h"
#ifdef __SSE__
#include <xmmintrin.h>
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSSE3__
#include <tmmintrin.h>
#endif
#ifdef __AVX2__
#include <immintrin.h>
#endif
#if defined(__GNUC__)
#pragma GCC poison malloc calloc realloc free
#endif
#ifdef __AVX2__
#define VREG_INT_COUNT …
#else
#define VREG_INT_COUNT …
#endif
#define PARALLEL_COLS_53 …
opj_dwt_t;
#define NB_ELTS_V8 …
opj_v8_t;
opj_v8dwt_t ;
static const OPJ_FLOAT32 opj_dwt_alpha = …;
static const OPJ_FLOAT32 opj_dwt_beta = …;
static const OPJ_FLOAT32 opj_dwt_gamma = …;
static const OPJ_FLOAT32 opj_dwt_delta = …;
static const OPJ_FLOAT32 opj_K = …;
static const OPJ_FLOAT32 opj_invK = …;
static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
OPJ_INT32 * OPJ_RESTRICT b,
OPJ_INT32 dn,
OPJ_INT32 sn, OPJ_INT32 cas);
static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas);
static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
opj_stepsize_t *bandno_stepsize);
/* Forward declaration of the tile decoder defined later in this file.
 * The third parameter is spelled numres here so that the declaration and
 * the definition use the same parameter name. */
static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
                                    const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres);
static OPJ_BOOL opj_dwt_decode_partial_tile(
opj_tcd_tilecomp_t* tilec,
OPJ_UINT32 numres);
opj_encode_and_deinterleave_v_fnptr_type;
opj_encode_and_deinterleave_h_one_row_fnptr_type;
static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
opj_tcd_tilecomp_t * tilec,
opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
opj_encode_and_deinterleave_h_one_row_fnptr_type
p_encode_and_deinterleave_h_one_row);
static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
OPJ_UINT32 i);
#define IDX_S(i) …
#define IDX_D(i) …
#define UNDERFLOW_SN(i) …
#define UNDERFLOW_DN(i) …
#define OVERFLOW_S(i) …
#define OVERFLOW_D(i) …
#define OPJ_S(i) …
#define OPJ_D(i) …
#define OPJ_S_(i) …
#define OPJ_D_(i) …
#define OPJ_SS_(i) …
#define OPJ_DD_(i) …
static const OPJ_FLOAT64 opj_dwt_norms[4][10] = …;
static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = …;
static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
OPJ_INT32 * OPJ_RESTRICT b,
OPJ_INT32 dn,
OPJ_INT32 sn, OPJ_INT32 cas)
{ … }
#ifdef STANDARD_SLOW_VERSION
/**
 * Interleave one line of coefficients into the working buffer h->mem
 * (only built when STANDARD_SLOW_VERSION is defined).
 *
 * The first h->sn values of a are stored in every other slot of h->mem,
 * starting at index h->cas. The next h->dn values of a are stored in every
 * other slot starting at index 1 - h->cas.
 *
 * @param h  transform descriptor; its mem, sn, dn and cas fields are read
 * @param a  source line: h->sn values immediately followed by h->dn values
 */
static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
{
    const OPJ_INT32 *src = a;
    OPJ_INT32 *dst = h->mem + h->cas;
    OPJ_INT32 remaining;

    /* First group: h->sn consecutive inputs, written two slots apart. */
    for (remaining = h->sn; remaining != 0; --remaining) {
        dst[0] = src[0];
        ++src;
        dst += 2;
    }

    /* Second group: h->dn inputs, written to the slots of the other parity. */
    src = a + h->sn;
    dst = h->mem + 1 - h->cas;
    for (remaining = h->dn; remaining != 0; --remaining) {
        dst[0] = src[0];
        ++src;
        dst += 2;
    }
}
/**
 * Interleave one strided column of coefficients into the working buffer
 * v->mem (only built when STANDARD_SLOW_VERSION is defined).
 *
 * Source elements are read x entries apart. The first v->sn of them are
 * stored in every other slot of v->mem starting at index v->cas; the next
 * v->dn, beginning at a + v->sn * x, are stored in every other slot starting
 * at index 1 - v->cas.
 *
 * @param v  transform descriptor; its mem, sn, dn and cas fields are read
 * @param a  first element of the source column
 * @param x  distance, in elements, between consecutive source entries
 */
static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
{
    const OPJ_INT32 *src = a;
    OPJ_INT32 *dst = v->mem + v->cas;
    OPJ_INT32 remaining;

    /* First group: v->sn strided inputs. */
    for (remaining = v->sn; remaining != 0; --remaining) {
        dst[0] = src[0];
        dst += 2;
        src += x;
    }

    /* Second group: v->dn strided inputs; the offset is computed in
     * OPJ_SIZE_T exactly as before. */
    src = a + (v->sn * (OPJ_SIZE_T)x);
    dst = v->mem + 1 - v->cas;
    for (remaining = v->dn; remaining != 0; --remaining) {
        dst[0] = src[0];
        dst += 2;
        src += x;
    }
}
#endif
#ifdef STANDARD_SLOW_VERSION
/**
 * In-place 1-D integer lifting pass over the interleaved buffer a
 * (only built when STANDARD_SLOW_VERSION is defined).
 *
 * All element accesses go through the OPJ_S / OPJ_D macro family defined
 * earlier in this file. NOTE(review): given the neighbouring opj_idwt53_*
 * routines this looks like the inverse 5-3 transform; confirm against the
 * macro definitions.
 *
 * @param a        interleaved coefficient buffer, modified in place
 * @param a_count  number of elements in a (not referenced directly here;
 *                 presumably available to the access macros - verify)
 * @param dn       loop bound of the second pass when cas == 0, and of the
 *                 second pass when cas != 0
 * @param sn       loop bound of the first pass in both cases
 * @param cas      selects which of the two lifting arrangements is applied
 */
static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn,
                              OPJ_INT32 sn, OPJ_INT32 cas)
{
    OPJ_INT32 i;

    if (cas) {
        /* Single-element special case: just halve the value. */
        if (sn == 0 && dn == 1) {
            OPJ_S(0) /= 2;
            return;
        }

        for (i = 0; i < sn; i++) {
            OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
        }
        for (i = 0; i < dn; i++) {
            OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
        }
        return;
    }

    /* cas == 0: nothing to do unless dn > 0 or sn > 1. */
    if (dn <= 0 && sn <= 1) {
        return;
    }

    for (i = 0; i < sn; i++) {
        OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
    }
    for (i = 0; i < dn; i++) {
        OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
    }
}
static void opj_dwt_decode_1(const opj_dwt_t *v)
{
opj_dwt_decode_1_(v->mem, v->mem_count, v->dn, v->sn, v->cas);
}
#endif
#if !defined(STANDARD_SLOW_VERSION)
static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
const OPJ_INT32 sn,
const OPJ_INT32 len,
OPJ_INT32* tiledp)
{ … }
static void opj_idwt53_h_cas1(OPJ_INT32* tmp,
const OPJ_INT32 sn,
const OPJ_INT32 len,
OPJ_INT32* tiledp)
{ … }
#endif
static void opj_idwt53_h(const opj_dwt_t *dwt,
OPJ_INT32* tiledp)
{ … }
#if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
#if __AVX2__
#define VREG …
#define LOAD_CST …
#define LOAD …
#define LOADU …
#define STORE …
#define STOREU …
#define ADD …
#define SUB …
#define SAR …
#else
#define VREG …
#define LOAD_CST …
#define LOAD …
#define LOADU …
#define STORE …
#define STOREU …
#define ADD …
#define SUB …
#define SAR …
#endif
#define ADD3 …
static
void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
const OPJ_INT32* tmp,
OPJ_INT32 len,
OPJ_SIZE_T stride)
{ … }
static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
OPJ_INT32* tmp,
const OPJ_INT32 sn,
const OPJ_INT32 len,
OPJ_INT32* tiledp_col,
const OPJ_SIZE_T stride)
{ … }
static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
OPJ_INT32* tmp,
const OPJ_INT32 sn,
const OPJ_INT32 len,
OPJ_INT32* tiledp_col,
const OPJ_SIZE_T stride)
{ … }
#undef VREG
#undef LOAD_CST
#undef LOADU
#undef LOAD
#undef STORE
#undef STOREU
#undef ADD
#undef ADD3
#undef SUB
#undef SAR
#endif
#if !defined(STANDARD_SLOW_VERSION)
static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
const OPJ_INT32 sn,
const OPJ_INT32 len,
OPJ_INT32* tiledp_col,
const OPJ_SIZE_T stride)
{ … }
static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
const OPJ_INT32 sn,
const OPJ_INT32 len,
OPJ_INT32* tiledp_col,
const OPJ_SIZE_T stride)
{ … }
#endif
static void opj_idwt53_v(const opj_dwt_t *dwt,
OPJ_INT32* tiledp_col,
OPJ_SIZE_T stride,
OPJ_INT32 nb_cols)
{ … }
#if 0
/**
 * Scale every second element of fw by c, starting with fw[0].
 *
 * This definition sits in the "#if 0" branch, so it is currently compiled
 * out in favour of the "#else" alternative.
 *
 * @param fw   first element to scale; elements fw[0], fw[2], ... are updated
 * @param end  number of elements to scale
 * @param c    multiplicative factor
 */
static void opj_dwt_encode_step1(OPJ_FLOAT32* fw,
                                 OPJ_UINT32 end,
                                 const OPJ_FLOAT32 c)
{
    OPJ_FLOAT32* cursor = fw;
    OPJ_UINT32 remaining;

    for (remaining = end; remaining > 0; --remaining) {
        *cursor *= c;
        cursor += 2;
    }
}
#else
static void opj_dwt_encode_step1_combined(OPJ_FLOAT32* fw,
OPJ_UINT32 iters_c1,
OPJ_UINT32 iters_c2,
const OPJ_FLOAT32 c1,
const OPJ_FLOAT32 c2)
{ … }
#endif
static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
OPJ_UINT32 end,
OPJ_UINT32 m,
OPJ_FLOAT32 c)
{ … }
static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas)
{ … }
static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
opj_stepsize_t *bandno_stepsize)
{ … }
static
void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn,
void* tmpIn,
OPJ_UINT32 width,
OPJ_BOOL even)
{ … }
static
void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn,
void* tmpIn,
OPJ_UINT32 width,
OPJ_BOOL even)
{ … }
opj_dwt_encode_h_job_t;
static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
{ … }
opj_dwt_encode_v_job_t;
static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
{ … }
static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn,
void *tmpOut,
OPJ_UINT32 height,
OPJ_UINT32 stride_width,
OPJ_UINT32 cols)
{ … }
static INLINE void opj_dwt_deinterleave_v_cols(
const OPJ_INT32 * OPJ_RESTRICT src,
OPJ_INT32 * OPJ_RESTRICT dst,
OPJ_INT32 dn,
OPJ_INT32 sn,
OPJ_UINT32 stride_width,
OPJ_INT32 cas,
OPJ_UINT32 cols)
{ … }
static void opj_dwt_encode_and_deinterleave_v(
void *arrayIn,
void *tmpIn,
OPJ_UINT32 height,
OPJ_BOOL even,
OPJ_UINT32 stride_width,
OPJ_UINT32 cols)
{ … }
static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw,
OPJ_UINT32 end,
const OPJ_FLOAT32 cst)
{ … }
static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
OPJ_UINT32 end,
OPJ_UINT32 m,
OPJ_FLOAT32 cst)
{ … }
static void opj_dwt_encode_and_deinterleave_v_real(
void *arrayIn,
void *tmpIn,
OPJ_UINT32 height,
OPJ_BOOL even,
OPJ_UINT32 stride_width,
OPJ_UINT32 cols)
{ … }
static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
opj_tcd_tilecomp_t * tilec,
opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
opj_encode_and_deinterleave_h_one_row_fnptr_type
p_encode_and_deinterleave_h_one_row)
{ … }
OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd,
opj_tcd_tilecomp_t * tilec)
{ … }
OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec,
OPJ_UINT32 numres)
{ … }
OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
{ … }
OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd,
opj_tcd_tilecomp_t * tilec)
{ … }
OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
{ … }
void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
{ … }
static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
OPJ_UINT32 i)
{ … }
opj_dwt_decode_h_job_t;
static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
{ … }
opj_dwt_decode_v_job_t;
static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
{ … }
static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
{ … }
static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
OPJ_INT32 cas,
opj_sparse_array_int32_t* sa,
OPJ_UINT32 sa_line,
OPJ_UINT32 sn,
OPJ_UINT32 win_l_x0,
OPJ_UINT32 win_l_x1,
OPJ_UINT32 win_h_x0,
OPJ_UINT32 win_h_x1)
{ … }
static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
OPJ_INT32 cas,
opj_sparse_array_int32_t* sa,
OPJ_UINT32 sa_col,
OPJ_UINT32 nb_cols,
OPJ_UINT32 sn,
OPJ_UINT32 win_l_y0,
OPJ_UINT32 win_l_y1,
OPJ_UINT32 win_h_y0,
OPJ_UINT32 win_h_y1)
{ … }
static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_SIZE_T a_count,
OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas,
OPJ_INT32 win_l_x0,
OPJ_INT32 win_l_x1,
OPJ_INT32 win_h_x0,
OPJ_INT32 win_h_x1)
{ … }
#define OPJ_S_off(i,off) …
#define OPJ_D_off(i,off) …
#define OPJ_S__off(i,off) …
#define OPJ_D__off(i,off) …
#define OPJ_SS__off(i,off) …
#define OPJ_DD__off(i,off) …
static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
OPJ_UINT32 nb_cols,
OPJ_INT32 dn, OPJ_INT32 sn,
OPJ_INT32 cas,
OPJ_INT32 win_l_x0,
OPJ_INT32 win_l_x1,
OPJ_INT32 win_h_x0,
OPJ_INT32 win_h_x1)
{ … }
static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
OPJ_UINT32 resno,
OPJ_UINT32 bandno,
OPJ_UINT32 tcx0,
OPJ_UINT32 tcy0,
OPJ_UINT32 tcx1,
OPJ_UINT32 tcy1,
OPJ_UINT32* tbx0,
OPJ_UINT32* tby0,
OPJ_UINT32* tbx1,
OPJ_UINT32* tby1)
{ … }
static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
OPJ_UINT32 max_size,
OPJ_UINT32* start,
OPJ_UINT32* end)
{ … }
static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
opj_tcd_tilecomp_t* tilec,
OPJ_UINT32 numres)
{ … }
static OPJ_BOOL opj_dwt_decode_partial_tile(
opj_tcd_tilecomp_t* tilec,
OPJ_UINT32 numres)
{ … }
static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt,
OPJ_FLOAT32* OPJ_RESTRICT a,
OPJ_UINT32 width,
OPJ_UINT32 remaining_height)
{ … }
static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt,
opj_sparse_array_int32_t* sa,
OPJ_UINT32 sa_line,
OPJ_UINT32 remaining_height)
{ … }
static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
OPJ_FLOAT32* OPJ_RESTRICT a,
OPJ_UINT32 width,
OPJ_UINT32 nb_elts_read)
{ … }
static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
opj_sparse_array_int32_t* sa,
OPJ_UINT32 sa_col,
OPJ_UINT32 nb_elts_read)
{ … }
#ifdef __SSE__
static void opj_v8dwt_decode_step1_sse(opj_v8_t* w,
OPJ_UINT32 start,
OPJ_UINT32 end,
const __m128 c)
{ … }
static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
OPJ_UINT32 start,
OPJ_UINT32 end,
OPJ_UINT32 m,
__m128 c)
{ … }
#else
/**
 * Scalar (non-SSE) scaling step: for each i in [start, end), multiply the
 * 8 consecutive floats beginning at float index i * 2 * 8 by c.
 *
 * The group width is hard-coded to 8 here; it has to be revisited if
 * NB_ELTS_V8 is ever changed.
 *
 * @param w      buffer to scale, accessed as a flat array of OPJ_FLOAT32
 * @param start  first group index (inclusive)
 * @param end    last group index (exclusive)
 * @param c      multiplicative factor
 */
static void opj_v8dwt_decode_step1(opj_v8_t* w,
                                   OPJ_UINT32 start,
                                   OPJ_UINT32 end,
                                   const OPJ_FLOAT32 c)
{
    OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
    OPJ_UINT32 i;

    for (i = start; i < end; ++i) {
        /* Same unsigned index arithmetic as the unrolled form. */
        OPJ_FLOAT32* const lanes = fw + i * 2 * 8;
        OPJ_UINT32 k;

        for (k = 0; k < 8; ++k) {
            lanes[k] = lanes[k] * c;
        }
    }
}
/**
 * Scalar (non-SSE) lifting step over groups of 8 floats.
 *
 * With fl and fw viewed as flat float arrays, each main-loop iteration
 * updates the 8 floats just before fw:
 *     fw[k - 8] = fw[k - 8] + (fl[k] + fw[k]) * c,   k = 0..7
 * then sets fl to fw and advances fw by 2 * NB_ELTS_V8 floats. The main loop
 * runs for i in [start, min(end, m)).
 *
 * When m < end (asserted to mean m + 1 == end) one extra group is updated
 * from fl alone, using the doubled coefficient c + c.
 *
 * The lane count 8 is hard-coded; it has to be revisited if NB_ELTS_V8 is
 * ever changed.
 *
 * @param l      initial "left" operand; used as given only when start == 0
 * @param w      base of the groups being updated
 * @param start  first iteration index; when > 0, fw is moved forward by
 *               2 * NB_ELTS_V8 * start floats and fl is placed one step
 *               (2 * NB_ELTS_V8 floats) before it
 * @param end    iteration bound
 * @param m      iteration bound for the two-operand form
 * @param c      lifting coefficient
 */
static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w,
                                   OPJ_UINT32 start,
                                   OPJ_UINT32 end,
                                   OPJ_UINT32 m,
                                   OPJ_FLOAT32 c)
{
    OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
    OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
    const OPJ_UINT32 imax = opj_uint_min(end, m);
    OPJ_UINT32 i;
    OPJ_INT32 k;

    if (start > 0) {
        fw += 2 * NB_ELTS_V8 * start;
        fl = fw - 2 * NB_ELTS_V8;
    }

    for (i = start; i < imax; ++i) {
        /* Lanes are processed in order 0..7, as in the unrolled form. */
        for (k = 0; k < 8; ++k) {
            fw[k - 8] = fw[k - 8] + ((fl[k] + fw[k]) * c);
        }
        fl = fw;
        fw += 2 * NB_ELTS_V8;
    }

    if (m < end) {
        assert(m + 1 == end);
        /* Only one neighbour is used for this last group, with weight 2c. */
        c += c;
        for (k = 0; k < 8; ++k) {
            fw[k - 8] = fw[k - 8] + fl[k] * c;
        }
    }
}
#endif
static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
{ … }
opj_dwt97_decode_h_job_t;
static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls)
{ … }
opj_dwt97_decode_v_job_t;
static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls)
{ … }
static
OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp,
opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
OPJ_UINT32 numres)
{ … }
static
OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
OPJ_UINT32 numres)
{ … }
OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
OPJ_UINT32 numres)
{ … }