/******************************************************************** * * * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * * * * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * * * ******************************************************************** function: last mod: $Id$ ********************************************************************/ /*MMX acceleration of Theora's iDCT. Originally written by Rudolf Marek, based on code from On2's VP3.*/ #include "x86int.h" #include "../dct.h" #if defined(OC_X86_ASM) /*These are offsets into the table of constants below.*/ /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/ #define OC_COSINE_OFFSET … /*A row of 8's.*/ #define OC_EIGHT_OFFSET … /*38 cycles. NOTE(review): the macro body is not visible in this view; the 38+N cycle counts quoted for OC_ROW_IDCT and OC_COLUMN_IDCT suggest that both expand this macro - confirm.*/ #define OC_IDCT_BEGIN(_y,_x) … \ /*38+8=46 cycles.*/ #define OC_ROW_IDCT(_y,_x) … \ /*The following macro does two 4x4 transposes in place. At entry, we assume: r0 = a3 a2 a1 a0 I(1) = b3 b2 b1 b0 r2 = c3 c2 c1 c0 r3 = d3 d2 d1 d0 r4 = e3 e2 e1 e0 r5 = f3 f2 f1 f0 r6 = g3 g2 g1 g0 r7 = h3 h2 h1 h0 At exit, we have: I(0) = d0 c0 b0 a0 I(1) = d1 c1 b1 a1 I(2) = d2 c2 b2 a2 I(3) = d3 c3 b3 a3 J(4) = h0 g0 f0 e0 J(5) = h1 g1 f1 e1 J(6) = h2 g2 f2 e2 J(7) = h3 g3 f3 e3 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. 
Since r1 is free at entry, we calculate the Js first.*/ /*19 cycles.*/ #define OC_TRANSPOSE(_y) … \ /*38+19=57 cycles.*/ #define OC_COLUMN_IDCT(_y) … \ /*MMX 8x8 iDCT helper taking two 64-entry ogg_int16_t blocks. NOTE(review): the body is not visible in this view; presumably this is the general path that handles all 64 coefficients, with _x as the input and _y as the output - confirm against the body and against the dispatch in oc_idct8x8_mmx.*/ static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ … } /*25 cycles.*/ #define OC_IDCT_BEGIN_10(_y,_x) … \ /*25+8=33 cycles.*/ #define OC_ROW_IDCT_10(_y,_x) … \ /*25+19=44 cycles.*/ #define OC_COLUMN_IDCT_10(_y) … \ /*MMX 8x8 iDCT helper built on the cheaper _10 macro variants above. NOTE(review): the body is not visible in this view; presumably a reduced path for blocks with only a few leading non-zero coefficients, with _x as the input and _y as the output - confirm against the body and against the dispatch in oc_idct8x8_mmx.*/ static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ … } /*Performs an inverse 8x8 Type-II DCT transform. The input is assumed to be scaled by a factor of 4 relative to the orthonormal version of the transform. _y: A 64-entry ogg_int16_t block (presumably the output - TODO confirm). _x: A 64-entry ogg_int16_t block (presumably the input coefficients - TODO confirm; whether it is modified cannot be told from this view). _last_zzi: NOTE(review): looks like a zig-zag index bounding the non-zero coefficients, presumably used to choose between oc_idct8x8_10_mmx and oc_idct8x8_slow_mmx - confirm against the body.*/ void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ … } #endif