#include "Dither.hpp" #include "ForceInline.hpp" #include "ProcessDxtc.hpp" #include <assert.h> #include <stdint.h> #include <string.h> #ifdef __ARM_NEON # include <arm_neon.h> #endif #if defined __AVX__ && !defined __SSE4_1__ #define __SSE4_1__ #endif #if defined __SSE4_1__ || defined __AVX2__ # ifdef _MSC_VER # include <intrin.h> # else # include <x86intrin.h> # ifndef _mm256_cvtsi256_si32 #define _mm256_cvtsi256_si32 … # endif # endif #endif static etcpak_force_inline uint16_t to565( uint8_t r, uint8_t g, uint8_t b ) { … } static etcpak_force_inline uint16_t to565( uint32_t c ) { … } static const uint8_t DxtcIndexTable[256] = …; static const uint8_t AlphaIndexTable_SSE[64] = …; static const uint16_t DivTable[255*3+1] = …; static const uint16_t DivTableNEON[255*3+1] = …; static const uint16_t DivTableAlpha[256] = …; static etcpak_force_inline uint64_t ProcessRGB( const uint8_t* src ) { … } #ifdef __AVX2__ static etcpak_force_inline void ProcessRGB_AVX( const uint8_t* src, char*& dst ) { __m256i px0 = _mm256_loadu_si256(((__m256i*)src) + 0); __m256i px1 = _mm256_loadu_si256(((__m256i*)src) + 1); __m256i px2 = _mm256_loadu_si256(((__m256i*)src) + 2); __m256i px3 = _mm256_loadu_si256(((__m256i*)src) + 3); __m256i smask = _mm256_set1_epi32( 0xF8FCF8 ); __m256i sd0 = _mm256_and_si256( px0, smask ); __m256i sd1 = _mm256_and_si256( px1, smask ); __m256i sd2 = _mm256_and_si256( px2, smask ); __m256i sd3 = _mm256_and_si256( px3, smask ); __m256i sc = _mm256_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0)); __m256i sc0 = _mm256_cmpeq_epi8(sd0, sc); __m256i sc1 = _mm256_cmpeq_epi8(sd1, sc); __m256i sc2 = _mm256_cmpeq_epi8(sd2, sc); __m256i sc3 = _mm256_cmpeq_epi8(sd3, sc); __m256i sm0 = _mm256_and_si256(sc0, sc1); __m256i sm1 = _mm256_and_si256(sc2, sc3); __m256i sm = _mm256_and_si256(sm0, sm1); const int64_t solid0 = 1 - _mm_testc_si128( _mm256_castsi256_si128( sm ), _mm_set1_epi32( -1 ) ); const int64_t solid1 = 1 - _mm_testc_si128( _mm256_extracti128_si256( sm, 1 ), _mm_set1_epi32( -1 ) ); if( solid0 + solid1 == 0 ) { const auto c0 = uint64_t( to565( src[0], src[1], src[2] ) ); const auto c1 = uint64_t( to565( src[16], src[17], src[18] ) ); memcpy( dst, &c0, 8 ); memcpy( dst+8, &c1, 8 ); dst += 16; return; } __m256i min0 = _mm256_min_epu8( px0, px1 ); __m256i min1 = _mm256_min_epu8( px2, px3 ); __m256i min2 = _mm256_min_epu8( min0, min1 ); __m256i max0 = _mm256_max_epu8( px0, px1 ); __m256i max1 = _mm256_max_epu8( px2, px3 ); __m256i max2 = _mm256_max_epu8( max0, max1 ); __m256i min3 = _mm256_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); __m256i max3 = _mm256_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) ); __m256i min4 = _mm256_min_epu8( min2, min3 ); __m256i max4 = _mm256_max_epu8( max2, max3 ); __m256i min5 = _mm256_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); __m256i max5 = _mm256_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) ); __m256i rmin = _mm256_min_epu8( min4, min5 ); __m256i rmax = _mm256_max_epu8( max4, max5 ); __m256i range1 = _mm256_subs_epu8( rmax, rmin ); __m256i range2 = _mm256_sad_epu8( rmax, rmin ); uint16_t vrange0 = DivTable[_mm256_cvtsi256_si32( range2 ) >> 1]; uint16_t vrange1 = DivTable[_mm256_extract_epi16( range2, 8 ) >> 1]; __m256i range00 = _mm256_set1_epi16( vrange0 ); __m256i range = _mm256_inserti128_si256( range00, _mm_set1_epi16( vrange1 ), 1 ); __m256i inset1 = _mm256_srli_epi16( range1, 4 ); __m256i inset = _mm256_and_si256( inset1, _mm256_set1_epi8( 0xF ) ); __m256i min = _mm256_adds_epu8( rmin, inset ); __m256i max = _mm256_subs_epu8( rmax, inset ); 
    __m256i c0 = _mm256_subs_epu8( px0, rmin );
    __m256i c1 = _mm256_subs_epu8( px1, rmin );
    __m256i c2 = _mm256_subs_epu8( px2, rmin );
    __m256i c3 = _mm256_subs_epu8( px3, rmin );

    __m256i is0 = _mm256_maddubs_epi16( c0, _mm256_set1_epi8( 1 ) );
    __m256i is1 = _mm256_maddubs_epi16( c1, _mm256_set1_epi8( 1 ) );
    __m256i is2 = _mm256_maddubs_epi16( c2, _mm256_set1_epi8( 1 ) );
    __m256i is3 = _mm256_maddubs_epi16( c3, _mm256_set1_epi8( 1 ) );

    __m256i s0 = _mm256_hadd_epi16( is0, is1 );
    __m256i s1 = _mm256_hadd_epi16( is2, is3 );

    __m256i m0 = _mm256_mulhi_epu16( s0, range );
    __m256i m1 = _mm256_mulhi_epu16( s1, range );

    __m256i p0 = _mm256_packus_epi16( m0, m1 );

    // Pack four 2-bit selectors into each byte, then gather one packed byte
    // per group of four pixels.
    __m256i p1 = _mm256_or_si256( _mm256_srai_epi32( p0, 6 ), _mm256_srai_epi32( p0, 12 ) );
    __m256i p2 = _mm256_or_si256( _mm256_srai_epi32( p0, 18 ), p0 );
    __m256i p3 = _mm256_or_si256( p1, p2 );
    __m256i p = _mm256_shuffle_epi8( p3, _mm256_set1_epi32( 0x0C080400 ) );

    // Convert the inset endpoints to 5:6:5; max becomes color0, min color1.
    __m256i mm0 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), min );
    __m256i mm1 = _mm256_unpacklo_epi8( _mm256_setzero_si256(), max );
    __m256i mm2 = _mm256_unpacklo_epi64( mm1, mm0 );
    __m256i mmr = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 11 ), 11 );
    __m256i mmg = _mm256_slli_epi64( _mm256_srli_epi64( mm2, 26 ), 5 );
    __m256i mmb = _mm256_srli_epi64( _mm256_slli_epi64( mm2, 16 ), 59 );
    __m256i mm3 = _mm256_or_si256( mmr, mmg );
    __m256i mm4 = _mm256_or_si256( mm3, mmb );
    __m256i mm5 = _mm256_shuffle_epi8( mm4, _mm256_set1_epi32( 0x09080100 ) );

    // Interleave colors and selectors, then collapse the two blocks into the
    // low 128 bits.
    __m256i d0 = _mm256_unpacklo_epi32( mm5, p );
    __m256i d1 = _mm256_permute4x64_epi64( d0, _MM_SHUFFLE( 3, 2, 2, 0 ) );
    __m128i d2 = _mm256_castsi256_si128( d1 );

    // For a solid block, -solidN is 0 and the mask keeps only the color1
    // field, zeroing color0 and the selector bytes.
    __m128i mask = _mm_set_epi64x( 0xFFFF0000 | -solid1, 0xFFFF0000 | -solid0 );
    __m128i d3 = _mm_and_si128( d2, mask );
    _mm_storeu_si128( (__m128i*)dst, d3 );

    // Remap the linear selectors to the DXT1 index encoding.
    for( int j=4; j<8; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]];
    for( int j=12; j<16; j++ ) dst[j] = (char)DxtcIndexTable[(uint8_t)dst[j]];

    dst += 16;
}
#endif

static const uint8_t AlphaIndexTable[8] = …;

static etcpak_force_inline uint64_t ProcessAlpha( const uint8_t* src )
{
    …
}

#ifdef __SSE4_1__
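// ProcessRGB_SSE below is the single-block variant of the AVX2 kernel above:
// the same solid-color early out, min/max bounding box, 1/16 inset and
// reciprocal-multiply selector binning, for one 4x4 block per call. It
// returns the two 5:6:5 endpoints in the low 32 bits and the sixteen packed
// 2-bit selectors in the high 32 bits. Note that the selectors are still in
// linear order (0 = closest to the box minimum); the DxtcIndexTable remap is
// not applied here and is left to the callers (the Compress* bodies are
// elided in this excerpt).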
static etcpak_force_inline uint64_t ProcessRGB_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
{
    // Solid-color detection at 5:6:5 precision, as in the AVX2 path.
    __m128i smask = _mm_set1_epi32( 0xF8FCF8 );
    __m128i sd0 = _mm_and_si128( px0, smask );
    __m128i sd1 = _mm_and_si128( px1, smask );
    __m128i sd2 = _mm_and_si128( px2, smask );
    __m128i sd3 = _mm_and_si128( px3, smask );

    __m128i sc = _mm_shuffle_epi32(sd0, _MM_SHUFFLE(0, 0, 0, 0));

    __m128i sc0 = _mm_cmpeq_epi8(sd0, sc);
    __m128i sc1 = _mm_cmpeq_epi8(sd1, sc);
    __m128i sc2 = _mm_cmpeq_epi8(sd2, sc);
    __m128i sc3 = _mm_cmpeq_epi8(sd3, sc);

    __m128i sm0 = _mm_and_si128(sc0, sc1);
    __m128i sm1 = _mm_and_si128(sc2, sc3);
    __m128i sm = _mm_and_si128(sm0, sm1);

    if( _mm_testc_si128(sm, _mm_set1_epi32(-1)) )
    {
        return uint64_t( to565( _mm_cvtsi128_si32( px0 ) ) ) << 16;
    }

    // Clear the alpha byte so it cannot affect the min/max reduction.
    px0 = _mm_and_si128( px0, _mm_set1_epi32( 0xFFFFFF ) );
    px1 = _mm_and_si128( px1, _mm_set1_epi32( 0xFFFFFF ) );
    px2 = _mm_and_si128( px2, _mm_set1_epi32( 0xFFFFFF ) );
    px3 = _mm_and_si128( px3, _mm_set1_epi32( 0xFFFFFF ) );

    __m128i min0 = _mm_min_epu8( px0, px1 );
    __m128i min1 = _mm_min_epu8( px2, px3 );
    __m128i min2 = _mm_min_epu8( min0, min1 );

    __m128i max0 = _mm_max_epu8( px0, px1 );
    __m128i max1 = _mm_max_epu8( px2, px3 );
    __m128i max2 = _mm_max_epu8( max0, max1 );

    __m128i min3 = _mm_shuffle_epi32( min2, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max3 = _mm_shuffle_epi32( max2, _MM_SHUFFLE( 2, 3, 0, 1 ) );

    __m128i min4 = _mm_min_epu8( min2, min3 );
    __m128i max4 = _mm_max_epu8( max2, max3 );

    __m128i min5 = _mm_shuffle_epi32( min4, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max5 = _mm_shuffle_epi32( max4, _MM_SHUFFLE( 0, 0, 2, 2 ) );

    __m128i rmin = _mm_min_epu8( min4, min5 );
    __m128i rmax = _mm_max_epu8( max4, max5 );

    __m128i range1 = _mm_subs_epu8( rmax, rmin );
    __m128i range2 = _mm_sad_epu8( rmax, rmin );

    uint32_t vrange = _mm_cvtsi128_si32( range2 ) >> 1;
    __m128i range = _mm_set1_epi16( DivTable[vrange] );

    // Inset the endpoints by 1/16 of the range.
    __m128i inset1 = _mm_srli_epi16( range1, 4 );
    __m128i inset = _mm_and_si128( inset1, _mm_set1_epi8( 0xF ) );
    __m128i min = _mm_adds_epu8( rmin, inset );
    __m128i max = _mm_subs_epu8( rmax, inset );

    // Sum each pixel's distance from the box minimum across channels and
    // scale by the reciprocal range to get a 2-bit selector per pixel.
    __m128i c0 = _mm_subs_epu8( px0, rmin );
    __m128i c1 = _mm_subs_epu8( px1, rmin );
    __m128i c2 = _mm_subs_epu8( px2, rmin );
    __m128i c3 = _mm_subs_epu8( px3, rmin );

    __m128i is0 = _mm_maddubs_epi16( c0, _mm_set1_epi8( 1 ) );
    __m128i is1 = _mm_maddubs_epi16( c1, _mm_set1_epi8( 1 ) );
    __m128i is2 = _mm_maddubs_epi16( c2, _mm_set1_epi8( 1 ) );
    __m128i is3 = _mm_maddubs_epi16( c3, _mm_set1_epi8( 1 ) );

    __m128i s0 = _mm_hadd_epi16( is0, is1 );
    __m128i s1 = _mm_hadd_epi16( is2, is3 );

    __m128i m0 = _mm_mulhi_epu16( s0, range );
    __m128i m1 = _mm_mulhi_epu16( s1, range );

    __m128i p0 = _mm_packus_epi16( m0, m1 );

    // Pack four 2-bit selectors per byte, one byte per group of four pixels.
    __m128i p1 = _mm_or_si128( _mm_srai_epi32( p0, 6 ), _mm_srai_epi32( p0, 12 ) );
    __m128i p2 = _mm_or_si128( _mm_srai_epi32( p0, 18 ), p0 );
    __m128i p3 = _mm_or_si128( p1, p2 );
    __m128i p = _mm_shuffle_epi8( p3, _mm_set1_epi32( 0x0C080400 ) );

    uint32_t vmin = _mm_cvtsi128_si32( min );
    uint32_t vmax = _mm_cvtsi128_si32( max );
    uint32_t vp = _mm_cvtsi128_si32( p );

    return uint64_t( ( uint64_t( to565( vmin ) ) << 16 ) | to565( vmax ) | ( uint64_t( vp ) << 32 ) );
}

// Compresses a single channel (alpha for DXT5/BC3, R or G for BC4/BC5) from
// the 16 bytes of `a` into one 64-bit alpha block.
static etcpak_force_inline uint64_t ProcessOneChannel_SSE( __m128i a )
{
    // Solid block: return the value as alpha0 with all indices zero
    // (index 0 selects alpha0).
    __m128i solidCmp = _mm_shuffle_epi8( a, _mm_setzero_si128() );
    __m128i cmpRes = _mm_cmpeq_epi8( a, solidCmp );
    if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
    {
        return _mm_cvtsi128_si32( a ) & 0xFF;
    }

    // Horizontal min/max reduction over all 16 bytes.
    __m128i a1 = _mm_shuffle_epi32( a, _MM_SHUFFLE( 2, 3, 0, 1 ) );
    __m128i max1 = _mm_max_epu8( a, a1 );
    __m128i min1 = _mm_min_epu8( a, a1 );
    __m128i amax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i amin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
    __m128i max2 = _mm_max_epu8( max1, amax2 );
    __m128i min2 = _mm_min_epu8( min1, amin2 );
    __m128i amax3 = _mm_alignr_epi8( max2, max2, 2 );
    __m128i amin3 = _mm_alignr_epi8( min2, min2, 2 );
    __m128i max3 = _mm_max_epu8( max2, amax3 );
    __m128i min3 = _mm_min_epu8( min2, amin3 );
    __m128i amax4 = _mm_alignr_epi8( max3, max3, 1 );
    __m128i amin4 = _mm_alignr_epi8( min3, min3, 1 );
    __m128i max = _mm_max_epu8( max3, amax4 );
    __m128i min = _mm_min_epu8( min3, amin4 );

    // alpha0 = max, alpha1 = min, selecting the eight-value interpolation mode.
    __m128i minmax = _mm_unpacklo_epi8( max, min );

    // Scale each byte's distance from the minimum by the reciprocal of the
    // range, giving a 3-bit selector per pixel.
    __m128i r = _mm_sub_epi8( max, min );
    int range = _mm_cvtsi128_si32( r ) & 0xFF;
    __m128i rv = _mm_set1_epi16( DivTableAlpha[range] );
    __m128i v = _mm_sub_epi8( a, min );
    __m128i lo16 = _mm_unpacklo_epi8( v, _mm_setzero_si128() );
    __m128i hi16 = _mm_unpackhi_epi8( v, _mm_setzero_si128() );
    __m128i lomul = _mm_mulhi_epu16( lo16, rv );
    __m128i himul = _mm_mulhi_epu16( hi16, rv );
    __m128i p0 = _mm_packus_epi16( lomul, himul );

    // Fold pairs of 3-bit selectors into 6-bit keys, then translate each key
    // through AlphaIndexTable_SSE into two BC-ordered indices.
    __m128i p1 = _mm_or_si128( _mm_and_si128( p0, _mm_set1_epi16( 0x3F ) ), _mm_srai_epi16( _mm_and_si128( p0, _mm_set1_epi16( 0x3F00 ) ), 5 ) );
    __m128i p2 = _mm_packus_epi16( p1, p1 );
    uint64_t pi = _mm_cvtsi128_si64( p2 );

    uint64_t data = 0;
    for( int i=0; i<8; i++ )
    {
        uint64_t idx = AlphaIndexTable_SSE[(pi>>(i*8)) & 0x3F];
        data |= idx << (i*6);
    }
    return (uint64_t)(uint16_t)_mm_cvtsi128_si32( minmax ) | ( data << 16 );
}
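// ProcessAlpha_SSE feeds one 4x4 block of RGBA pixels to the single-channel
// coder above: the 0x0f0b0703 shuffle key extracts bytes 3, 7, 11 and 15
// (the alpha of each pixel) from a row register, the rotated copies of the
// key land each row's four alphas in a different dword (out-of-range lanes
// shuffle to zero), and the four partial results are OR-ed into one
// 16-byte vector.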
static etcpak_force_inline uint64_t ProcessAlpha_SSE( __m128i px0, __m128i px1, __m128i px2, __m128i px3 )
{
    __m128i mask = _mm_setr_epi32( 0x0f0b0703, -1, -1, -1 );

    __m128i m0 = _mm_shuffle_epi8( px0, mask );
    __m128i m1 = _mm_shuffle_epi8( px1, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 3, 0, 3 ) ) );
    __m128i m2 = _mm_shuffle_epi8( px2, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 3, 0, 3, 3 ) ) );
    __m128i m3 = _mm_shuffle_epi8( px3, _mm_shuffle_epi32( mask, _MM_SHUFFLE( 0, 3, 3, 3 ) ) );
    __m128i m4 = _mm_or_si128( m0, m1 );
    __m128i m5 = _mm_or_si128( m2, m3 );
    __m128i a = _mm_or_si128( m4, m5 );

    return ProcessOneChannel_SSE( a );
}
#endif

void CompressDxt1( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    …
}

void CompressDxt1Dither( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    …
}

void CompressDxt5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    …
}

void CompressBc4( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    …
}

void CompressBc5( const uint32_t* src, uint64_t* dst, uint32_t blocks, size_t width )
{
    …
}
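// Minimal usage sketch (illustrative only, not part of this file). Assuming
// `rgba` points at a 4x4-block-aligned RGBA image and `blocks` counts 4x4
// blocks, a DXT1/BC1 block is 8 bytes, i.e. one uint64_t per block:
//
//   std::vector<uint64_t> out( ( width / 4 ) * ( height / 4 ) );
//   CompressDxt1( rgba, out.data(), uint32_t( out.size() ), width );
//
// DXT5/BC3 and BC5 blocks are 16 bytes, so CompressDxt5 and CompressBc5
// need two uint64_t entries per block in dst.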