diff --git a/transpose.c b/transpose.c index 446566e..8ad1f0a 100644 --- a/transpose.c +++ b/transpose.c @@ -1,6 +1,6 @@ /** Copyright (C) powturbo 2013-2023 - GPL v2 License + SPDX-License-Identifier: GPL v2 License This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -60,36 +60,52 @@ #define powof2(n) !((n)&((n)-1)) -//------------ Scalar ------------------- -#define TPENC tpenc -#define TPDEC tpdec -#define TPZENC tpzenc -#define TPZDEC tpzdec -#define TPXENC tpxenc -#define TPXDEC tpxdec - +//-- 24 bits / 3 bytes (scalar only) ---------------------- #define ESIZE 3 #define STRIDE ESIZE -#include "transpose.c" -#undef ESIZE +#define TPENC tpenc +#define TPDEC tpdec +#include "transpose.c" + +#define TPENC tpzenc +#define TPDEC tpzdec +#include "transpose.c" + +#define TPENC tpxenc +#define TPDEC tpxdec +#include "transpose.c" + +//-- 128 bits / 16 bytes (scalar only) -------------------- #define ESIZE 16 #define STRIDE ESIZE -#include "transpose.c" -#undef ESIZE -//----------------- SIMD -------------------------------------------------------------------------------------------- +#define TPENC tpenc +#define TPDEC tpdec +#include "transpose.c" + +#define TPENC tpzenc +#define TPDEC tpzdec +#include "transpose.c" + +#define TPENC tpxenc +#define TPDEC tpxdec +#include "transpose.c" + +//----------------------------------------------------------- #define LD128(_ip_) _mm_loadu_si128( (__m128i *)(_ip_)) #define ST128(_op_,_v_) _mm_storeu_si128((__m128i *)(_op_),_v_) #define LD256(ip) _mm256_loadu_si256(ip) #define ST256(op,v) _mm256_storeu_si256(op,v) -//-------------------------------- 16 bits ------------------------------------------------- +//---------------------------------------------- 16 bits ------------------------------------------------- #define ESIZE 2 #define USIZE 16 -//------------ byte --------- +//-- byte ------------------------ #define STRIDE ESIZE +#define TPENC tpenc +#define TPDEC tpdec #define TPENC128V tpenc128v #define TPDEC128V tpdec128v #define TPENC256V tpenc256v @@ -104,25 +120,27 @@ #define VD128(_ov_,_sv_) #define VD256(_ov_,_sv_) #include "transpose.c" -#undef TPENC256V +#undef TPENC256V #undef TPDEC256V //-------------- #define ISDELTA //-- zigzag +#define TPENC tpzenc +#define TPDEC tpzdec #define TPENC128V tpzenc128v #define TPDEC128V tpzdec128v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_delta_epi16(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm_zzage_epi16(_tv); } #define VDINI128 __m128i sv = _mm_setzero_si128() #define VD128(_v_,_sv_) _v_ = mm_zzagd_epi16( _v_); _sv_ = mm_scan_epi16(_v_,_sv_); _v_ = _sv_ #include "transpose.c" //-- xor +#define TPENC tpxenc +#define TPDEC tpxdec #define TPENC128V tpxenc128v #define TPDEC128V tpxdec128v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_xore_epi16(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; } #define VDINI128 __m128i sv = _mm_setzero_si128() @@ -130,9 +148,10 @@ #include "transpose.c" //-- #undef ISDELTA -#undef STRIDE //----------- nibble --------- #define STRIDE 4 +#define TPENC tp4enc +#define TPDEC tp4dec #define TPENC128V tp4enc128v #define TPDEC128V tp4dec128v #define TPENC256V tp4enc256v @@ -153,20 +172,22 @@ #define ISDELTA //-- zigzag +#define TPENC tp4zenc +#define TPDEC tp4zdec #define TPENC128V tp4zenc128v #define TPDEC128V tp4zdec128v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_delta_epi16(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm_zzage_epi16(_tv); } #define VDINI128 __m128i sv = _mm_setzero_si128() #define VD128(_v_,_sv_) _v_ = mm_zzagd_epi16( _v_); _sv_ = mm_scan_epi16(_v_,_sv_); _v_ = _sv_ #include "transpose.c" //-- xor +#define TPENC tp4xenc +#define TPDEC tp4xdec #define TPENC128V tp4xenc128v #define TPDEC128V tp4xdec128v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_xore_epi16(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; } #define VDINI128 __m128i sv = _mm_setzero_si128() @@ -174,13 +195,14 @@ #include "transpose.c" //-- #undef ISDELTA -#undef ESIZE -//----------------------------- 32 bits ---------------------------------------------------------- +//------------------------------------------------ 32 bits ---------------------------------------------------------- #define ESIZE 4 #define USIZE 32 #define STRIDE ESIZE //----------------- byte ------------ +#define TPENC tpenc +#define TPDEC tpdec #define TPENC128V tpenc128v #define TPDEC128V tpdec128v #define TPENC256V tpenc256v @@ -194,17 +216,17 @@ #define VDINI256 #define VD128(_ov_,_sv_) #define VD256(_ov_,_sv_) - #include "transpose.c" //--------------------------------- #define ISDELTA //-- zigzag +#define TPENC tpzenc +#define TPDEC tpzdec #define TPENC128V tpzenc128v #define TPDEC128V tpzdec128v #define TPENC256V tpzenc256v #define TPDEC256V tpzdec256v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_delta_epi32(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm_zzage_epi32(_tv); } #define VEINI256 __m256i sv = _mm256_setzero_si256() @@ -215,12 +237,13 @@ #define VD256(_v_,_sv_) _v_ = mm256_zzagd_epi32(_v_); _sv_ = mm256_scan_epi32(_v_,_sv_); _v_ = _sv_ #include "transpose.c" //-- xor +#define TPENC tpxenc +#define TPDEC tpxdec #define TPENC128V tpxenc128v #define TPDEC128V tpxdec128v #define TPENC256V tpxenc256v #define TPDEC256V tpxdec256v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_xore_epi32(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; } #define VEINI256 __m256i sv = _mm256_setzero_si256() @@ -233,9 +256,10 @@ //--------- #undef ISDELTA -#undef STRIDE //----------------- nibble ---------- #define STRIDE 8 +#define TPENC tp4enc +#define TPDEC tp4dec #define TPENC128V tp4enc128v #define TPDEC128V tp4dec128v #define TPENC256V tp4enc256v @@ -253,12 +277,13 @@ //------------- #define ISDELTA //-- zigzag +#define TPENC tp4zenc +#define TPDEC tp4zdec #define TPENC128V tp4zenc128v #define TPDEC128V tp4zdec128v #define TPENC256V tp4zenc256v #define TPDEC256V tp4zdec256v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_delta_epi32(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm_zzage_epi32(_tv); } #define VEINI256 __m256i sv = _mm256_setzero_si256() @@ -269,12 +294,13 @@ #define VD256(_v_,_sv_) _v_ = mm256_zzagd_epi32(_v_); _sv_ = mm256_scan_epi32(_v_,_sv_); _v_ = _sv_ #include "transpose.c" //-- xor +#define TPENC tp4xenc +#define TPDEC tp4xdec #define TPENC128V tp4xenc128v #define TPDEC128V tp4xdec128v #define TPENC256V tp4xenc256v #define TPDEC256V tp4xdec256v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_xore_epi32(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; } #define VEINI256 __m256i sv = _mm256_setzero_si256() @@ -286,15 +312,14 @@ #include "transpose.c" //-------------- #undef ISDELTA -#undef ESIZE -#undef STRIDE - //------------------------------- 64 bits --------------------------------------------------------------- #define ESIZE 8 #define USIZE 64 #define STRIDE ESIZE //--- byte 64 bits ------------ +#define TPENC tpenc +#define TPDEC tpdec #define TPENC128V tpenc128v #define TPDEC128V tpdec128v #define TPENC256V tpenc256v @@ -313,12 +338,13 @@ //--------------- #define ISDELTA //--- zigzag +#define TPENC tpzenc +#define TPDEC tpzdec #define TPENC128V tpzenc128v #define TPDEC128V tpzdec128v #define TPENC256V tpzenc256v #define TPDEC256V tpzdec256v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_delta_epi64(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm_zzage_epi64(_tv); } #define VEINI256 __m256i sv = _mm256_setzero_si256() @@ -329,12 +355,13 @@ #define VD256(_v_,_sv_) _v_ = mm256_zzagd_epi64(_v_); _sv_ = mm256_scan_epi64(_v_,_sv_); _v_ = _sv_ #include "transpose.c" //--- xor +#define TPENC tpxenc +#define TPDEC tpxdec #define TPENC128V tpxenc128v #define TPDEC128V tpxdec128v #define TPENC256V tpxenc256v #define TPDEC256V tpxdec256v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_xore_epi64(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; } #define VEINI256 __m256i sv = _mm256_setzero_si256() @@ -346,9 +373,10 @@ #include "transpose.c" //----------- #undef ISDELTA -#undef STRIDE //---------- nibble ---------- #define STRIDE 16 +#define TPENC tp4enc +#define TPDEC tp4dec #define TPENC128V tp4enc128v #define TPDEC128V tp4dec128v #define TPENC256V tp4enc256v @@ -366,12 +394,13 @@ //------------------------- #define ISDELTA //-- zigzag +#define TPENC tp4zenc +#define TPDEC tp4zdec #define TPENC128V tp4zenc128v #define TPDEC128V tp4zdec128v #define TPENC256V tp4zenc256v #define TPDEC256V tp4zdec256v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_delta_epi64(_iv_,_sv_); _sv_ = _iv_; _iv_ = mm_zzage_epi64(_tv); } #define VEINI256 __m256i sv = _mm256_setzero_si256() @@ -383,12 +412,13 @@ #include "transpose.c" //-- xor +#define TPENC tp4xenc +#define TPDEC tp4xdec #define TPENC128V tp4xenc128v #define TPDEC128V tp4xdec128v #define TPENC256V tp4xenc256v #define TPDEC256V tp4xdec256v -#define VDELTA 0 #define VEINI128 __m128i sv = _mm_setzero_si128() #define VE128(_iv_,_sv_) { __m128i _tv = mm_xore_epi64(_iv_,_sv_); _sv_ = _iv_; _iv_ = _tv; } #define VEINI256 __m256i sv = _mm256_setzero_si256() @@ -398,805 +428,14 @@ #define VDINI256 __m256i sv = _mm256_setzero_si256() #define VD256(_v_,_sv_) _sv_ = _v_ = mm256_xord_epi64(_v_,_sv_) #include "transpose.c" -//-- -#undef ISDELTA -//-------------------------------------------------------------------------------------------------------------------------------------------------- +#else //************************************************************* Templates ******************************************************************************************************** -#if !defined(__AVX2__) && !defined(__SSE3__) //include only 1 time +#define SIE(p,i) (p+=stride) //faster on ARM //#define SIE(_p_,_i_) (_p_+ _i_*stride) +#define SID(p,i) (p+=stride) //#define SID(_p_,_i_) (_p_+ _i_*stride) -static unsigned _cpuisa; -//--------------------- CPU detection ------------------------------------------- - #if defined(__i386__) || defined(__x86_64__) - #if _MSC_VER >=1300 -#include - #elif defined (__INTEL_COMPILER) -#include - #endif - -static inline void cpuid(int reg[4], int id) { - #if defined (_MSC_VER) //|| defined (__INTEL_COMPILER) - __cpuidex(reg, id, 0); - #elif defined(__i386__) || defined(__x86_64__) - __asm("cpuid" : "=a"(reg[0]),"=b"(reg[1]),"=c"(reg[2]),"=d"(reg[3]) : "a"(id),"c"(0) : ); - #endif -} - -static inline uint64_t xgetbv (int ctr) { - #if(defined _MSC_VER && (_MSC_FULL_VER >= 160040219) || defined __INTEL_COMPILER) - return _xgetbv(ctr); - #elif defined(__i386__) || defined(__x86_64__) - unsigned a, d; - __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : ); - return (uint64_t)d << 32 | a; - #else - unsigned a=0, d=0; - return (uint64_t)d << 32 | a; - #endif -} - #endif - -#define AVX512F 0x001 -#define AVX512DQ 0x002 -#define AVX512IFMA 0x004 -#define AVX512PF 0x008 -#define AVX512ER 0x010 -#define AVX512CD 0x020 -#define AVX512BW 0x040 -#define AVX512VL 0x080 -#define AVX512VNNI 0x100 -#define AVX512VBMI 0x200 -#define AVX512VBMI2 0x400 - -#define IS_SSE 0x10 -#define IS_SSE2 0x20 -#define IS_SSE3 0x30 -#define IS_SSSE3 0x32 -#define IS_POWER9 0x34 // powerpc -#define IS_NEON 0x38 // arm neon -#define IS_SSE41 0x40 -#define IS_SSE41x 0x41 //+popcount -#define IS_SSE42 0x42 -#define IS_AVX 0x50 -#define IS_AVX2 0x60 -#define IS_AVX512 0x800 - -unsigned cpuisa(void) { - int c[4] = {0}; - if(_cpuisa) return _cpuisa; - _cpuisa++; - #if defined(__i386__) || defined(__x86_64__) - cpuid(c, 0); - if(c[0]) { - cpuid(c, 1); - //family = ((c >> 8) & 0xf) + ((c >> 20) & 0xff) - //model = ((c >> 4) & 0xf) + ((c >> 12) & 0xf0) - if( c[3] & (1 << 25)) { _cpuisa = IS_SSE; - if( c[3] & (1 << 26)) { _cpuisa = IS_SSE2; - if( c[2] & (1 << 0)) { _cpuisa = IS_SSE3; - // _cpuisa = IS_SSE3SLOW; // Atom SSSE3 slow - if( c[2] & (1 << 9)) { _cpuisa = IS_SSSE3; - if( c[2] & (1 << 19)) { _cpuisa = IS_SSE41; - if( c[2] & (1 << 23)) { _cpuisa = IS_SSE41x; // +popcount - if( c[2] & (1 << 20)) { _cpuisa = IS_SSE42; // SSE4.2 - if((c[2] & (1 << 28)) && - (c[2] & (1 << 27)) && // OSXSAVE - (c[2] & (1 << 26)) && // XSAVE - (xgetbv(0) & 6)==6) { _cpuisa = IS_AVX; // AVX - if(c[2]& (1 << 3)) _cpuisa |= 1; // +FMA3 - if(c[2]& (1 << 16)) _cpuisa |= 2; // +FMA4 - if(c[2]& (1 << 25)) _cpuisa |= 4; // +AES - cpuid(c, 7); - if(c[1] & (1 << 5)) { _cpuisa = IS_AVX2; - if(c[1] & (1 << 16)) { - cpuid(c, 0xd); - if((c[0] & 0x60)==0x60) { _cpuisa = IS_AVX512; - cpuid(c, 7); - if(c[1] & (1<<16)) _cpuisa |= AVX512F; - if(c[1] & (1<<17)) _cpuisa |= AVX512DQ; - if(c[1] & (1<<21)) _cpuisa |= AVX512IFMA; - if(c[1] & (1<<26)) _cpuisa |= AVX512PF; - if(c[1] & (1<<27)) _cpuisa |= AVX512ER; - if(c[1] & (1<<28)) _cpuisa |= AVX512CD; - if(c[1] & (1<<30)) _cpuisa |= AVX512BW; - if(c[1] & (1u<<31)) _cpuisa |= AVX512VL; - if(c[2] & (1<< 1)) _cpuisa |= AVX512VBMI; - if(c[2] & (1<<11)) _cpuisa |= AVX512VNNI; - if(c[2] & (1<< 6)) _cpuisa |= AVX512VBMI2; - }}} - }}}}}}}}} - #elif defined(__powerpc64__) - _cpuisa = IS_POWER9; // power9 - #elif defined(__ARM_NEON) - _cpuisa = IS_NEON; // ARM_NEON - #endif - return _cpuisa; -} - -unsigned cpuini(unsigned cpuisa) { if(cpuisa) _cpuisa = cpuisa; return _cpuisa; } - -char *cpustr(unsigned cpuisa) { - if(!cpuisa) cpuisa = _cpuisa; - #if defined(__i386__) || defined(__x86_64__) - if(cpuisa >= IS_AVX512) { - if(cpuisa & AVX512VBMI2) return "avx512vbmi2"; - if(cpuisa & AVX512VBMI) return "avx512vbmi"; - if(cpuisa & AVX512VNNI) return "avx512vnni"; - if(cpuisa & AVX512VL) return "avx512vl"; - if(cpuisa & AVX512BW) return "avx512bw"; - if(cpuisa & AVX512CD) return "avx512cd"; - if(cpuisa & AVX512ER) return "avx512er"; - if(cpuisa & AVX512PF) return "avx512pf"; - if(cpuisa & AVX512IFMA) return "avx512ifma"; - if(cpuisa & AVX512DQ) return "avx512dq"; - if(cpuisa & AVX512F) return "avx512f"; - return "avx512"; - } - else if(cpuisa >= IS_AVX2) return "avx2"; - else if(cpuisa >= IS_AVX) - switch(cpuisa&0xf) { - case 1: return "avx+fma3"; - case 2: return "avx+fma4"; - case 4: return "avx+aes"; - case 5: return "avx+fma3+aes"; - default:return "avx"; - } - else if(cpuisa >= IS_SSE42) return "sse4.2"; - else if(cpuisa >= IS_SSE41x) return "sse4.1+popcnt"; - else if(cpuisa >= IS_SSE41) return "sse4.1"; - else if(cpuisa >= IS_SSSE3) return "ssse3"; - else if(cpuisa >= IS_SSE3) return "sse3"; - else if(cpuisa >= IS_SSE2) return "sse2"; - else if(cpuisa >= IS_SSE) return "sse"; - #elif defined(__powerpc64__) - if(cpuisa >= IS_POWER9) return "power9"; - #elif defined(__ARM_NEON) - if(cpuisa >= IS_NEON) return "arm_neon"; - #endif - return "none"; -} - -//---------------------------------------------------------------------------------------------------------------------- -typedef void (*TPFUNC)( unsigned char *in, unsigned n, unsigned char *out); - - // 0 1 2 3 4 5 6 7 8 9 16 -static TPFUNC _tpe[] = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // byte -static TPFUNC _tpd[] = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 }; - -static TPFUNC _tp4e[] = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // Nibble -static TPFUNC _tp4d[] = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 }; - -//-- zigzag delta -static TPFUNC _tpze[] = { 0, 0, tpzenc2, tpzenc3, tpzenc4, 0, 0, 0, tpzenc8, 0, 0, 0, 0, 0, 0, 0, tpzenc16 }; // byte -static TPFUNC _tpzd[] = { 0, 0, tpzdec2, tpzdec3, tpzdec4, 0, 0, 0, tpzdec8, 0, 0, 0, 0, 0, 0, 0, tpzdec16 }; - -static TPFUNC _tp4ze[] = { 0, 0, tpzenc2, tpzenc3, tpzenc4, 0, 0, 0, tpzenc8, 0, 0, 0, 0, 0, 0, 0, tpzenc16 }; // Nibble -static TPFUNC _tp4zd[] = { 0, 0, tpzdec2, tpzdec3, tpzdec4, 0, 0, 0, tpzdec8, 0, 0, 0, 0, 0, 0, 0, tpzdec16 }; - -//-- xor -static TPFUNC _tpxe[] = { 0, 0, tpxenc2, tpxenc3, tpxenc4, 0, 0, 0, tpxenc8, 0, 0, 0, 0, 0, 0, 0, tpxenc16 }; // byte -static TPFUNC _tpxd[] = { 0, 0, tpxdec2, tpxdec3, tpxdec4, 0, 0, 0, tpxdec8, 0, 0, 0, 0, 0, 0, 0, tpxdec16 }; - -static TPFUNC _tp4xe[] = { 0, 0, tpxenc2, tpxenc3, tpxenc4, 0, 0, 0, tpxenc8, 0, 0, 0, 0, 0, 0, 0, tpxenc16 }; // Nibble -static TPFUNC _tp4xd[] = { 0, 0, tpxdec2, tpxdec3, tpxdec4, 0, 0, 0, tpxdec8, 0, 0, 0, 0, 0, 0, 0, tpxdec16 }; - -static int tpset; - -void tpini(int id) { - int i; - if(tpset) return; - tpset++; - i = id?id:cpuisa(); - #if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) - if(i >= IS_AVX2) { - _tpe[ 2] = tpenc256v2; _tpd[ 2] = tpdec256v2; _tp4e[ 2] = tp4enc256v2; _tp4d[ 2] = tp4dec256v2; - _tpe[ 4] = tpenc256v4; _tpd[ 4] = tpdec256v4; _tp4e[ 4] = tp4enc256v4; _tp4d[ 4] = tp4dec256v4; - _tpe[ 8] = tpenc256v8; _tpd[ 8] = tpdec256v8; _tp4e[ 8] = tp4enc256v8; _tp4d[ 8] = tp4dec256v8; - - _tpze[2] = tpzenc128v2; _tpzd[2] = tpzdec128v2; _tp4ze[2] = tp4zenc128v2; _tp4zd[2] = tp4zdec128v2; // 16 bits: only sse - _tpze[4] = tpzenc256v4; _tpzd[4] = tpzdec256v4; _tp4ze[4] = tp4zenc256v4; _tp4zd[4] = tp4zdec256v4; - _tpze[8] = tpzenc256v8; _tpzd[8] = tpzdec256v8; _tp4ze[8] = tp4zenc256v8; _tp4zd[8] = tp4zdec256v8; - - _tpxe[2] = tpxenc128v2; _tpxd[2] = tpxdec128v2; _tp4xe[2] = tp4xenc128v2; _tp4xd[2] = tp4xdec128v2; // 16 bits: only sse - _tpxe[4] = tpxenc256v4; _tpxd[4] = tpxdec256v4; _tp4xe[4] = tp4xenc256v4; _tp4xd[4] = tp4xdec256v4; - _tpxe[8] = tpxenc256v8; _tpxd[8] = tpxdec256v8; _tp4xe[8] = tp4xenc256v8; _tp4xd[8] = tp4xdec256v8; - } else - #endif - #if defined(__i386__) || defined(__x86_64__) || defined(__ARM_NEON) || defined(__powerpc64__) || defined(_M_X64) - if(i >= IS_SSE2) { - _tpe[ 2] = tpenc128v2; _tpd[ 2] = tpdec128v2; _tp4e[ 2] = tp4enc128v2; _tp4d[ 2] = tp4dec128v2; - _tpe[ 4] = tpenc128v4; _tpd[ 4] = tpdec128v4; _tp4e[ 4] = tp4enc128v4; _tp4d[ 4] = tp4dec128v4; - _tpe[ 8] = tpenc128v8; _tpd[ 8] = tpdec128v8; _tp4e[ 8] = tp4enc128v8; _tp4d[ 8] = tp4dec128v8; - if(i == 35) _tpd[8] = tpdec8; // ARM NEON scalar is faster!, TODO:retest on Apple M1 - _tpze[2] = tpzenc128v2; _tpzd[2] = tpzdec128v2; _tp4ze[2] = tp4zenc128v2; _tp4zd[2] = tp4zdec128v2; - _tpze[4] = tpzenc128v4; _tpzd[4] = tpzdec128v4; _tp4ze[4] = tp4zenc128v4; _tp4zd[4] = tp4zdec128v4; - _tpze[8] = tpzenc128v8; _tpzd[8] = tpzdec128v8; _tp4ze[8] = tp4zenc128v8; _tp4zd[8] = tp4zdec128v8; - if(i == 35) _tpzd[8] = tpzdec8; - - _tpxe[2] = tpxenc128v2; _tpxd[2] = tpxdec128v2; _tp4xe[2] = tp4xenc128v2; _tp4xd[2] = tp4xdec128v2; - _tpxe[4] = tpxenc128v4; _tpxd[4] = tpxdec128v4; _tp4xe[4] = tp4xenc128v4; _tp4xd[4] = tp4xdec128v4; - _tpxe[8] = tpxenc128v8; _tpxd[8] = tpxdec128v8; _tp4xe[8] = tp4xenc128v8; _tp4xd[8] = tp4xdec128v8; - if(i == 35) _tpxd[8] = tpxdec8; - } - #endif - ; -} - -void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tpe[esize])) f(in,n,out); - else { - unsigned i, stride=n/esize; - unsigned char *op,*ip; - for(ip = in,op = out; ip < in+stride*esize; op++) - for(i = 0; i < esize; i++) - op[i*stride] = *ip++; - for(op = out + esize*stride; ip < in+n;) - *op++ = *ip++; - } -} - -void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tpd[esize])) f(in,n,out); - else { - unsigned i, stride = n/esize; - unsigned char *op,*ip; - for(op = out,ip = in; op < out+stride*esize; ip++) - for(i = 0; i < esize; i++) - *op++ = ip[i*stride]; - for(ip = in+esize*stride; op < out+n;) - *op++ = *ip++; - } -} - -void tp4enc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tp4e[esize])) f(in,n,out); - else tpenc(in,n,out,esize); -} - -void tp4dec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tp4d[esize])) f(in,n,out); - else tpdec(in,n,out,esize); -} - -//-- zigzag -void tpzenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tpze[esize])) f(in,n,out); - else { - unsigned i, stride = n/esize; - unsigned char *op, *ip; - for(ip = in,op = out; ip < in+stride*esize; op++) - for(i = 0; i < esize; i++) - op[i*stride] = *ip++; // TODO:zigzag - for(op = out + esize*stride; ip < in+n;) - *op++ = *ip++; // TODO:zigzag - } -} - -void tpzdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tpzd[esize])) f(in,n,out); - else { - unsigned i, stride = n/esize; - unsigned char *op,*ip; - for(op = out,ip = in; op < out+stride*esize; ip++) - for(i = 0; i < esize; i++) - *op++ = ip[i*stride]; - for(ip = in+esize*stride; op < out+n;) - *op++ = *ip++; - } -} - -void tp4zenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tp4ze[esize])) f(in,n,out); - else tpzenc(in,n,out,esize); -} - -void tp4zdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tp4zd[esize])) f(in,n,out); - else tpzdec(in,n,out,esize); -} - -//-- xor -void tpxenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tpxe[esize])) f(in,n,out); - else { - unsigned i, stride = n/esize; - unsigned char *op, *ip; - for(ip = in,op = out; ip < in+stride*esize; op++) - for(i = 0; i < esize; i++) - op[i*stride] = *ip++; // TODO:xor - for(op = out + esize*stride; ip < in+n;) - *op++ = *ip++; // TODO:xor - } -} - -void tpxdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tpxd[esize])) f(in,n,out); - else { - unsigned i, stride = n/esize; - unsigned char *op,*ip; - for(op = out,ip = in; op < out+stride*esize; ip++) - for(i = 0; i < esize; i++) - *op++ = ip[i*stride]; - for(ip = in+esize*stride; op < out+n;) - *op++ = *ip++; - } -} - -void tp4xenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tp4xe[esize])) f(in,n,out); - else tpxenc(in,n,out,esize); -} - -void tp4xdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tp4xd[esize])) f(in,n,out); - else tpxdec(in,n,out,esize); -} - -#define E for(e = esize-1; e >= 0; e--) -#define ODX2 (x + y * nx) * esize + e -void tp2denc(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out, unsigned esize) { - switch(esize) { - case 2: tp2denc2(in,nx,ny,out); break; - case 4: tp2denc4(in,nx,ny,out); break; - case 8: tp2denc8(in,nx,ny,out); break; - default: { - unsigned x,y; - uint8_t *op = out, *ip = in; - int e; - for( x = 0; x < nx; x++) - for(y = 0; y < ny; y++) E - op[ODX2] = *ip++; - } - } -} - -void tp2ddec(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out, unsigned esize) { - switch(esize) { - case 2: tp2ddec2(in,nx,ny,out); break; - case 4: tp2ddec4(in,nx,ny,out); break; - case 8: tp2ddec8(in,nx,ny,out); break; - default: { unsigned x,y; - uint8_t *op = out, *ip = in; - int e; - for( x = 0; x < nx; x++) - for(y = 0; y < ny; y++) E - *op++ = ip[ODX2]; - } - } -} -#undef ODX2 - -#define ODX3 (x + y * nx + z * ny * nx) * esize + e -void tp3denc(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) { - switch(esize) { - case 2: tp3denc2(in,nx,ny,nz,out); break; - case 4: tp3denc4(in,nx,ny,nz,out); break; - case 8: tp3denc8(in,nx,ny,nz,out); break; - default: { unsigned x,y,z; - uint8_t *op = out, *ip = in; - int e; - for( x = 0; x < nx; x++) - for( y = 0; y < ny; y++) - for(z = 0; z < nz; z++) E - op[ODX3] = *ip++; - } - } -} - -void tp3ddec(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) { - switch(esize) { - case 2: tp3ddec2(in,nx,ny,nz,out); break; - case 4: tp3ddec4(in,nx,ny,nz,out); break; - case 8: tp3ddec8(in,nx,ny,nz,out); break; - default: { unsigned x,y,z; - uint8_t *op = out, *ip = in; - int e; - for(x = 0; x < nx; ++x) - for(y = 0; y < ny; ++y) - for(z = 0; z < nz; ++z) E - *op++ = ip[ODX3]; - } - } -} -#undef ODX3 - -#define ODX4 (w + x * nw + y * nx * nw + z * nx * ny * nw) * esize + e -void tp4denc(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) { - switch(esize) { - case 2: tp4denc2(in,nw,nx,ny,nz,out); break; - case 4: tp4denc4(in,nw,nx,ny,nz,out); break; - case 8: tp4denc8(in,nw,nx,ny,nz,out); break; - default: { - unsigned w, x, y, z; - uint8_t *op = out, *ip = in; - int e; - for( w = 0; w < nw; w++) - for( x = 0; x < nx; x++) - for( y = 0; y < ny; y++) - for(z = 0; z < nz; z++) E - op[ODX4] = *ip++; - } - } -} - -void tp4ddec(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) { - switch(esize) { - case 2: tp4ddec2(in,nw,nx,ny,nz,out); break; - case 4: tp4ddec4(in,nw,nx,ny,nz,out); break; - case 8: tp4ddec8(in,nw,nx,ny,nz,out); break; - default: { - unsigned w,x,y,z; - uint8_t *op = out,*ip = in; - int e; - for( w = 0; w < nw; w++) - for( x = 0; x < nx; ++x) - for( y = 0; y < ny; ++y) - for(z = 0; z < nz; ++z) E - *op++ = ip[ODX4]; - } - } -} - -#undef ODX4 -#undef E - #endif - -#else -//************************************************************* Templates (separat part included multiple times) ******************************************************************** - -#define SIE(p,i) (p+=stride) //faster on ARM -//#define SIE(_p_,_i_) (_p_+ _i_*stride) - -#define SID(p,i) (p+=stride) -//#define SID(_p_,_i_) (_p_+ _i_*stride) - -#if !defined(__AVX2__) && !defined(__SSE3__) && !defined(ISDELTA) //--------------------------------------- plain ------------------------------------------------------------------- - - #if STRIDE == ESIZE -#if ESIZE == 2 || ESIZE == 4 || ESIZE == 8 -#define uint_t T3(uint, USIZE, _t) - -#define ODX2 (x + y * nx) -#define O2D(_i_) (x + (y+_i_) * nx) -void T2(tp2denc,ESIZE)(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out) { - unsigned x,y; - uint_t *op = out, *ip = in; - - for( x = 0; x < nx; x++) - for(y = 0; y < ny; y++) - op[ODX2] = *ip++; -} - -void T2(tp2ddec,ESIZE)(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out) { - unsigned x, y; - uint_t *op = out, *ip = in; - - for( x = 0; x < nx; x++) - for(y=0; y != ny; y++) - *op++ = ip[ODX2]; -} -#undef ODX2 - -#define ODX3 (x + y * nx + z * ny * nx) -void T2(tp3denc,ESIZE)(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out) { - unsigned x, y, z; - uint_t *op = out, *ip = in; - - for( x = 0; x < nx; x++) - for( y = 0; y < ny; y++) - for(z = 0; z < nz; z++) - op[ODX3] = *ip++; -} - -void T2(tp3ddec,ESIZE)(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out) { - unsigned x,y,z; - uint_t *op = out, *ip = in; - - for(x = 0; x < nx; ++x) - for(y = 0; y < ny; ++y) - for(z = 0; z < nz; ++z) - *op++ = ip[ODX3]; -} -#undef ODX3 - -#define ODX4 (w + x * nw + y * nx * nw + z * nx * ny * nw) -void T2(tp4denc,ESIZE)(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out) { - unsigned w,x,y,z; - uint_t *op = out, *ip=in; - - for( w = 0; w < nw; w++) - for( x = 0; x < nx; x++) - for( y = 0; y < ny; y++) - for(z = 0; z < nz; z++) - op[ODX4] = *ip++; -} - -void T2(tp4ddec,ESIZE)(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out) { - unsigned w,x,y,z; - uint_t *op = out, *ip = in; - - for( w = 0; w < nw; ++w) - for( x = 0; x < nx; ++x) - for( y = 0; y < ny; ++y) - for(z = 0; z < nz; ++z) - *op++= ip[ODX4]; -} -#undef ODX4 - #endif - -void T2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned char *op,*ip,*e; - unsigned stride = n/STRIDE; - - #if powof2(ESIZE) - e = in+(n&~(ESIZE-1)); - #else - e = in+stride*ESIZE; - #endif - - for(ip = in,op = out; ip < e; op++, ip+=ESIZE) { unsigned char *p = op; - p[0] = ip[ 0]; - *SIE(p, 1) = ip[ 1]; - #if ESIZE > 2 - *SIE(p, 2) = ip[ 2]; - #if ESIZE > 3 - *SIE(p, 3) = ip[ 3]; - #if ESIZE > 4 - uint32_t u = ctou32(p); - *SIE(p, 4) = ip[ 4]; - *SIE(p, 5) = ip[ 5]; - *SIE(p, 6) = ip[ 6]; - *SIE(p, 7) = ip[ 7]; - #if ESIZE > 8 - *SIE(p, 8) = ip[ 8]; - *SIE(p, 9) = ip[ 9]; - *SIE(p,10) = ip[10]; - *SIE(p,11) = ip[11]; - *SIE(p,12) = ip[12]; - *SIE(p,13) = ip[13]; - *SIE(p,14) = ip[14]; - *SIE(p,15) = ip[15]; - #endif - #endif - #endif - #endif - } - for(op = out+stride*ESIZE;ip < in+n;) - *op++ = *ip++; -} - -void T2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned char *op,*ip,*e; - unsigned stride = n/STRIDE; - - #if powof2(ESIZE) - e = out+(n&~(ESIZE-1)); - #else - e = out+stride*ESIZE; - #endif - for(op = out,ip = in; op < e; ip++,op+=ESIZE) { unsigned char *p = ip; - op[ 0] = *p; - op[ 1] = *SID(p,1); - #if ESIZE > 2 - op[ 2] = *SID(p,2); - #if ESIZE > 3 - op[ 3] = *SID(p,3); - #if ESIZE > 4 - op[ 4] = *SID(p,4); - op[ 5] = *SID(p,5); - op[ 6] = *SID(p,6); - op[ 7] = *SID(p,7); - #if ESIZE > 8 - op[ 8] = *SID(p,8); - op[ 9] = *SID(p,9); - op[10] = *SID(p,10); - op[11] = *SID(p,11); - op[12] = *SID(p,12); - op[13] = *SID(p,13); - op[14] = *SID(p,14); - op[15] = *SID(p,15); - #endif - #endif - #endif - #endif - } - for(ip = in+stride*ESIZE; op < out+n; ) - *op++ = *ip++; -} - -// TODO: zigzag -void T2(TPZENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned char *op,*ip,*e; - unsigned stride = n/STRIDE; - - #if powof2(ESIZE) - e = in+(n&~(ESIZE-1)); - #else - e = in+stride*ESIZE; - #endif - - for(ip = in,op = out; ip < e; op++, ip+=ESIZE) { unsigned char *p = op; - p[0] = ip[ 0]; - *SIE(p, 1) = ip[ 1]; - #if ESIZE > 2 - *SIE(p, 2) = ip[ 2]; - #if ESIZE > 3 - *SIE(p, 3) = ip[ 3]; - #if ESIZE > 4 - uint32_t u = ctou32(p); - *SIE(p, 4) = ip[ 4]; - *SIE(p, 5) = ip[ 5]; - *SIE(p, 6) = ip[ 6]; - *SIE(p, 7) = ip[ 7]; - #if ESIZE > 8 - *SIE(p, 8) = ip[ 8]; - *SIE(p, 9) = ip[ 9]; - *SIE(p,10) = ip[10]; - *SIE(p,11) = ip[11]; - *SIE(p,12) = ip[12]; - *SIE(p,13) = ip[13]; - *SIE(p,14) = ip[14]; - *SIE(p,15) = ip[15]; - #endif - #endif - #endif - #endif - } - for(op = out+stride*ESIZE;ip < in+n;) - *op++ = *ip++; -} - -void T2(TPZDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned char *op,*ip,*e; - unsigned stride = n/STRIDE; - - #if powof2(ESIZE) - e = out+(n&~(ESIZE-1)); - #else - e = out+stride*ESIZE; - #endif - for(op = out,ip = in; op < e; ip++,op+=ESIZE) { unsigned char *p = ip; - op[ 0] = *p; - op[ 1] = *SID(p,1); - #if ESIZE > 2 - op[ 2] = *SID(p,2); - #if ESIZE > 3 - op[ 3] = *SID(p,3); - #if ESIZE > 4 - op[ 4] = *SID(p,4); - op[ 5] = *SID(p,5); - op[ 6] = *SID(p,6); - op[ 7] = *SID(p,7); - #if ESIZE > 8 - op[ 8] = *SID(p,8); - op[ 9] = *SID(p,9); - op[10] = *SID(p,10); - op[11] = *SID(p,11); - op[12] = *SID(p,12); - op[13] = *SID(p,13); - op[14] = *SID(p,14); - op[15] = *SID(p,15); - #endif - #endif - #endif - #endif - } - for(ip = in+stride*ESIZE; op < out+n; ) - *op++ = *ip++; -} - -// TODO: xor -void T2(TPXENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned char *op,*ip,*e; - unsigned stride = n/STRIDE; - - #if powof2(ESIZE) - e = in+(n&~(ESIZE-1)); - #else - e = in+stride*ESIZE; - #endif - - for(ip = in,op = out; ip < e; op++, ip+=ESIZE) { unsigned char *p = op; - p[0] = ip[ 0]; - *SIE(p, 1) = ip[ 1]; - #if ESIZE > 2 - *SIE(p, 2) = ip[ 2]; - #if ESIZE > 3 - *SIE(p, 3) = ip[ 3]; - #if ESIZE > 4 - uint32_t u = ctou32(p); - *SIE(p, 4) = ip[ 4]; - *SIE(p, 5) = ip[ 5]; - *SIE(p, 6) = ip[ 6]; - *SIE(p, 7) = ip[ 7]; - #if ESIZE > 8 - *SIE(p, 8) = ip[ 8]; - *SIE(p, 9) = ip[ 9]; - *SIE(p,10) = ip[10]; - *SIE(p,11) = ip[11]; - *SIE(p,12) = ip[12]; - *SIE(p,13) = ip[13]; - *SIE(p,14) = ip[14]; - *SIE(p,15) = ip[15]; - #endif - #endif - #endif - #endif - } - for(op = out+stride*ESIZE;ip < in+n;) - *op++ = *ip++; -} - -void T2(TPXDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned char *op,*ip,*e; - unsigned stride = n/STRIDE; - - #if powof2(ESIZE) - e = out+(n&~(ESIZE-1)); - #else - e = out+stride*ESIZE; - #endif - for(op = out,ip = in; op < e; ip++,op+=ESIZE) { unsigned char *p = ip; - op[ 0] = *p; - op[ 1] = *SID(p,1); - #if ESIZE > 2 - op[ 2] = *SID(p,2); - #if ESIZE > 3 - op[ 3] = *SID(p,3); - #if ESIZE > 4 - op[ 4] = *SID(p,4); - op[ 5] = *SID(p,5); - op[ 6] = *SID(p,6); - op[ 7] = *SID(p,7); - #if ESIZE > 8 - op[ 8] = *SID(p,8); - op[ 9] = *SID(p,9); - op[10] = *SID(p,10); - op[11] = *SID(p,11); - op[12] = *SID(p,12); - op[13] = *SID(p,13); - op[14] = *SID(p,14); - op[15] = *SID(p,15); - #endif - #endif - #endif - #endif - } - for(ip = in+stride*ESIZE; op < out+n; ) - *op++ = *ip++; -} - - #endif // STRIDE - -#else - -#if ESIZE == 2 || ESIZE == 4 || ESIZE == 8 - #if defined(__AVX2__) - #ifdef TPENC256V + #if defined(__AVX2__) + #ifdef TPENC256V void T2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { unsigned v = n&~(ESIZE*32-1); unsigned stride = v/STRIDE; @@ -1378,8 +617,9 @@ void T2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { } T2(tpenc,ESIZE)(in+v, n-v, out+v); } - #endif + #endif // TPENC256V + #ifdef TPDEC256V #define NBL0(x,y) ov[x] = _mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(p ))),_MM_SHUFFLE(3, 1, 2, 0));\ ov[y] = _mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(p+=stride))),_MM_SHUFFLE(3, 1, 2, 0)); @@ -1392,7 +632,6 @@ void T2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { _iv_ = _mm256_or_si256(_mm256_slli_epi16(ov[y],4), ov[x]); \ } - #ifdef TPDEC256V void T2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { unsigned v = n&~(ESIZE*32-1); unsigned stride = v/STRIDE; @@ -1486,9 +725,10 @@ void T2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { } if(n-v) T2(tpdec,ESIZE)(in+v, n-v, out+v); } - #endif - - #elif defined(__SSE3__) || defined(__ARM_NEON) + #endif //TPDEC256V + #else //__AVX2__ + + #if (defined(__SSE3__) || defined(__ARM_NEON)) && (ESIZE == 2 || ESIZE == 4 || ESIZE == 8) #define ST(_p_,_v_,_i_) _mm_storeu_si128((__m128i *)SIE(_p_,_i_), _v_) #define ST0(_p_,_v_) _mm_storeu_si128((__m128i *)(_p_), _v_) @@ -1569,9 +809,9 @@ void T2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { #elif ESIZE == 8 #ifdef __ARM_NEON -#define vzipl_u16(_a_,_b_) vzip_u16(vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))) -#define vziph_u16(_a_,_b_) vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))) -//#define VQ + #define vzipl_u16(_a_,_b_) vzip_u16(vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))) + #define vziph_u16(_a_,_b_) vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))) + //#define VQ #ifndef VQ uint16x4x2_t v16[8]; uint32x2x2_t v32[8]; @@ -1922,9 +1162,161 @@ void T2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { } T2(tpdec,ESIZE)(in+v, n-v, out+v); } - #endif + #endif // SSE3 - #endif + //--------------------------------------- plain ------------------------------------------------------------------- + #if STRIDE == ESIZE // bytes only, no nibble version + #if (ESIZE == 2 || ESIZE == 4 || ESIZE == 8) && !defined(ISDELTA) +#define uint_t T3(uint, USIZE, _t) -#endif -#endif +#define ODX2 (x + y * nx) +#define O2D(_i_) (x + (y+_i_) * nx) +void T2(tp2denc,ESIZE)(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out) { + unsigned x,y; + uint_t *op = out, *ip = in; + + for( x = 0; x < nx; x++) + for(y = 0; y < ny; y++) + op[ODX2] = *ip++; +} + +void T2(tp2ddec,ESIZE)(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out) { + unsigned x, y; + uint_t *op = out, *ip = in; + + for( x = 0; x < nx; x++) + for(y=0; y != ny; y++) + *op++ = ip[ODX2]; +} +#undef ODX2 + +#define ODX3 (x + y * nx + z * ny * nx) +void T2(tp3denc,ESIZE)(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out) { + unsigned x, y, z; + uint_t *op = out, *ip = in; + + for( x = 0; x < nx; x++) + for( y = 0; y < ny; y++) + for(z = 0; z < nz; z++) + op[ODX3] = *ip++; +} + +void T2(tp3ddec,ESIZE)(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out) { + unsigned x,y,z; + uint_t *op = out, *ip = in; + + for(x = 0; x < nx; ++x) + for(y = 0; y < ny; ++y) + for(z = 0; z < nz; ++z) + *op++ = ip[ODX3]; +} +#undef ODX3 + +#define ODX4 (w + x * nw + y * nx * nw + z * nx * ny * nw) +void T2(tp4denc,ESIZE)(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out) { + unsigned w,x,y,z; + uint_t *op = out, *ip=in; + + for( w = 0; w < nw; w++) + for( x = 0; x < nx; x++) + for( y = 0; y < ny; y++) + for(z = 0; z < nz; z++) + op[ODX4] = *ip++; +} + +void T2(tp4ddec,ESIZE)(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out) { + unsigned w,x,y,z; + uint_t *op = out, *ip = in; + + for( w = 0; w < nw; ++w) + for( x = 0; x < nx; ++x) + for( y = 0; y < ny; ++y) + for(z = 0; z < nz; ++z) + *op++= ip[ODX4]; +} +#undef ODX4 + #endif // ISDELTA + +void T2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { + unsigned char *op,*ip,*e; + unsigned stride = n/STRIDE; + + #if powof2(ESIZE) + e = in+(n&~(ESIZE-1)); + #else + e = in+stride*ESIZE; + #endif + + for(ip = in,op = out; ip < e; op++, ip+=ESIZE) { unsigned char *p = op; + p[0] = ip[ 0]; + *SIE(p, 1) = ip[ 1]; + #if ESIZE > 2 + *SIE(p, 2) = ip[ 2]; + #if ESIZE > 3 + *SIE(p, 3) = ip[ 3]; + #if ESIZE > 4 + uint32_t u = ctou32(p); + *SIE(p, 4) = ip[ 4]; + *SIE(p, 5) = ip[ 5]; + *SIE(p, 6) = ip[ 6]; + *SIE(p, 7) = ip[ 7]; + #if ESIZE > 8 + *SIE(p, 8) = ip[ 8]; + *SIE(p, 9) = ip[ 9]; + *SIE(p,10) = ip[10]; + *SIE(p,11) = ip[11]; + *SIE(p,12) = ip[12]; + *SIE(p,13) = ip[13]; + *SIE(p,14) = ip[14]; + *SIE(p,15) = ip[15]; + #endif + #endif + #endif + #endif + } + for(op = out+stride*ESIZE;ip < in+n;) + *op++ = *ip++; +} + +void T2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { + unsigned char *op,*ip,*e; + unsigned stride = n/STRIDE; + + #if powof2(ESIZE) + e = out+(n&~(ESIZE-1)); + #else + e = out+stride*ESIZE; + #endif + for(op = out,ip = in; op < e; ip++,op += ESIZE) { unsigned char *p = ip; + op[ 0] = *p; + op[ 1] = *SID(p,1); + #if ESIZE > 2 + op[ 2] = *SID(p,2); + #if ESIZE > 3 + op[ 3] = *SID(p,3); + #if ESIZE > 4 + op[ 4] = *SID(p,4); + op[ 5] = *SID(p,5); + op[ 6] = *SID(p,6); + op[ 7] = *SID(p,7); + #if ESIZE > 8 + op[ 8] = *SID(p,8); + op[ 9] = *SID(p,9); + op[10] = *SID(p,10); + op[11] = *SID(p,11); + op[12] = *SID(p,12); + op[13] = *SID(p,13); + op[14] = *SID(p,14); + op[15] = *SID(p,15); + #endif + #endif + #endif + #endif + } + for(ip = in+stride*ESIZE; op < out+n; ) + *op++ = *ip++; +} + #endif // STRIDE = ESIZE + + #endif // avx2 +#endif // template