diff --git a/transpose.c b/transpose.c index 5e651d0..2db63cc 100644 --- a/transpose.c +++ b/transpose.c @@ -1,7 +1,7 @@ /** Copyright (C) powturbo 2013-2019 GPL v2 License - + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -45,8 +45,8 @@ #include "sse_neon.h" #endif -#pragma warning( disable : 4005) - +#pragma warning( disable : 4005) + #include "conf.h" #include "transpose.h" @@ -57,7 +57,7 @@ #endif #define powof2(n) !((n)&((n)-1)) - + #define TPENC tpenc #define TPDEC tpdec @@ -148,22 +148,22 @@ static unsigned _cpuisa; #include #endif -static inline void cpuid(int reg[4], int id) { - #if defined (_MSC_VER) //|| defined (__INTEL_COMPILER) - __cpuidex(reg, id, 0); - #elif defined(__i386__) || defined(__x86_64__) +static inline void cpuid(int reg[4], int id) { + #if defined (_MSC_VER) //|| defined (__INTEL_COMPILER) + __cpuidex(reg, id, 0); + #elif defined(__i386__) || defined(__x86_64__) __asm("cpuid" : "=a"(reg[0]),"=b"(reg[1]),"=c"(reg[2]),"=d"(reg[3]) : "a"(id),"c"(0) : ); #endif } -static inline uint64_t xgetbv (int ctr) { +static inline uint64_t xgetbv (int ctr) { #if(defined _MSC_VER && (_MSC_FULL_VER >= 160040219) || defined __INTEL_COMPILER) - return _xgetbv(ctr); + return _xgetbv(ctr); #elif defined(__i386__) || defined(__x86_64__) unsigned a, d; __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : ); return (uint64_t)d << 32 | a; - #else + #else unsigned a=0, d=0; return (uint64_t)d << 32 | a; #endif @@ -190,42 +190,42 @@ static inline uint64_t xgetbv (int ctr) { #define IS_NEON 0x38 // arm neon #define IS_SSE41 0x40 #define IS_SSE41x 0x41 //+popcount -#define IS_SSE42 0x42 +#define IS_SSE42 0x42 #define IS_AVX 0x50 #define IS_AVX2 0x60 #define IS_AVX512 0x800 unsigned cpuisa(void) { - int c[4] = {0}; - if(_cpuisa) return _cpuisa; - _cpuisa++; - #if defined(__i386__) || defined(__x86_64__) - cpuid(c, 0); - if(c[0]) { - cpuid(c, 1); + int c[4] = {0}; + if(_cpuisa) return _cpuisa; + _cpuisa++; + #if defined(__i386__) || defined(__x86_64__) + cpuid(c, 0); + if(c[0]) { + cpuid(c, 1); //family = ((c >> 8) & 0xf) + ((c >> 20) & 0xff) //model = ((c >> 4) & 0xf) + ((c >> 12) & 0xf0) - if( c[3] & (1 << 25)) { _cpuisa = IS_SSE; - if( c[3] & (1 << 26)) { _cpuisa = IS_SSE2; - if( c[2] & (1 << 0)) { _cpuisa = IS_SSE3; + if( c[3] & (1 << 25)) { _cpuisa = IS_SSE; + if( c[3] & (1 << 26)) { _cpuisa = IS_SSE2; + if( c[2] & (1 << 0)) { _cpuisa = IS_SSE3; // _cpuisa = IS_SSE3SLOW; // Atom SSSE3 slow - if( c[2] & (1 << 9)) { _cpuisa = IS_SSSE3; - if( c[2] & (1 << 19)) { _cpuisa = IS_SSE41; - if( c[2] & (1 << 23)) { _cpuisa = IS_SSE41x; // +popcount + if( c[2] & (1 << 9)) { _cpuisa = IS_SSSE3; + if( c[2] & (1 << 19)) { _cpuisa = IS_SSE41; + if( c[2] & (1 << 23)) { _cpuisa = IS_SSE41x; // +popcount if( c[2] & (1 << 20)) { _cpuisa = IS_SSE42; // SSE4.2 - if((c[2] & (1 << 28)) && - (c[2] & (1 << 27)) && // OSXSAVE + if((c[2] & (1 << 28)) && + (c[2] & (1 << 27)) && // OSXSAVE (c[2] & (1 << 26)) && // XSAVE (xgetbv(0) & 6)==6) { _cpuisa = IS_AVX; // AVX if(c[2]& (1 << 3)) _cpuisa |= 1; // +FMA3 - if(c[2]& (1 << 16)) _cpuisa |= 2; // +FMA4 + if(c[2]& (1 << 16)) _cpuisa |= 2; // +FMA4 if(c[2]& (1 << 25)) _cpuisa |= 4; // +AES - cpuid(c, 7); - if(c[1] & (1 << 5)) { _cpuisa = IS_AVX2; - if(c[1] & (1 << 16)) { - cpuid(c, 0xd); - if((c[0] & 0x60)==0x60) { _cpuisa = IS_AVX512; - cpuid(c, 7); + cpuid(c, 7); + if(c[1] & (1 << 5)) { _cpuisa = IS_AVX2; + if(c[1] & (1 << 16)) { + cpuid(c, 0xd); + if((c[0] & 0x60)==0x60) { _cpuisa = IS_AVX512; + cpuid(c, 7); if(c[1] & (1<<16)) _cpuisa |= AVX512F; if(c[1] & (1<<17)) _cpuisa |= AVX512DQ; if(c[1] & (1<<21)) _cpuisa |= AVX512IFMA; @@ -236,23 +236,23 @@ unsigned cpuisa(void) { if(c[1] & (1<<31)) _cpuisa |= AVX512VL; if(c[2] & (1<< 1)) _cpuisa |= AVX512VBMI; if(c[2] & (1<<11)) _cpuisa |= AVX512VNNI; - if(c[2] & (1<< 6)) _cpuisa |= AVX512VBMI2; + if(c[2] & (1<< 6)) _cpuisa |= AVX512VBMI2; }}} }}}}}}}}} #elif defined(__powerpc64__) - _cpuisa = IS_POWER9; // power9 + _cpuisa = IS_POWER9; // power9 #elif defined(__ARM_NEON) - _cpuisa = IS_NEON; // ARM_NEON - #endif + _cpuisa = IS_NEON; // ARM_NEON + #endif return _cpuisa; } #endif - + unsigned cpuini(unsigned cpuisa) { if(cpuisa) _cpuisa = cpuisa; return _cpuisa; } char *cpustr(unsigned cpuisa) { if(!cpuisa) cpuisa = _cpuisa; - #if defined(__i386__) || defined(__x86_64__) + #if defined(__i386__) || defined(__x86_64__) if(cpuisa >= IS_AVX512) { if(cpuisa & AVX512VBMI2) return "avx512vbmi2"; if(cpuisa & AVX512VBMI) return "avx512vbmi"; @@ -266,17 +266,17 @@ char *cpustr(unsigned cpuisa) { if(cpuisa & AVX512DQ) return "avx512dq"; if(cpuisa & AVX512F) return "avx512f"; return "avx512"; - } + } else if(cpuisa >= IS_AVX2) return "avx2"; - else if(cpuisa >= IS_AVX) - switch(cpuisa&0xf) { + else if(cpuisa >= IS_AVX) + switch(cpuisa&0xf) { case 1: return "avx+fma3"; case 2: return "avx+fma4"; case 4: return "avx+aes"; case 5: return "avx+fma3+aes"; - default:return "avx"; + default:return "avx"; } - else if(cpuisa >= IS_SSE42) return "sse4.2"; + else if(cpuisa >= IS_SSE42) return "sse4.2"; else if(cpuisa >= IS_SSE41x) return "sse4.1+popcnt"; else if(cpuisa >= IS_SSE41) return "sse4.1"; else if(cpuisa >= IS_SSSE3) return "ssse3"; @@ -284,145 +284,145 @@ char *cpustr(unsigned cpuisa) { else if(cpuisa >= IS_SSE2) return "sse2"; else if(cpuisa >= IS_SSE) return "sse"; #elif defined(__powerpc64__) - if(cpuisa >= IS_POWER9) return "power9"; - #elif defined(__ARM_NEON) + if(cpuisa >= IS_POWER9) return "power9"; + #elif defined(__ARM_NEON) if(cpuisa >= IS_NEON) return "arm_neon"; #endif return "none"; } - + //--------------------------------------------------------------------------------- typedef void (*TPFUNC)( unsigned char *in, unsigned n, unsigned char *out); - // 0 1 2 3 4 5 6 7 8 9 16 + // 0 1 2 3 4 5 6 7 8 9 16 static TPFUNC _tpe[] = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; static TPFUNC _tpd[] = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 }; - + static TPFUNC _tp4e[] = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // Nibble static TPFUNC _tp4d[] = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 }; static int tpset; - -void tpini(int id) { - int i; - if(tpset) return; - tpset++; + +void tpini(int id) { + int i; + if(tpset) return; + tpset++; i = id?id:cpuisa(); #if defined(__i386__) || defined(__x86_64__) - if(i >= IS_AVX2) { + if(i >= IS_AVX2) { _tpe[2] = tpenc128v2; _tpd[2] = tpdec256v2; _tp4e[2] = tp4enc256v2; _tp4d[2] = tp4dec256v2; //SSE encoding _tpe[2] is faster _tpe[4] = tpenc128v4; _tpd[4] = tpdec256v4; _tp4e[4] = tp4enc256v4; _tp4d[4] = tp4dec256v4; //SSE encoding _tpe[4] is faster - _tpe[8] = tpenc256v8; _tpd[8] = tpdec256v8; _tp4e[8] = tp4enc256v8; _tp4d[8] = tp4dec256v8; - } else + _tpe[8] = tpenc256v8; _tpd[8] = tpdec256v8; _tp4e[8] = tp4enc256v8; _tp4d[8] = tp4dec256v8; + } else #endif #if defined(__i386__) || defined(__x86_64__) || defined(__ARM_NEON) || defined(__powerpc64__) if(i >= IS_SSE2) { _tpe[2] = tpenc128v2; _tpd[2] = tpdec128v2; _tp4e[2] = tp4enc128v2; _tp4d[2] = tp4dec128v2; _tpe[4] = tpenc128v4; _tpd[4] = tpdec128v4; _tp4e[4] = tp4enc128v4; _tp4d[4] = tp4dec128v4; _tpe[8] = tpenc128v8; _tpd[8] = tpdec128v8; _tp4e[8] = tp4enc128v8; _tp4d[8] = tp4dec128v8; - if(i == 35) _tpd[8] = tpdec8; // ARM NEON scalar is faster + if(i == 35) _tpd[8] = tpdec8; // ARM NEON scalar is faster } #endif } -void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); +void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { + TPFUNC f; + if(!tpset) tpini(0); if(esize <= 16 && (f = _tpe[esize])) f(in,n,out); else { - unsigned i, stride=n/esize; + unsigned i, stride=n/esize; unsigned char *op,*ip; for(ip = in,op = out; ip < in+stride*esize; op++) for(i = 0; i < esize; i++) - op[i*stride] = *ip++; - for(op = out + esize*stride; ip < in+n;) + op[i*stride] = *ip++; + for(op = out + esize*stride; ip < in+n;) *op++ = *ip++; } -} +} -void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tpd[esize])) f(in,n,out); +void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { + TPFUNC f; + if(!tpset) tpini(0); + if(esize <= 16 && (f = _tpd[esize])) f(in,n,out); else { - unsigned i, stride = n/esize; - unsigned char *op,*ip; + unsigned i, stride = n/esize; + unsigned char *op,*ip; for(op = out,ip = in; op < out+stride*esize; ip++) - for(i = 0; i < esize; i++) + for(i = 0; i < esize; i++) *op++ = ip[i*stride]; - for(ip = in+esize*stride; op < out+n;) + for(ip = in+esize*stride; op < out+n;) *op++ = *ip++; } -} +} #define E for(e = esize-1; e >= 0; e--) -//#define E for(e=0; e < esize; e++) -#define ODX2 (_x + _y * x) * esize + e -void tp2denc(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize) { - unsigned _x,_y; uint8_t *op = out, *ip = in; int e; - for( _x = 0; _x < x; _x++) +//#define E for(e=0; e < esize; e++) +#define ODX2 (_x + _y * x) * esize + e +void tp2denc(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize) { + unsigned _x,_y; uint8_t *op = out, *ip = in; int e; + for( _x = 0; _x < x; _x++) for(_y = 0; _y < y; _y++) E - op[ODX2] = *ip++; + op[ODX2] = *ip++; -} +} -void tp2ddec(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize) { - unsigned _x,_y; uint8_t *op=out,*ip=in; int e; - for( _x = 0; _x < x; _x++) +void tp2ddec(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize) { + unsigned _x,_y; uint8_t *op=out,*ip=in; int e; + for( _x = 0; _x < x; _x++) for(_y = 0; _y < y; _y++) E *op++ = ip[ODX2]; -} +} -#define ODX3 (_x + _y * x + _z * y * x) * esize + e +#define ODX3 (_x + _y * x + _z * y * x) * esize + e void tp3denc(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { - unsigned _x,_y,_z; uint8_t *op = out, *ip=in; int e; + unsigned _x,_y,_z; uint8_t *op = out, *ip=in; int e; for( _x = 0; _x < x; _x++) - for( _y = 0; _y < y; _y++) + for( _y = 0; _y < y; _y++) for(_z = 0; _z < z; _z++) E - op[ODX3] = *ip++; + op[ODX3] = *ip++; } void tp3ddec(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { unsigned _x,_y,_z; uint8_t *op=out,*ip=in; int e; for(_x = 0; _x < x; ++_x) - for(_y = 0; _y < y; ++_y) + for(_y = 0; _y < y; ++_y) for(_z = 0; _z < z; ++_z) E *op++= ip[ODX3]; -} +} -#define ODX4 (_w + _x * w + _y * x * w + _z * x * y * w) * esize + e -void tp4denc(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { - unsigned _w,_x,_y,_z; uint8_t *op = out, *ip=in; int e; +#define ODX4 (_w + _x * w + _y * x * w + _z * x * y * w) * esize + e +void tp4denc(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { + unsigned _w,_x,_y,_z; uint8_t *op = out, *ip=in; int e; for( _w = 0; _w < w; _w++) for( _x = 0; _x < x; _x++) - for( _y = 0; _y < y; _y++) + for( _y = 0; _y < y; _y++) for(_z = 0; _z < z; _z++) E - op[ODX4] = *ip++; + op[ODX4] = *ip++; } -void tp4ddec(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { - unsigned _w,_x,_y,_z; uint8_t *op=out,*ip=in; int e; +void tp4ddec(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { + unsigned _w,_x,_y,_z; uint8_t *op=out,*ip=in; int e; for( _w = 0; _w < w; _w++) for( _x = 0; _x < x; ++_x) - for( _y = 0; _y < y; ++_y) + for( _y = 0; _y < y; ++_y) for(_z = 0; _z < z; ++_z) E *op++= ip[ODX4]; -} +} -void tp4enc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { +void tp4enc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { TPFUNC f; - if(!tpset) tpini(0); + if(!tpset) tpini(0); if(esize <= 16 && (f = _tp4e[esize])) f(in,n,out); else tpenc(in,n,out,esize); -} - -void tp4dec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - TPFUNC f; - if(!tpset) tpini(0); - if(esize <= 16 && (f = _tp4d[esize])) f(in,n,out); - else tpdec(in,n,out,esize); } - #endif + +void tp4dec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { + TPFUNC f; + if(!tpset) tpini(0); + if(esize <= 16 && (f = _tp4d[esize])) f(in,n,out); + else tpdec(in,n,out,esize); +} + #endif #else //---------------------------------------------- Templates -------------------------------------------------------------- #define SIE(p,i) (p+=stride) //faster on ARM @@ -434,18 +434,18 @@ void tp4dec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { #if !defined(SSE2_ON) && !defined(AVX2_ON) //--------------------------------------- plain ------------------------------------------------------------------- #if STRIDE == ESIZE -void TEMPLATE2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { +void TEMPLATE2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { unsigned char *op,*ip,*e; - unsigned stride = n/STRIDE; + unsigned stride = n/STRIDE; #if powof2(ESIZE) e = in+(n&~(ESIZE-1)); - #else - e = in+stride*ESIZE; + #else + e = in+stride*ESIZE; #endif - + for(ip = in,op = out; ip < e; op++, ip+=ESIZE) { unsigned char *p = op; - p[0] = ip[ 0]; + p[0] = ip[ 0]; *SIE(p, 1) = ip[ 1]; #if ESIZE > 2 *SIE(p, 2) = ip[ 2]; @@ -469,27 +469,27 @@ void TEMPLATE2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) #endif #endif #endif - #endif + #endif } - for(op = out+stride*ESIZE;ip < in+n;) + for(op = out+stride*ESIZE;ip < in+n;) *op++ = *ip++; -} +} + +void TEMPLATE2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { + unsigned char *op,*ip,*e; + unsigned stride = n/STRIDE; -void TEMPLATE2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned char *op,*ip,*e; - unsigned stride = n/STRIDE; - #if powof2(ESIZE) e = out+(n&~(ESIZE-1)); - #else - e = out+stride*ESIZE; + #else + e = out+stride*ESIZE; #endif - for(op = out,ip = in; op < e; ip++,op+=ESIZE) { unsigned char *p = ip; + for(op = out,ip = in; op < e; ip++,op+=ESIZE) { unsigned char *p = ip; op[ 0] = *p; - op[ 1] = *SID(p,1); + op[ 1] = *SID(p,1); #if ESIZE > 2 op[ 2] = *SID(p,2); - #if ESIZE > 3 + #if ESIZE > 3 op[ 3] = *SID(p,3); #if ESIZE > 4 op[ 4] = *SID(p,4); @@ -510,155 +510,155 @@ void TEMPLATE2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) #endif #endif } - for(ip = in+stride*ESIZE; op < out+n; ) + for(ip = in+stride*ESIZE; op < out+n; ) *op++ = *ip++; -} +} #endif // STRIDE - + #else #if ESIZE == 2 || ESIZE == 4 || ESIZE == 8 #if defined(__AVX2__) void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { unsigned v = n&~(ESIZE*32-1); - unsigned stride = v/STRIDE; + unsigned stride = v/STRIDE; unsigned char *op,*ip; - + #if ESIZE == 2 - __m256i sv = _mm256_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, + __m256i sv = _mm256_set_epi8( 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0, - 15, 13, 11, 9, 7, 5, 3, 1, + 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); - __m256i sv0 = _mm256_set_epi8(15, 13, 11, 9, + __m256i sv0 = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, - 14, 12, 10, 8, - 6, 4, 2, 0, - 15, 13, 11, 9, - 7, 5, 3, 1, - 14, 12, 10, 8, - 6, 4, 2, 0); - __m256i sv1 = _mm256_set_epi8(14, 12, 10, 8, + 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1, - 14, 12, 10, 8, + 14, 12, 10, 8, + 6, 4, 2, 0); + __m256i sv1 = _mm256_set_epi8(14, 12, 10, 8, + 6, 4, 2, 0, + 15, 13, 11, 9, + 7, 5, 3, 1, + 14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1); #else __m256i pv = _mm256_set_epi32( 7, 3, 6, 2, 5, 1, 4, 0), #if ESIZE == 4 - sv0 = _mm256_set_epi8(15, 11, 7, 3, + sv0 = _mm256_set_epi8(15, 11, 7, 3, 13, 9, 5, 1, - 14, 10, 6, 2, + 14, 10, 6, 2, 12, 8, 4, 0, - 15, 11, 7, 3, + 15, 11, 7, 3, 13, 9, 5, 1, - 14, 10, 6, 2, + 14, 10, 6, 2, 12, 8, 4, 0), sv1= _mm256_set_epi8(13, 9, 5, 1, - 15, 11, 7, 3, + 15, 11, 7, 3, 12, 8, 4, 0, - 14, 10, 6, 2, + 14, 10, 6, 2, 13, 9, 5, 1, - 15, 11, 7, 3, + 15, 11, 7, 3, 12, 8, 4, 0, 14, 10, 6, 2); #else - sv = _mm256_set_epi8(15, 7, - 14, 6, - 13, 5, - 12, 4, - 11, 3, - 10, 2, - 9, 1, + sv = _mm256_set_epi8(15, 7, + 14, 6, + 13, 5, + 12, 4, + 11, 3, + 10, 2, + 9, 1, 8, 0, - 15, 7, - 14, 6, - 13, 5, - 12, 4, - 11, 3, - 10, 2, - 9, 1, + 15, 7, + 14, 6, + 13, 5, + 12, 4, + 11, 3, + 10, 2, + 9, 1, 8, 0 ), - tv = _mm256_set_epi8(15, 14, 11, 10, 13, 12, 9, 8, + tv = _mm256_set_epi8(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0, - 15, 14, 11, 10, 13, 12, 9, 8, + 15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0); #endif #endif - + #if STRIDE > ESIZE // ------------------ byte transpose ---------------------------------- - __m256i cl = _mm256_set1_epi8( 0x0f), - ch = _mm256_set1_epi8( 0xf0), + __m256i cl = _mm256_set1_epi8( 0x0f), + ch = _mm256_set1_epi8( 0xf0), cb = _mm256_set1_epi16(0xff); #endif - for(ip = in,op = out; ip != in+v; ip += ESIZE*32, op += ESIZE*32/STRIDE) { - unsigned char *p = op; PREFETCH(ip+ESIZE*192,0); - __m256i iv[ESIZE],ov[ESIZE]; + for(ip = in,op = out; ip != in+v; ip += ESIZE*32, op += ESIZE*32/STRIDE) { + unsigned char *p = op; PREFETCH(ip+ESIZE*192,0); + __m256i iv[ESIZE],ov[ESIZE]; #if ESIZE == 2 - ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv0); - ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1); + ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv0); + ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1); iv[0] = _mm256_permute4x64_epi64(_mm256_blend_epi32(ov[0], ov[1],0b11001100),_MM_SHUFFLE(3, 1, 2, 0)); - iv[1] = _mm256_blend_epi32(ov[0], ov[1],0b00110011); + iv[1] = _mm256_blend_epi32(ov[0], ov[1],0b00110011); iv[1] = _mm256_permute4x64_epi64(_mm256_shuffle_epi32(iv[1],_MM_SHUFFLE(1, 0, 3, 2)),_MM_SHUFFLE(3, 1, 2, 0)); #elif ESIZE == 4 - iv[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv0); - iv[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1); - iv[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv0); - iv[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv1); + iv[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv0); + iv[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1); + iv[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv0); + iv[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv1); - ov[0] = _mm256_blend_epi32(iv[0], iv[1],0b10101010); + ov[0] = _mm256_blend_epi32(iv[0], iv[1],0b10101010); ov[1] = _mm256_shuffle_epi32(_mm256_blend_epi32(iv[0], iv[1],0b01010101),_MM_SHUFFLE(2, 3, 0, 1)); - ov[2] = _mm256_blend_epi32(iv[2], iv[3],0b10101010); + ov[2] = _mm256_blend_epi32(iv[2], iv[3],0b10101010); ov[3] = _mm256_shuffle_epi32(_mm256_blend_epi32(iv[2], iv[3],0b01010101),_MM_SHUFFLE(2, 3, 0, 1)); - iv[0] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[2]), pv); + iv[0] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[2]), pv); iv[1] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[0], ov[2]), pv); iv[2] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[1], ov[3]), pv); - iv[3] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[3]), pv); + iv[3] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[3]), pv); #else - ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv); - ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv); - ov[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv); - ov[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv); - - iv[0] = _mm256_unpacklo_epi16(ov[0], ov[1]); iv[1] = _mm256_unpackhi_epi16(ov[0], ov[1]); - iv[2] = _mm256_unpacklo_epi16(ov[2], ov[3]); iv[3] = _mm256_unpackhi_epi16(ov[2], ov[3]); - + ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv); + ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv); + ov[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv); + ov[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv); + + iv[0] = _mm256_unpacklo_epi16(ov[0], ov[1]); iv[1] = _mm256_unpackhi_epi16(ov[0], ov[1]); + iv[2] = _mm256_unpacklo_epi16(ov[2], ov[3]); iv[3] = _mm256_unpackhi_epi16(ov[2], ov[3]); + ov[0] = _mm256_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm256_unpackhi_epi32(iv[0], iv[2]); ov[2] = _mm256_unpacklo_epi32(iv[1], iv[3]); ov[3] = _mm256_unpackhi_epi32(iv[1], iv[3]); - - ov[4] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+128)), sv); - ov[5] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+160)), sv); - ov[6] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+192)), sv); - ov[7] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+224)), sv); + + ov[4] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+128)), sv); + ov[5] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+160)), sv); + ov[6] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+192)), sv); + ov[7] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+224)), sv); iv[4] = _mm256_unpacklo_epi16(ov[4], ov[5]); iv[5] = _mm256_unpackhi_epi16(ov[4], ov[5]); iv[6] = _mm256_unpacklo_epi16(ov[6], ov[7]); iv[7] = _mm256_unpackhi_epi16(ov[6], ov[7]); ov[4] = _mm256_unpacklo_epi32(iv[4], iv[6]); ov[5] = _mm256_unpackhi_epi32(iv[4], iv[6]); ov[6] = _mm256_unpacklo_epi32(iv[5], iv[7]); ov[7] = _mm256_unpackhi_epi32(iv[5], iv[7]); - + iv[0] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[4]), pv), tv); iv[1] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[0], ov[4]), pv), tv); iv[2] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[1], ov[5]), pv), tv); iv[3] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[5]), pv), tv); - + iv[4] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[2], ov[6]), pv), tv); iv[5] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[2], ov[6]), pv), tv); iv[6] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[3], ov[7]), pv), tv); iv[7] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[3], ov[7]), pv), tv); #endif - - #if STRIDE <= ESIZE - _mm256_storeu_si256((__m256i *) p, iv[0]); + + #if STRIDE <= ESIZE + _mm256_storeu_si256((__m256i *) p, iv[0]); _mm256_storeu_si256((__m256i *)(p+=stride), iv[1]); #if ESIZE > 2 _mm256_storeu_si256((__m256i *)(p+=stride), iv[2]); - _mm256_storeu_si256((__m256i *)(p+=stride), iv[3]); + _mm256_storeu_si256((__m256i *)(p+=stride), iv[3]); #if ESIZE > 4 _mm256_storeu_si256((__m256i *)(p+=stride), iv[4]); _mm256_storeu_si256((__m256i *)(p+=stride), iv[5]); @@ -672,33 +672,33 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o #define ST128(_p_,_v_,_i_) _mm_storeu_si128((__m256i *)SIE(_p_,_i_), _mm256_castsi256_si128(_v_)) #define ST1280(_p_,_v_) _mm_storeu_si128((__m256i *)(_p_), _mm256_castsi256_si128(_v_)) - ov[0] = _mm256_and_si256(iv[0], cl); ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2)); - ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[0], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2)); - ov[2] = _mm256_and_si256(iv[1], cl); ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256( ov[2],2)); + ov[0] = _mm256_and_si256(iv[0], cl); ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2)); + ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[0], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2)); + ov[2] = _mm256_and_si256(iv[1], cl); ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256( ov[2],2)); ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[1], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2)); ST1280(p,ov[0]); ST128(p,ov[1],1); ST128(p,ov[2],2); ST128(p,ov[3],3); #if ESIZE > 2 ov[0] = _mm256_and_si256(iv[2], cl); ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2)); ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[2], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2)); ov[2] = _mm256_and_si256(iv[3], cl); ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256( ov[2],2)); - ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[3], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2)); + ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[3], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2)); ST128(p,ov[0],4); ST128(p,ov[1],5); ST128(p,ov[2],6); ST128(p,ov[3],7); #if ESIZE > 4 - ov[0] = _mm256_and_si256(iv[4], cl); ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2)); - ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[4], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2)); - ov[2] = _mm256_and_si256(iv[5], cl); ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256( ov[2],2)); - ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[5], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2)); + ov[0] = _mm256_and_si256(iv[4], cl); ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2)); + ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[4], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2)); + ov[2] = _mm256_and_si256(iv[5], cl); ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256( ov[2],2)); + ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[5], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2)); ST128(p,ov[0],8); ST128(p,ov[1],9); ST128(p,ov[2],10); ST128(p,ov[3],11); - ov[0] = _mm256_and_si256(iv[6], cl); ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2)); - ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[6], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2)); - ov[2] = _mm256_and_si256(iv[7], cl); ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256( ov[2],2)); - ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[7], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2)); + ov[0] = _mm256_and_si256(iv[6], cl); ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2)); + ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[6], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2)); + ov[2] = _mm256_and_si256(iv[7], cl); ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256( ov[2],2)); + ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[7], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2)); ST128(p,ov[0],12); ST128(p,ov[1],13); ST128(p,ov[2],14); ST128(p,ov[3],15); #endif - #endif + #endif #endif - } + } TEMPLATE2(tpenc,ESIZE)(in+v, n-v, out+v); } @@ -712,12 +712,12 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o ov[x] = _mm256_and_si256(_mm256_unpacklo_epi8(ov[x], _mm256_srli_epi16(ov[x],4)), cl);\ ov[y] = _mm256_and_si256(_mm256_unpacklo_epi8(ov[y], _mm256_srli_epi16(ov[y],4)), cl);\ _iv_ = _mm256_or_si256(_mm256_slli_epi16(ov[y],4), ov[x]); \ -} +} void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { unsigned v = n&~(ESIZE*32-1); - unsigned stride = v/STRIDE; - unsigned char *op,*ip; + unsigned stride = v/STRIDE; + unsigned char *op,*ip; #if STRIDE > ESIZE __m256i cl = _mm256_set1_epi8(0x0f), ch=_mm256_set1_epi8(0xf0), cb = _mm256_set1_epi16(0xff); @@ -725,7 +725,7 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o for(op = out,ip = in; op != out+v; ip += ESIZE*32/STRIDE, op += ESIZE*32) { unsigned char *p = ip; PREFETCH(ip+ESIZE*192,0); __m256i iv[ESIZE], ov[ESIZE]; - + #if STRIDE > ESIZE NBL0(0,1); NBL( 2,3); NB(0,1,iv[0]); NB(2,3,iv[1]); #if ESIZE > 2 @@ -736,40 +736,40 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o #endif #endif #else - iv[0] = _mm256_loadu_si256((__m256i *) p ); - iv[1] = _mm256_loadu_si256((__m256i *)(p+=stride)); + iv[0] = _mm256_loadu_si256((__m256i *) p ); + iv[1] = _mm256_loadu_si256((__m256i *)(p+=stride)); #if ESIZE > 2 - iv[2] = _mm256_loadu_si256((__m256i *)(p+=stride)); + iv[2] = _mm256_loadu_si256((__m256i *)(p+=stride)); iv[3] = _mm256_loadu_si256((__m256i *)(p+=stride)); #if ESIZE > 4 - iv[4] = _mm256_loadu_si256((__m256i *)(p+=stride)); - iv[5] = _mm256_loadu_si256((__m256i *)(p+=stride)); - iv[6] = _mm256_loadu_si256((__m256i *)(p+=stride)); - iv[7] = _mm256_loadu_si256((__m256i *)(p+=stride)); + iv[4] = _mm256_loadu_si256((__m256i *)(p+=stride)); + iv[5] = _mm256_loadu_si256((__m256i *)(p+=stride)); + iv[6] = _mm256_loadu_si256((__m256i *)(p+=stride)); + iv[7] = _mm256_loadu_si256((__m256i *)(p+=stride)); #endif #endif #endif - - #if ESIZE == 2 + + #if ESIZE == 2 ov[0] = _mm256_permute4x64_epi64(iv[0], _MM_SHUFFLE(3, 1, 2, 0)); ov[1] = _mm256_permute4x64_epi64(iv[1], _MM_SHUFFLE(3, 1, 2, 0)); - _mm256_storeu_si256((__m256i *)op, _mm256_unpacklo_epi8(ov[0], ov[1])); - _mm256_storeu_si256((__m256i *)(op+32), _mm256_unpackhi_epi8(ov[0], ov[1])); - #elif ESIZE == 4 + _mm256_storeu_si256((__m256i *)op, _mm256_unpacklo_epi8(ov[0], ov[1])); + _mm256_storeu_si256((__m256i *)(op+32), _mm256_unpackhi_epi8(ov[0], ov[1])); + #elif ESIZE == 4 ov[0] = _mm256_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm256_unpackhi_epi8( iv[0], iv[1]); ov[2] = _mm256_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm256_unpackhi_epi8( iv[2], iv[3]); iv[0] = _mm256_unpacklo_epi16(ov[0], ov[2]); iv[1] = _mm256_unpackhi_epi16(ov[0], ov[2]); iv[2] = _mm256_unpacklo_epi16(ov[1], ov[3]); iv[3] = _mm256_unpackhi_epi16(ov[1], ov[3]); - ov[0] = _mm256_permute2x128_si256(iv[0], iv[1], (2 << 4) | 0); - ov[1] = _mm256_permute2x128_si256(iv[2], iv[3], (2 << 4) | 0); - ov[2] = _mm256_permute2x128_si256(iv[0], iv[1], (3 << 4) | 1); - ov[3] = _mm256_permute2x128_si256(iv[2], iv[3], (3 << 4) | 1); - _mm256_storeu_si256((__m256i *) op, ov[0]); - _mm256_storeu_si256((__m256i *)(op+32), ov[1]); - _mm256_storeu_si256((__m256i *)(op+64), ov[2]); - _mm256_storeu_si256((__m256i *)(op+96), ov[3]); + ov[0] = _mm256_permute2x128_si256(iv[0], iv[1], (2 << 4) | 0); + ov[1] = _mm256_permute2x128_si256(iv[2], iv[3], (2 << 4) | 0); + ov[2] = _mm256_permute2x128_si256(iv[0], iv[1], (3 << 4) | 1); + ov[3] = _mm256_permute2x128_si256(iv[2], iv[3], (3 << 4) | 1); + _mm256_storeu_si256((__m256i *) op, ov[0]); + _mm256_storeu_si256((__m256i *)(op+32), ov[1]); + _mm256_storeu_si256((__m256i *)(op+64), ov[2]); + _mm256_storeu_si256((__m256i *)(op+96), ov[3]); #else ov[0] = _mm256_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm256_unpackhi_epi8(iv[0], iv[1]); ov[2] = _mm256_unpacklo_epi8(iv[2], iv[3]); ov[3] = _mm256_unpackhi_epi8(iv[2], iv[3]); @@ -785,22 +785,22 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o iv[6] = _mm256_permute4x64_epi64(_mm256_unpacklo_epi16(ov[5], ov[7]), _MM_SHUFFLE(3, 1, 2, 0)); iv[7] = _mm256_permute4x64_epi64(_mm256_unpackhi_epi16(ov[5], ov[7]), _MM_SHUFFLE(3, 1, 2, 0)); - ov[0] = _mm256_unpacklo_epi32(iv[0], iv[4]); - ov[1] = _mm256_unpacklo_epi32(iv[1], iv[5]); - ov[2] = _mm256_unpacklo_epi32(iv[2], iv[6]); - ov[3] = _mm256_unpacklo_epi32(iv[3], iv[7]); - ov[4] = _mm256_unpackhi_epi32(iv[0], iv[4]); - ov[5] = _mm256_unpackhi_epi32(iv[1], iv[5]); - ov[6] = _mm256_unpackhi_epi32(iv[2], iv[6]); - ov[7] = _mm256_unpackhi_epi32(iv[3], iv[7]); + ov[0] = _mm256_unpacklo_epi32(iv[0], iv[4]); + ov[1] = _mm256_unpacklo_epi32(iv[1], iv[5]); + ov[2] = _mm256_unpacklo_epi32(iv[2], iv[6]); + ov[3] = _mm256_unpacklo_epi32(iv[3], iv[7]); + ov[4] = _mm256_unpackhi_epi32(iv[0], iv[4]); + ov[5] = _mm256_unpackhi_epi32(iv[1], iv[5]); + ov[6] = _mm256_unpackhi_epi32(iv[2], iv[6]); + ov[7] = _mm256_unpackhi_epi32(iv[3], iv[7]); - ST256((__m256i *) op, ov[0] ); - ST256((__m256i *)(op+ 32), ov[1] ); - ST256((__m256i *)(op+ 64), ov[2] ); - ST256((__m256i *)(op+ 96), ov[3] ); - ST256((__m256i *)(op+128), ov[4] ); - ST256((__m256i *)(op+160), ov[5] ); - ST256((__m256i *)(op+192), ov[6] ); + ST256((__m256i *) op, ov[0] ); + ST256((__m256i *)(op+ 32), ov[1] ); + ST256((__m256i *)(op+ 64), ov[2] ); + ST256((__m256i *)(op+ 96), ov[3] ); + ST256((__m256i *)(op+128), ov[4] ); + ST256((__m256i *)(op+160), ov[5] ); + ST256((__m256i *)(op+192), ov[6] ); ST256((__m256i *)(op+224), ov[7] ); #endif } @@ -812,27 +812,27 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o #define ST0(_p_,_v_) _mm_storeu_si128((__m128i *)(_p_), _v_) void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned v = n&~(ESIZE*32-1); - unsigned stride = v/STRIDE; + unsigned v = n&~(ESIZE*32-1); + unsigned stride = v/STRIDE; unsigned char *op,*ip; #if defined(__SSE3__) || defined(__ARM_NEON) #if ESIZE == 2 - __m128i sv = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, + __m128i sv = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); #elif ESIZE == 4 - __m128i sv = _mm_set_epi8(15, 11, 7,3, - 14, 10, 6,2, + __m128i sv = _mm_set_epi8(15, 11, 7,3, + 14, 10, 6,2, 13, 9, 5,1, 12, 8, 4,0); #else - __m128i sv = _mm_set_epi8(15, 7, - 14, 6, - 13, 5, - 12, 4, - 11, 3, - 10, 2, - 9, 1, + __m128i sv = _mm_set_epi8(15, 7, + 14, 6, + 13, 5, + 12, 4, + 11, 3, + 10, 2, + 9, 1, 8, 0 ); #endif #endif @@ -853,28 +853,28 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o iv[0] = (__m128i)w.val[0]; iv[1] = (__m128i)w.val[1]; #endif #else - ov[0] = LD128(ip); ov[0] = _mm_shuffle_epi8(ov[0], sv); - ov[1] = LD128(ip+16); ov[1] = _mm_shuffle_epi8(ov[1], sv); + ov[0] = LD128(ip); ov[0] = _mm_shuffle_epi8(ov[0], sv); + ov[1] = LD128(ip+16); ov[1] = _mm_shuffle_epi8(ov[1], sv); iv[0] = _mm_unpacklo_epi64(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[1]); #if STRIDE <= ESIZE ST0(p,iv[0]); ST(p,iv[1],1); #endif #endif - + #elif ESIZE == 4 #ifdef __ARM_NEON - uint8x16x4_t w = vld4q_u8(ip); + uint8x16x4_t w = vld4q_u8(ip); #if STRIDE <= ESIZE - ST0(p,(__m128i)w.val[0]); ST(p,(__m128i)w.val[1],1); ST(p,(__m128i)w.val[2],2); ST(p,(__m128i)w.val[3],3); + ST0(p,(__m128i)w.val[0]); ST(p,(__m128i)w.val[1],1); ST(p,(__m128i)w.val[2],2); ST(p,(__m128i)w.val[3],3); #else iv[0] = (__m128i)w.val[0]; iv[1] = (__m128i)w.val[1]; iv[2] = (__m128i)w.val[2]; iv[3] = (__m128i)w.val[3]; #endif #else iv[0] = LD128(ip ); iv[0] = _mm_shuffle_epi8(iv[0], sv); - iv[1] = LD128(ip+16); iv[1] = _mm_shuffle_epi8(iv[1], sv); - iv[2] = LD128(ip+32); iv[2] = _mm_shuffle_epi8(iv[2], sv); - iv[3] = LD128(ip+48); iv[3] = _mm_shuffle_epi8(iv[3], sv); + iv[1] = LD128(ip+16); iv[1] = _mm_shuffle_epi8(iv[1], sv); + iv[2] = LD128(ip+32); iv[2] = _mm_shuffle_epi8(iv[2], sv); + iv[3] = LD128(ip+48); iv[3] = _mm_shuffle_epi8(iv[3], sv); ov[0] = _mm_unpacklo_epi32(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[1]); ov[2] = _mm_unpacklo_epi32(iv[2], iv[3]); ov[3] = _mm_unpackhi_epi32(iv[2], iv[3]); @@ -885,34 +885,34 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3); #endif #endif - + #elif ESIZE == 8 #ifdef __ARM_NEON #define vzipl_u16(_a_,_b_) vzip_u16(vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_))) #define vziph_u16(_a_,_b_) vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_))) //#define VQ #ifndef VQ - uint16x4x2_t v16[8]; - uint32x2x2_t v32[8]; + uint16x4x2_t v16[8]; + uint32x2x2_t v32[8]; #else - uint8x16x2_t v8[4]; - uint16x8x2_t v16[4]; - uint32x4x2_t v32[4]; //uint64x2x2_t v64[4]; + uint8x16x2_t v8[4]; + uint16x8x2_t v16[4]; + uint32x4x2_t v32[4]; //uint64x2x2_t v64[4]; #endif - #ifdef VQ - ov[0] = LD128(ip ); //ov[0] = _mm_shuffle_epi8(ov[0], sv); - ov[1] = LD128(ip+ 16); //ov[1] = _mm_shuffle_epi8(ov[1], sv); - ov[2] = LD128(ip+ 32); //ov[2] = _mm_shuffle_epi8(ov[2], sv); - ov[3] = LD128(ip+ 48); //ov[3] = _mm_shuffle_epi8(ov[3], sv); - ov[4] = LD128(ip+ 64); //ov[4] = _mm_shuffle_epi8(ov[4], sv); - ov[5] = LD128(ip+ 80); //ov[5] = _mm_shuffle_epi8(ov[5], sv); - ov[6] = LD128(ip+ 96); //ov[6] = _mm_shuffle_epi8(ov[6], sv); - ov[7] = LD128(ip+112); //ov[7] = _mm_shuffle_epi8(ov[7], sv); - - v8[0] = vzipq_u8((uint8x16_t)ov[0], (uint8x16_t)ov[1]); - v8[1] = vzipq_u8((uint8x16_t)ov[2], (uint8x16_t)ov[3]); - v8[2] = vzipq_u8((uint8x16_t)ov[4], (uint8x16_t)ov[5]); - v8[3] = vzipq_u8((uint8x16_t)ov[6], (uint8x16_t)ov[7]); + #ifdef VQ + ov[0] = LD128(ip ); //ov[0] = _mm_shuffle_epi8(ov[0], sv); + ov[1] = LD128(ip+ 16); //ov[1] = _mm_shuffle_epi8(ov[1], sv); + ov[2] = LD128(ip+ 32); //ov[2] = _mm_shuffle_epi8(ov[2], sv); + ov[3] = LD128(ip+ 48); //ov[3] = _mm_shuffle_epi8(ov[3], sv); + ov[4] = LD128(ip+ 64); //ov[4] = _mm_shuffle_epi8(ov[4], sv); + ov[5] = LD128(ip+ 80); //ov[5] = _mm_shuffle_epi8(ov[5], sv); + ov[6] = LD128(ip+ 96); //ov[6] = _mm_shuffle_epi8(ov[6], sv); + ov[7] = LD128(ip+112); //ov[7] = _mm_shuffle_epi8(ov[7], sv); + + v8[0] = vzipq_u8((uint8x16_t)ov[0], (uint8x16_t)ov[1]); + v8[1] = vzipq_u8((uint8x16_t)ov[2], (uint8x16_t)ov[3]); + v8[2] = vzipq_u8((uint8x16_t)ov[4], (uint8x16_t)ov[5]); + v8[3] = vzipq_u8((uint8x16_t)ov[6], (uint8x16_t)ov[7]); /* v16[0] = vzipq_u16((uint16x8_t)ov[0], (uint16x8_t)ov[1]); v16[1] = vzipq_u16((uint16x8_t)ov[2], (uint16x8_t)ov[3]); @@ -933,16 +933,16 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o iv[4] = _mm_unpacklo_epi64(v32[1].val[0], v32[3].val[0]); iv[5] = _mm_unpackhi_epi64(v32[1].val[0], v32[3].val[0]); iv[6] = _mm_unpacklo_epi64(v32[1].val[1], v32[3].val[1]); iv[7] = _mm_unpackhi_epi64(v32[1].val[1], v32[3].val[1]); #else - ov[0] = LD128(ip ); ov[0] = _mm_shuffle_epi8(ov[0], sv); - ov[1] = LD128(ip+ 16); ov[1] = _mm_shuffle_epi8(ov[1], sv); - ov[2] = LD128(ip+ 32); ov[2] = _mm_shuffle_epi8(ov[2], sv); - ov[3] = LD128(ip+ 48); ov[3] = _mm_shuffle_epi8(ov[3], sv); - ov[4] = LD128(ip+ 64); ov[4] = _mm_shuffle_epi8(ov[4], sv); - ov[5] = LD128(ip+ 80); ov[5] = _mm_shuffle_epi8(ov[5], sv); - ov[6] = LD128(ip+ 96); ov[6] = _mm_shuffle_epi8(ov[6], sv); - ov[7] = LD128(ip+112); ov[7] = _mm_shuffle_epi8(ov[7], sv); - v16[0] = vzipl_u16(ov[0], ov[1]); v16[1] = vziph_u16(ov[0], ov[1]); - v16[2] = vzipl_u16(ov[2], ov[3]); v16[3] = vziph_u16(ov[2], ov[3]); + ov[0] = LD128(ip ); ov[0] = _mm_shuffle_epi8(ov[0], sv); + ov[1] = LD128(ip+ 16); ov[1] = _mm_shuffle_epi8(ov[1], sv); + ov[2] = LD128(ip+ 32); ov[2] = _mm_shuffle_epi8(ov[2], sv); + ov[3] = LD128(ip+ 48); ov[3] = _mm_shuffle_epi8(ov[3], sv); + ov[4] = LD128(ip+ 64); ov[4] = _mm_shuffle_epi8(ov[4], sv); + ov[5] = LD128(ip+ 80); ov[5] = _mm_shuffle_epi8(ov[5], sv); + ov[6] = LD128(ip+ 96); ov[6] = _mm_shuffle_epi8(ov[6], sv); + ov[7] = LD128(ip+112); ov[7] = _mm_shuffle_epi8(ov[7], sv); + v16[0] = vzipl_u16(ov[0], ov[1]); v16[1] = vziph_u16(ov[0], ov[1]); + v16[2] = vzipl_u16(ov[2], ov[3]); v16[3] = vziph_u16(ov[2], ov[3]); v16[4] = vzipl_u16(ov[4], ov[5]); v16[5] = vziph_u16(ov[4], ov[5]); v16[6] = vzipl_u16(ov[6], ov[7]); v16[7] = vziph_u16(ov[6], ov[7]); @@ -969,21 +969,21 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3); ST(p,iv[4],4); ST(p,iv[5],5); ST(p,iv[6],6); ST(p,iv[7],7); #endif #else // SSE - ov[0] = LD128(ip ); ov[0] = _mm_shuffle_epi8(ov[0], sv); - ov[1] = LD128(ip+16); ov[1] = _mm_shuffle_epi8(ov[1], sv); - ov[2] = LD128(ip+32); ov[2] = _mm_shuffle_epi8(ov[2], sv); - ov[3] = LD128(ip+48); ov[3] = _mm_shuffle_epi8(ov[3], sv); - - iv[0] = _mm_unpacklo_epi16(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi16(ov[0], ov[1]); - iv[2] = _mm_unpacklo_epi16(ov[2], ov[3]); iv[3] = _mm_unpackhi_epi16(ov[2], ov[3]); - + ov[0] = LD128(ip ); ov[0] = _mm_shuffle_epi8(ov[0], sv); + ov[1] = LD128(ip+16); ov[1] = _mm_shuffle_epi8(ov[1], sv); + ov[2] = LD128(ip+32); ov[2] = _mm_shuffle_epi8(ov[2], sv); + ov[3] = LD128(ip+48); ov[3] = _mm_shuffle_epi8(ov[3], sv); + + iv[0] = _mm_unpacklo_epi16(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi16(ov[0], ov[1]); + iv[2] = _mm_unpacklo_epi16(ov[2], ov[3]); iv[3] = _mm_unpackhi_epi16(ov[2], ov[3]); + ov[0] = _mm_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[2]); ov[2] = _mm_unpacklo_epi32(iv[1], iv[3]); ov[3] = _mm_unpackhi_epi32(iv[1], iv[3]); - - ov[4] = LD128(ip+ 64); ov[4] = _mm_shuffle_epi8(ov[4], sv); - ov[5] = LD128(ip+ 80); ov[5] = _mm_shuffle_epi8(ov[5], sv); - ov[6] = LD128(ip+ 96); ov[6] = _mm_shuffle_epi8(ov[6], sv); - ov[7] = LD128(ip+112); ov[7] = _mm_shuffle_epi8(ov[7], sv); + + ov[4] = LD128(ip+ 64); ov[4] = _mm_shuffle_epi8(ov[4], sv); + ov[5] = LD128(ip+ 80); ov[5] = _mm_shuffle_epi8(ov[5], sv); + ov[6] = LD128(ip+ 96); ov[6] = _mm_shuffle_epi8(ov[6], sv); + ov[7] = LD128(ip+112); ov[7] = _mm_shuffle_epi8(ov[7], sv); iv[4] = _mm_unpacklo_epi16(ov[4], ov[5]); iv[5] = _mm_unpackhi_epi16(ov[4], ov[5]); iv[6] = _mm_unpacklo_epi16(ov[6], ov[7]); iv[7] = _mm_unpackhi_epi16(ov[6], ov[7]); @@ -993,27 +993,27 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o iv[0] = _mm_unpacklo_epi64(ov[0], ov[4]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[4]); iv[2] = _mm_unpacklo_epi64(ov[1], ov[5]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[5]); - + iv[4] = _mm_unpacklo_epi64(ov[2], ov[6]); iv[5] = _mm_unpackhi_epi64(ov[2], ov[6]); iv[6] = _mm_unpacklo_epi64(ov[3], ov[7]); iv[7] = _mm_unpackhi_epi64(ov[3], ov[7]); #if STRIDE <= ESIZE ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3); ST(p,iv[4],4); ST(p,iv[5],5); ST(p,iv[6],6); ST(p,iv[7],7); #endif #endif - #endif - + #endif + #elif defined(__SSE2__) #if ESIZE == 2 - iv[0] = LD128(ip ); iv[1] = LD128(ip+16)); - - ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]); + iv[0] = LD128(ip ); iv[1] = LD128(ip+16)); + + ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]); iv[0] = _mm_unpacklo_epi8(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8(ov[0], ov[1]); - + ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]); iv[0] = _mm_unpacklo_epi8(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8(ov[0], ov[1]); ST0(p,iv[0]); ST(p,iv[1],1); #elif ESIZE == 4 - iv[0] = LD128(ip ); iv[1] = LD128(ip+16); iv[2] = LD128(ip+32); iv[3] = LD128(ip+48); + iv[0] = LD128(ip ); iv[1] = LD128(ip+16); iv[2] = LD128(ip+32); iv[3] = LD128(ip+48); ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]); iv[0] = _mm_unpacklo_epi8( ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8( ov[0], ov[1]); @@ -1024,28 +1024,28 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]); iv[2] = _mm_unpacklo_epi8( ov[2], ov[3]); iv[3] = _mm_unpackhi_epi8( ov[2], ov[3]); ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]); - + iv[2] = _mm_unpacklo_epi64(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[3]); ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3); #elif ESIZE == 8 - iv[0] = LD128(ip ); iv[1] = LD128(ip+16); iv[2] = LD128(ip+32); iv[3] = LD128(ip+48); - iv[4] = LD128(ip+64); iv[5] = LD128(ip+80); iv[6] = LD128(ip+96); iv[7] = LD128(ip+112); + iv[0] = LD128(ip ); iv[1] = LD128(ip+16); iv[2] = LD128(ip+32); iv[3] = LD128(ip+48); + iv[4] = LD128(ip+64); iv[5] = LD128(ip+80); iv[6] = LD128(ip+96); iv[7] = LD128(ip+112); - ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]); - ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]); + ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]); + ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]); ov[4] = _mm_unpacklo_epi8( iv[4], iv[5]); ov[5] = _mm_unpackhi_epi8( iv[4], iv[5]); ov[6] = _mm_unpacklo_epi8( iv[6], iv[7]); ov[7] = _mm_unpackhi_epi8( iv[6], iv[7]); - iv[0] = _mm_unpacklo_epi8( ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8( ov[0], ov[1]); - iv[2] = _mm_unpacklo_epi8( ov[2], ov[3]); iv[3] = _mm_unpackhi_epi8( ov[2], ov[3]); + iv[0] = _mm_unpacklo_epi8( ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8( ov[0], ov[1]); + iv[2] = _mm_unpacklo_epi8( ov[2], ov[3]); iv[3] = _mm_unpackhi_epi8( ov[2], ov[3]); iv[4] = _mm_unpacklo_epi8( ov[4], ov[5]); iv[5] = _mm_unpackhi_epi8( ov[4], ov[5]); iv[6] = _mm_unpacklo_epi8( ov[6], ov[7]); iv[7] = _mm_unpackhi_epi8( ov[6], ov[7]); - + ov[0] = _mm_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[2]); ov[2] = _mm_unpacklo_epi32(iv[1], iv[3]); ov[3] = _mm_unpackhi_epi32(iv[1], iv[3]); ov[4] = _mm_unpacklo_epi32(iv[4], iv[6]); ov[5] = _mm_unpackhi_epi32(iv[4], iv[6]); ov[6] = _mm_unpacklo_epi32(iv[5], iv[7]); ov[7] = _mm_unpackhi_epi32(iv[5], iv[7]); - ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3); + ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3); iv[0] = _mm_unpacklo_epi64(ov[0], ov[4]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[4]); iv[2] = _mm_unpacklo_epi64(ov[1], ov[5]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[5]); @@ -1059,140 +1059,140 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o #define STL(_p_,_v_,_i_) _mm_storel_epi64((__m128i *)SIE(_p_,_i_), _v_) #define STL0(_p_,_v_) _mm_storel_epi64((__m128i *)(_p_), _v_) - ov[0] = _mm_and_si128(iv[0], cl); ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2)); - ov[1] = _mm_srli_epi16(_mm_and_si128(iv[0], ch),4); ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2)); - ov[2] = _mm_and_si128(iv[1], cl); ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2)); - ov[3] = _mm_srli_epi16(_mm_and_si128(iv[1], ch),4); ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2)); + ov[0] = _mm_and_si128(iv[0], cl); ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2)); + ov[1] = _mm_srli_epi16(_mm_and_si128(iv[0], ch),4); ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2)); + ov[2] = _mm_and_si128(iv[1], cl); ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2)); + ov[3] = _mm_srli_epi16(_mm_and_si128(iv[1], ch),4); ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2)); STL0(p,ov[0]); STL(p,ov[1],1);STL(p,ov[2],2);STL(p,ov[3],3); #if ESIZE > 2 - ov[0] = _mm_and_si128(iv[2], cl); ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2)); - ov[1] = _mm_srli_epi16(_mm_and_si128(iv[2], ch),4); ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2)); - ov[2] = _mm_and_si128(iv[3], cl); ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2)); - ov[3] = _mm_srli_epi16(_mm_and_si128(iv[3], ch),4); ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2)); + ov[0] = _mm_and_si128(iv[2], cl); ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2)); + ov[1] = _mm_srli_epi16(_mm_and_si128(iv[2], ch),4); ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2)); + ov[2] = _mm_and_si128(iv[3], cl); ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2)); + ov[3] = _mm_srli_epi16(_mm_and_si128(iv[3], ch),4); ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2)); STL(p,ov[0],4); STL(p,ov[1],5);STL(p,ov[2],6);STL(p,ov[3],7); #if ESIZE > 4 - ov[0] = _mm_and_si128(iv[4], cl); ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2)); - ov[1] = _mm_srli_epi16(_mm_and_si128(iv[4], ch),4); ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2)); - ov[2] = _mm_and_si128(iv[5], cl); ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2)); - ov[3] = _mm_srli_epi16(_mm_and_si128(iv[5], ch),4); ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2)); + ov[0] = _mm_and_si128(iv[4], cl); ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2)); + ov[1] = _mm_srli_epi16(_mm_and_si128(iv[4], ch),4); ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2)); + ov[2] = _mm_and_si128(iv[5], cl); ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2)); + ov[3] = _mm_srli_epi16(_mm_and_si128(iv[5], ch),4); ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2)); STL(p,ov[0],8); STL(p,ov[1],9);STL(p,ov[2],10);STL(p,ov[3],11); - - ov[4] = _mm_and_si128(iv[6], cl); ov[4] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[4],4), ov[4]),cb); ov[4] = _mm_packus_epi16(ov[4], _mm_srli_si128(ov[4],2)); - ov[5] = _mm_srli_epi16(_mm_and_si128(iv[6], ch),4); ov[5] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[5],4), ov[5]),cb); ov[5] = _mm_packus_epi16(ov[5], _mm_srli_si128(ov[5],2)); - ov[6] = _mm_and_si128(iv[7], cl); ov[6] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[6],4), ov[6]),cb); ov[6] = _mm_packus_epi16(ov[6], _mm_srli_si128(ov[6],2)); - ov[7] = _mm_srli_epi16(_mm_and_si128(iv[7], ch),4); ov[7] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[7],4), ov[7]),cb); ov[7] = _mm_packus_epi16(ov[7], _mm_srli_si128(ov[7],2)); + + ov[4] = _mm_and_si128(iv[6], cl); ov[4] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[4],4), ov[4]),cb); ov[4] = _mm_packus_epi16(ov[4], _mm_srli_si128(ov[4],2)); + ov[5] = _mm_srli_epi16(_mm_and_si128(iv[6], ch),4); ov[5] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[5],4), ov[5]),cb); ov[5] = _mm_packus_epi16(ov[5], _mm_srli_si128(ov[5],2)); + ov[6] = _mm_and_si128(iv[7], cl); ov[6] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[6],4), ov[6]),cb); ov[6] = _mm_packus_epi16(ov[6], _mm_srli_si128(ov[6],2)); + ov[7] = _mm_srli_epi16(_mm_and_si128(iv[7], ch),4); ov[7] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[7],4), ov[7]),cb); ov[7] = _mm_packus_epi16(ov[7], _mm_srli_si128(ov[7],2)); STL(p,ov[4],12); STL(p,ov[5],13);STL(p,ov[6],14);STL(p,ov[7],15); #endif - #endif - #endif + #endif + #endif } TEMPLATE2(tpenc,ESIZE)(in+v, n-v, out+v); } void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned v = n&~(ESIZE*32-1); - unsigned stride = v/STRIDE; - unsigned char *op,*ip; - + unsigned v = n&~(ESIZE*32-1); + unsigned stride = v/STRIDE; + unsigned char *op,*ip; + #if STRIDE > ESIZE __m128i cl = _mm_set1_epi8(0x0f), ch=_mm_set1_epi8(0xf0), cb = _mm_set1_epi16(0xff); #endif - for(op = out,ip = in; op != out+v; op+=ESIZE*16,ip += ESIZE*16/STRIDE) { + for(op = out,ip = in; op != out+v; op+=ESIZE*16,ip += ESIZE*16/STRIDE) { unsigned char *p=ip; PREFETCH(ip+(ESIZE*16/STRIDE)*ESIZE,0); __m128i iv[ESIZE], ov[ESIZE]; #if STRIDE > ESIZE //------------ Nibble transpose ------------------- ov[0] = _mm_loadl_epi64((__m128i *) p ); - ov[1] = _mm_loadl_epi64((__m128i *)SID(p,1)); + ov[1] = _mm_loadl_epi64((__m128i *)SID(p,1)); ov[2] = _mm_loadl_epi64((__m128i *)SID(p,2)); ov[3] = _mm_loadl_epi64((__m128i *)SID(p,3)); ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl); // 0,1->0 - ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl); - iv[0] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); - + ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl); + iv[0] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); + ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl); // 2,3->1 - ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl); - iv[1] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); + ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl); + iv[1] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); #if ESIZE > 2 - ov[0] = _mm_loadl_epi64((__m128i *)SID(p,4)); - ov[1] = _mm_loadl_epi64((__m128i *)SID(p,5)); - ov[2] = _mm_loadl_epi64((__m128i *)SID(p,6)); - ov[3] = _mm_loadl_epi64((__m128i *)SID(p,7)); + ov[0] = _mm_loadl_epi64((__m128i *)SID(p,4)); + ov[1] = _mm_loadl_epi64((__m128i *)SID(p,5)); + ov[2] = _mm_loadl_epi64((__m128i *)SID(p,6)); + ov[3] = _mm_loadl_epi64((__m128i *)SID(p,7)); - ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl); // 0,1->2 - ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl); - iv[2] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); + ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl); // 0,1->2 + ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl); + iv[2] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); - ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl); // 2,3->3 - ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl); - iv[3] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); + ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl); // 2,3->3 + ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl); + iv[3] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); #endif #if ESIZE > 4 - ov[0] = _mm_loadl_epi64((__m128i *)SID(p,8)); - ov[1] = _mm_loadl_epi64((__m128i *)SID(p,9)); - ov[2] = _mm_loadl_epi64((__m128i *)SID(p,10)); - ov[3] = _mm_loadl_epi64((__m128i *)SID(p,11)); + ov[0] = _mm_loadl_epi64((__m128i *)SID(p,8)); + ov[1] = _mm_loadl_epi64((__m128i *)SID(p,9)); + ov[2] = _mm_loadl_epi64((__m128i *)SID(p,10)); + ov[3] = _mm_loadl_epi64((__m128i *)SID(p,11)); - ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl); // 0,1->4 - ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl); - iv[4] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); + ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl); // 0,1->4 + ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl); + iv[4] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); - ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl); // 2,3->5 - ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); - ov[3] = _mm_and_si128(ov[3], cl); - iv[5] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); + ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl); // 2,3->5 + ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); + ov[3] = _mm_and_si128(ov[3], cl); + iv[5] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); - ov[0] = _mm_loadl_epi64((__m128i *)SID(p,12)); - ov[1] = _mm_loadl_epi64((__m128i *)SID(p,13)); - ov[2] = _mm_loadl_epi64((__m128i *)SID(p,14)); - ov[3] = _mm_loadl_epi64((__m128i *)SID(p,15)); + ov[0] = _mm_loadl_epi64((__m128i *)SID(p,12)); + ov[1] = _mm_loadl_epi64((__m128i *)SID(p,13)); + ov[2] = _mm_loadl_epi64((__m128i *)SID(p,14)); + ov[3] = _mm_loadl_epi64((__m128i *)SID(p,15)); - ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl); // 0,1->6 - ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl); - iv[6] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); + ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl); // 0,1->6 + ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl); + iv[6] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); - ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl); // 2,3->7 - ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl); - iv[7] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); - #endif + ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl); // 2,3->7 + ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl); + iv[7] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); + #endif #else // --------------------------- Byte transpose ------------------- - iv[0] = _mm_loadu_si128((__m128i *) p ); - iv[1] = _mm_loadu_si128((__m128i *)SID(p,1)); + iv[0] = _mm_loadu_si128((__m128i *) p ); + iv[1] = _mm_loadu_si128((__m128i *)SID(p,1)); #if ESIZE > 2 - iv[2] = _mm_loadu_si128((__m128i *)SID(p,2)); - iv[3] = _mm_loadu_si128((__m128i *)SID(p,3)); + iv[2] = _mm_loadu_si128((__m128i *)SID(p,2)); + iv[3] = _mm_loadu_si128((__m128i *)SID(p,3)); #if ESIZE > 4 - iv[4] = _mm_loadu_si128((__m128i *)SID(p,4)); - iv[5] = _mm_loadu_si128((__m128i *)SID(p,5)); - iv[6] = _mm_loadu_si128((__m128i *)SID(p,6)); - iv[7] = _mm_loadu_si128((__m128i *)SID(p,7)); + iv[4] = _mm_loadu_si128((__m128i *)SID(p,4)); + iv[5] = _mm_loadu_si128((__m128i *)SID(p,5)); + iv[6] = _mm_loadu_si128((__m128i *)SID(p,6)); + iv[7] = _mm_loadu_si128((__m128i *)SID(p,7)); #endif #endif #endif #if ESIZE == 2 #ifdef __ARM_NEON - uint8x16x2_t w; w.val[0] = (uint8x16_t)iv[0]; + uint8x16x2_t w; w.val[0] = (uint8x16_t)iv[0]; w.val[1] = (uint8x16_t)iv[1]; vst2q_u8(op, w); #else - ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);//i(0,1)->o(0,1) + ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);//i(0,1)->o(0,1) ST128(op, ov[0]); ST128(op+16, ov[1]); #endif #elif ESIZE == 4 #ifdef __ARM_NEON - uint8x16x4_t w; w.val[0] = (uint8x16_t)iv[0]; + uint8x16x4_t w; w.val[0] = (uint8x16_t)iv[0]; w.val[1] = (uint8x16_t)iv[1]; - w.val[2] = (uint8x16_t)iv[2]; + w.val[2] = (uint8x16_t)iv[2]; w.val[3] = (uint8x16_t)iv[3]; vst4q_u8(op,w); #else ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]); //i(0,1)->o(0,1) ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8(iv[2], iv[3]); //i(2,3)->o(2,3) - + iv[0] = _mm_unpacklo_epi16(ov[0], ov[2]); iv[1] = _mm_unpackhi_epi16(ov[0], ov[2]);//o(0,2)->i(0,1) iv[2] = _mm_unpacklo_epi16(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi16(ov[1], ov[3]);//o(1,3)->i(2,3) - ST128(op, iv[0]); ST128(op+16,iv[1]); ST128(op+32,iv[2]); ST128(op+48,iv[3]); - #endif + ST128(op, iv[0]); ST128(op+16,iv[1]); ST128(op+32,iv[2]); ST128(op+48,iv[3]); + #endif #else ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]);//i(0,1)->o(0,1) ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]);//i(2,3)->o(2,3) @@ -1202,15 +1202,15 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o iv[0] = _mm_unpacklo_epi16(ov[0], ov[2]); iv[1] = _mm_unpackhi_epi16(ov[0], ov[2]); iv[2] = _mm_unpacklo_epi16(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi16(ov[1], ov[3]); iv[4] = _mm_unpacklo_epi16(ov[4], ov[6]); iv[5] = _mm_unpackhi_epi16(ov[4], ov[6]); - iv[6] = _mm_unpacklo_epi16(ov[5], ov[7]); iv[7] = _mm_unpackhi_epi16(ov[5], ov[7]); - - ov[0] = _mm_unpacklo_epi32(iv[0], iv[4]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[4]); - ov[2] = _mm_unpacklo_epi32(iv[1], iv[5]); ov[3] = _mm_unpackhi_epi32(iv[1], iv[5]); - ov[4] = _mm_unpacklo_epi32(iv[2], iv[6]); ov[5] = _mm_unpackhi_epi32(iv[2], iv[6]); + iv[6] = _mm_unpacklo_epi16(ov[5], ov[7]); iv[7] = _mm_unpackhi_epi16(ov[5], ov[7]); + + ov[0] = _mm_unpacklo_epi32(iv[0], iv[4]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[4]); + ov[2] = _mm_unpacklo_epi32(iv[1], iv[5]); ov[3] = _mm_unpackhi_epi32(iv[1], iv[5]); + ov[4] = _mm_unpacklo_epi32(iv[2], iv[6]); ov[5] = _mm_unpackhi_epi32(iv[2], iv[6]); ov[6] = _mm_unpacklo_epi32(iv[3], iv[7]); ov[7] = _mm_unpackhi_epi32(iv[3], iv[7]); - ST128(op, ov[0]); ST128(op+16,ov[1]); ST128(op+32,ov[2]); ST128(op+48, ov[3]); - ST128(op+64,ov[4]); ST128(op+80,ov[5]); ST128(op+96,ov[6]); ST128(op+112,ov[7]); + ST128(op, ov[0]); ST128(op+16,ov[1]); ST128(op+32,ov[2]); ST128(op+48, ov[3]); + ST128(op+64,ov[4]); ST128(op+80,ov[5]); ST128(op+96,ov[6]); ST128(op+112,ov[7]); #endif } TEMPLATE2(tpdec,ESIZE)(in+v, n-v, out+v); @@ -1218,6 +1218,6 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o #endif #endif - + #endif #endif