/**
    Copyright (C) powturbo 2013-2018
    GPL v2 License

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    - homepage : https://sites.google.com/site/powturbo/
    - github   : https://github.com/powturbo
    - twitter  : https://twitter.com/powturbo
    - email    : powturbo [_AT_] gmail [_DOT_] com
**/
// Nibble/Byte transpose
#ifndef ESIZE
#include <stdint.h>
  #ifdef __AVX2__
#include <immintrin.h>
  #elif defined(__SSE4_1__)
#include <smmintrin.h>
  #elif defined(__SSSE3__)
#include <tmmintrin.h>
  #elif defined(__SSE2__)
#include <emmintrin.h>
  #endif

#pragma warning(disable: 4005)            // MSVC: suppress macro-redefinition warnings (ESIZE/STRIDE below)
#include "conf.h"
#include "transpose.h"

#define PREFETCH(_ip_) __builtin_prefetch(_ip_+512,0)
//#define PREFETCH(ip)
#define powof2(n) !((n)&((n)-1))

#define TPENC tpenc
#define TPDEC tpdec

#define ESIZE 3
#define STRIDE ESIZE
#include "transpose.c"
#undef ESIZE

#define ESIZE 16
#define STRIDE ESIZE
#include "transpose.c"
#undef ESIZE

#define ESIZE 2
#define STRIDE ESIZE
#define LD128(ip)   _mm_loadu_si128(ip)
#define ST128(op,v) _mm_storeu_si128(op,v)
#define TPENC128V   tpenc128v
#define TPDEC128V   tpdec128v
#define LD256(ip)   _mm256_loadu_si256(ip)
#define ST256(op,v) _mm256_storeu_si256(op,v)
#define TPENC256V   tpenc256v
#define TPDEC256V   tpdec256v
#include "transpose.c"

#undef STRIDE
#define STRIDE 4                          // Nibble transpose: STRIDE = 2*ESIZE
#define TPENC128V tp4enc128v
#define TPDEC128V tp4dec128v
#define TPENC256V tp4enc256v
#define TPDEC256V tp4dec256v
#include "transpose.c"
#undef ESIZE

#define ESIZE 4
#define STRIDE ESIZE
#define TPENC128V tpenc128v
#define TPDEC128V tpdec128v
#define TPENC256V tpenc256v
#define TPDEC256V tpdec256v
#include "transpose.c"

#undef STRIDE
#define STRIDE 8
#define TPENC128V tp4enc128v
#define TPDEC128V tp4dec128v
#define TPENC256V tp4enc256v
#define TPDEC256V tp4dec256v
#include "transpose.c"
#undef ESIZE
#undef STRIDE

#define ESIZE 8
#define STRIDE ESIZE
#define TPENC128V tpenc128v
#define TPDEC128V tpdec128v
#define TPENC256V tpenc256v
#define TPDEC256V tpdec256v
#include "transpose.c"

#undef STRIDE
#define STRIDE 16
#define TPENC128V tp4enc128v
#define TPDEC128V tp4dec128v
#define TPENC256V tp4enc256v
#define TPDEC256V tp4dec256v
#include "transpose.c"
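/* Template instantiation: this file re-#includes itself ("X-macro" style).
   Each block above defines ESIZE (element size in bytes), STRIDE (ESIZE for
   byte transpose, 2*ESIZE for nibble transpose) and the function-name macros,
   then re-enters transpose.c so the #else branch below emits one specialized
   encoder/decoder pair per configuration. TEMPLATE2 comes from conf.h; a
   minimal sketch of the expected token-pasting behavior:

     #define TEMPLATE2_(x, y) x##y
     #define TEMPLATE2(x, y)  TEMPLATE2_(x, y)  // TEMPLATE2(TPENC, ESIZE) -> tpenc4 when ESIZE == 4
*/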
//--------------------- CPU detection -------------------------------------------
  #if (_MSC_VER >= 1300) || defined(__INTEL_COMPILER)
#include <intrin.h>
  #endif

  #if !defined(SSE2_ON) && !defined(AVX2_ON)
static inline void cpuid(int reg[4], int id) {
  #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
  __cpuidex(reg, id, 0);
  #elif defined(__i386__) || defined(__x86_64__)
  __asm("cpuid" : "=a"(reg[0]),"=b"(reg[1]),"=c"(reg[2]),"=d"(reg[3]) : "a"(id),"c"(0) : );
  #endif
}

static inline uint64_t xgetbv(int ctr) {
  #if(defined _MSC_VER && (_MSC_FULL_VER >= 160040219) || defined __INTEL_COMPILER)
  return _xgetbv(ctr);
  #elif defined(__i386__) || defined(__x86_64__)
  unsigned a, d;
  __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
  return (uint64_t)d << 32 | a;
  #else
  unsigned a = 0, d = 0;
  return (uint64_t)d << 32 | a;
  #endif
}

static int _cpuiset;

int cpuini(int cpuiset) {
  if(cpuiset) _cpuiset = cpuiset;
  return _cpuiset;
}

char *cpustr(int cpuiset) {
       if(_cpuiset >= 52) return "avx2";
  else if(_cpuiset >= 41) return "sse4.1";
  else if(_cpuiset >= 31) return "ssse3";
  else if(_cpuiset >= 20) return "sse2";
  else                    return "none";
}

int cpuiset(void) {
  int c[4] = {0};
  if(_cpuiset) return _cpuiset;
  _cpuiset++;
  cpuid(c, 0);
  if(c[0]) {
    cpuid(c, 1);
    if(c[3] & (1 << 25)) { _cpuiset = 10;        // SSE
    if(c[3] & (1 << 26)) { _cpuiset = 20;        // SSE2
    if(c[2] & (1 <<  0)) { _cpuiset = 30;        // SSE3
    if(c[2] & (1 <<  9)) { _cpuiset = 31;        // SSSE3
    if(c[2] & (1 << 19)) { _cpuiset = 40;        // SSE4.1
    if(c[2] & (1 << 23)) { _cpuiset = 41;        // +popcount
    if(c[2] & (1 << 20)) { _cpuiset = 42;        // SSE4.2
    if((c[2] & (1 << 28)) &&                     // AVX
       (c[2] & (1 << 27)) &&                     // OSXSAVE
       (c[2] & (1 << 26)) &&                     // XSAVE
       (xgetbv(0) & 6) == 6) { _cpuiset = 50;    // AVX: OS saves YMM state
      if(c[2] & (1 << 25)) _cpuiset = 51;        // +AES
      cpuid(c, 7);
      if(c[1] & (1 << 5))  _cpuiset = 52;        // AVX2
    }}}}}}}}
  }
  return _cpuiset;
}
//---------------------------------------------------------------------------------
typedef void (*TPFUNC)(unsigned char *in, unsigned n, unsigned char *out);

//                        0  1  2       3       4       5  6  7  8                             16
static TPFUNC _tpe[]  = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 };
static TPFUNC _tpd[]  = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 };
  #ifdef USE_SSE
static TPFUNC _tp4e[] = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // Nibble
static TPFUNC _tp4d[] = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 };
  #endif

static int tpset;

void tpini(int id) {
  int i;
  if(tpset) return;
  tpset++;
  i = id ? id : cpuiset();
    #if defined(USE_AVX2)
  if(i >= 52) {
    _tpe[2] = tpenc256v2; _tpd[2] = tpdec256v2; _tp4e[2] = tp4enc256v2; _tp4d[2] = tp4dec256v2;
    _tpe[4] = tpenc256v4; _tpd[4] = tpdec256v4; _tp4e[4] = tp4enc256v4; _tp4d[4] = tp4dec256v4;
    _tpe[8] = tpenc256v8; _tpd[8] = tpdec256v8; _tp4e[8] = tp4enc256v8; _tp4d[8] = tp4dec256v8;
  } else
    #endif
    #ifdef USE_SSE
  if(i >= 20) {
    _tpe[2] = tpenc128v2; _tpd[2] = tpdec128v2; _tp4e[2] = tp4enc128v2; _tp4d[2] = tp4dec128v2;
    _tpe[4] = tpenc128v4; _tpd[4] = tpdec128v4; _tp4e[4] = tp4enc128v4; _tp4d[4] = tp4dec128v4;
    _tpe[8] = tpenc128v8; _tpd[8] = tpdec128v8; _tp4e[8] = tp4enc128v8; _tp4d[8] = tp4dec128v8;
  }
    #endif
  ;
}

void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
  TPFUNC f;
  if(!tpset) tpini(0);
  if(esize <= 16 && (f = _tpe[esize])) f(in, n, out);
  else {                                          // generic element size
    unsigned i, stride = n/esize;
    unsigned char *op, *ip;
    for(ip = in, op = out; ip < in+stride*esize; op++)
      for(i = 0; i < esize; i++)
        op[i*stride] = *ip++;
    for(op = out+esize*stride; ip < in+n;)        // remainder: copy as-is
      *op++ = *ip++;
  }
}

void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
  TPFUNC f;
  if(!tpset) tpini(0);
  if(esize <= 16 && (f = _tpd[esize])) f(in, n, out);
  else {
    unsigned i, stride = n/esize;
    unsigned char *op, *ip;
    for(op = out, ip = in; op < out+stride*esize; ip++)
      for(i = 0; i < esize; i++)
        *op++ = ip[i*stride];
    for(ip = in+esize*stride; op < out+n;)
      *op++ = *ip++;
  }
}

  #ifdef USE_SSE
void tp4enc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
  TPFUNC f;
  if(!tpset) tpini(0);
  if(esize <= 16 && (f = _tp4e[esize])) f(in, n, out);
  else tpenc(in, n, out, esize);
}

void tp4dec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
  TPFUNC f;
  if(!tpset) tpini(0);
  if(esize <= 16 && (f = _tp4d[esize])) f(in, n, out);
  else tpdec(in, n, out, esize);
}
  #endif
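/* Usage sketch (illustrative, not part of the library; buffer names are made
   up). Buffers hold n = count*esize bytes:

     #include "transpose.h"

     unsigned char in[256], tmp[256], out[256];
     // ... fill in[] with 64 little-endian 32-bit values ...
     tpenc(in,  sizeof(in), tmp, 4);   // 4-byte elements -> 4 byte planes
     tpdec(tmp, sizeof(in), out, 4);   // exact inverse; out[] == in[]

   tpini(0) runs lazily on first use and installs the fastest kernels reported
   by cpuiset(); per the dispatch above, tpini(20) would force the SSE path
   and tpini(52) the AVX2 path. */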
  #endif // !SSE2_ON && !AVX2_ON

#else //--------------- ESIZE defined: template body, one instantiation per configuration ---------

  #if !defined(SSE2_ON) && !defined(AVX2_ON)     // scalar (non-SIMD) transpose
    #if STRIDE == ESIZE
void TEMPLATE2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
  unsigned char *op, *ip, *e;
  unsigned stride = n/STRIDE;
    #if powof2(ESIZE)
  e = in+(n&~(ESIZE-1));
    #else
  e = in+stride*ESIZE;
    #endif
  for(ip = in, op = out; ip < e; op++, ip += ESIZE) {
    unsigned char *p = op;
    p[0] = ip[0];
    *(p += stride) = ip[1];
      #if ESIZE > 2
    *(p += stride) = ip[2];
        #if ESIZE > 3
    *(p += stride) = ip[3];
          #if ESIZE > 4
    *(p += stride) = ip[4];
    *(p += stride) = ip[5];
    *(p += stride) = ip[6];
    *(p += stride) = ip[7];
            #if ESIZE > 8
    *(p += stride) = ip[ 8];
    *(p += stride) = ip[ 9];
    *(p += stride) = ip[10];
    *(p += stride) = ip[11];
    *(p += stride) = ip[12];
    *(p += stride) = ip[13];
    *(p += stride) = ip[14];
    *(p += stride) = ip[15];
            #endif
          #endif
        #endif
      #endif
  }
  for(op = out+stride*ESIZE; ip < in+n;) *op++ = *ip++;
}

void TEMPLATE2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
  unsigned char *op, *ip, *e;
  unsigned stride = n/STRIDE;
    #if powof2(ESIZE)
  e = out+(n&~(ESIZE-1));
    #else
  e = out+stride*ESIZE;
    #endif
  for(op = out, ip = in; op < e; ip++, op += ESIZE) {
    unsigned char *p = ip;
    op[0] = *p;
    op[1] = *(p += stride);
      #if ESIZE > 2
    op[2] = *(p += stride);
        #if ESIZE > 3
    op[3] = *(p += stride);
          #if ESIZE > 4
    op[4] = *(p += stride);
    op[5] = *(p += stride);
    op[6] = *(p += stride);
    op[7] = *(p += stride);
            #if ESIZE > 8
    op[ 8] = *(p += stride);
    op[ 9] = *(p += stride);
    op[10] = *(p += stride);
    op[11] = *(p += stride);
    op[12] = *(p += stride);
    op[13] = *(p += stride);
    op[14] = *(p += stride);
    op[15] = *(p += stride);
            #endif
          #endif
        #endif
      #endif
  }
  for(ip = in+stride*ESIZE; op < out+n;) *op++ = *ip++;
}
    #endif // STRIDE == ESIZE
  #endif   // !SSE2_ON && !AVX2_ON
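/* Layout illustration (assuming n = 12, ESIZE = 3, so stride = 4):

     in : x0 y0 z0 | x1 y1 z1 | x2 y2 z2 | x3 y3 z3   (AoS: 4 elements of 3 bytes)
     out: x0 x1 x2 x3 | y0 y1 y2 y3 | z0 z1 z2 z3     (SoA: 3 byte planes of stride 4)

   Grouping bytes of equal significance makes the planes far more compressible
   for slowly varying values; TPDEC is the exact inverse. Tail bytes (n not a
   multiple of ESIZE) are copied through unchanged. */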
  #if ESIZE == 2 || ESIZE == 4 || ESIZE == 8
    #if defined(__SSE2__) && defined(SSE2_ON)
void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
  unsigned v = n&~(ESIZE*16-1);                  // bytes handled by the SIMD loop
  unsigned stride = v/STRIDE;
  unsigned char *op, *ip;
    #ifdef __SSSE3__
      #if ESIZE == 2
  __m128i sv = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                            14, 12, 10, 8, 6, 4, 2, 0);
      #elif ESIZE == 4
  __m128i sv = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2,
                            13,  9, 5, 1, 12,  8, 4, 0);
      #else
  __m128i sv = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4,
                            11, 3, 10, 2,  9, 1,  8, 0);
      #endif
    #endif
    #if STRIDE > ESIZE
  __m128i cl = _mm_set1_epi8(0x0f), ch = _mm_set1_epi8(0xf0), cb = _mm_set1_epi16(0xff);
    #endif
  for(ip = in, op = out; ip != in+v; op += ESIZE*16/STRIDE) {
    unsigned char *p = op;
    __m128i iv[ESIZE], ov[ESIZE];
    #ifdef __SSSE3__
      #if ESIZE == 2
    ov[0] = LD128((__m128i *) ip    ); ov[0] = _mm_shuffle_epi8(ov[0], sv);
    ov[1] = LD128((__m128i *)(ip+16)); ov[1] = _mm_shuffle_epi8(ov[1], sv);
    ip += 32;                                                    PREFETCH(ip);
    iv[0] = _mm_unpacklo_epi64(ov[0], ov[1]);
    iv[1] = _mm_unpackhi_epi64(ov[0], ov[1]);
      #elif ESIZE == 4
    iv[0] = LD128((__m128i *) ip    ); iv[0] = _mm_shuffle_epi8(iv[0], sv);
    iv[1] = LD128((__m128i *)(ip+16)); iv[1] = _mm_shuffle_epi8(iv[1], sv);
    iv[2] = LD128((__m128i *)(ip+32)); iv[2] = _mm_shuffle_epi8(iv[2], sv);
    iv[3] = LD128((__m128i *)(ip+48)); iv[3] = _mm_shuffle_epi8(iv[3], sv);
    ip += 64;                                                    PREFETCH(ip);
    ov[0] = _mm_unpacklo_epi32(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[1]);
    ov[2] = _mm_unpacklo_epi32(iv[2], iv[3]); ov[3] = _mm_unpackhi_epi32(iv[2], iv[3]);
    iv[0] = _mm_unpacklo_epi64(ov[0], ov[2]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[2]);
    iv[2] = _mm_unpacklo_epi64(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[3]);
      #elif ESIZE == 8
    ov[0] = LD128((__m128i *) ip    ); ov[0] = _mm_shuffle_epi8(ov[0], sv);
    ov[1] = LD128((__m128i *)(ip+16)); ov[1] = _mm_shuffle_epi8(ov[1], sv);
    ov[2] = LD128((__m128i *)(ip+32)); ov[2] = _mm_shuffle_epi8(ov[2], sv);
    ov[3] = LD128((__m128i *)(ip+48)); ov[3] = _mm_shuffle_epi8(ov[3], sv);
    ip += 64;
    iv[0] = _mm_unpacklo_epi16(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi16(ov[0], ov[1]);
    iv[2] = _mm_unpacklo_epi16(ov[2], ov[3]); iv[3] = _mm_unpackhi_epi16(ov[2], ov[3]);
    ov[0] = _mm_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[2]);
    ov[2] = _mm_unpacklo_epi32(iv[1], iv[3]); ov[3] = _mm_unpackhi_epi32(iv[1], iv[3]);
    ov[4] = LD128((__m128i *) ip    ); ov[4] = _mm_shuffle_epi8(ov[4], sv);
    ov[5] = LD128((__m128i *)(ip+16)); ov[5] = _mm_shuffle_epi8(ov[5], sv);
    ov[6] = LD128((__m128i *)(ip+32)); ov[6] = _mm_shuffle_epi8(ov[6], sv);
    ov[7] = LD128((__m128i *)(ip+48)); ov[7] = _mm_shuffle_epi8(ov[7], sv);
    ip += 64;                                                    PREFETCH(ip);
    iv[4] = _mm_unpacklo_epi16(ov[4], ov[5]); iv[5] = _mm_unpackhi_epi16(ov[4], ov[5]);
    iv[6] = _mm_unpacklo_epi16(ov[6], ov[7]); iv[7] = _mm_unpackhi_epi16(ov[6], ov[7]);
    ov[4] = _mm_unpacklo_epi32(iv[4], iv[6]); ov[5] = _mm_unpackhi_epi32(iv[4], iv[6]);
    ov[6] = _mm_unpacklo_epi32(iv[5], iv[7]); ov[7] = _mm_unpackhi_epi32(iv[5], iv[7]);
    iv[0] = _mm_unpacklo_epi64(ov[0], ov[4]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[4]);
    iv[2] = _mm_unpacklo_epi64(ov[1], ov[5]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[5]);
    iv[4] = _mm_unpacklo_epi64(ov[2], ov[6]); iv[5] = _mm_unpackhi_epi64(ov[2], ov[6]);
    iv[6] = _mm_unpacklo_epi64(ov[3], ov[7]); iv[7] = _mm_unpackhi_epi64(ov[3], ov[7]);
      #endif
    #elif defined(__SSE2__)
      #if ESIZE == 2
    iv[0] = LD128((__m128i *)ip); ip += 16;
    iv[1] = LD128((__m128i *)ip); ip += 16;                      PREFETCH(ip);
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
    iv[0] = _mm_unpacklo_epi8(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8(ov[0], ov[1]);
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
    iv[0] = _mm_unpacklo_epi8(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8(ov[0], ov[1]);
      #elif ESIZE == 4
    iv[0] = LD128((__m128i *) ip    );
    iv[1] = LD128((__m128i *)(ip+16));
    iv[2] = LD128((__m128i *)(ip+32));
    iv[3] = LD128((__m128i *)(ip+48));
    ip += 64;                                                    PREFETCH(ip);
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
    iv[0] = _mm_unpacklo_epi8(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8(ov[0], ov[1]);
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
    ov[2] = _mm_unpacklo_epi8(iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8(iv[2], iv[3]);
    iv[2] = _mm_unpacklo_epi8(ov[2], ov[3]); iv[3] = _mm_unpackhi_epi8(ov[2], ov[3]);
    ov[2] = _mm_unpacklo_epi8(iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8(iv[2], iv[3]);
    iv[0] = _mm_unpacklo_epi64(ov[0], ov[2]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[2]);
    iv[2] = _mm_unpacklo_epi64(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[3]);
      #elif ESIZE == 8
    iv[0] = LD128((__m128i *) ip     );
    iv[1] = LD128((__m128i *)(ip+ 16));
    iv[2] = LD128((__m128i *)(ip+ 32));
    iv[3] = LD128((__m128i *)(ip+ 48));
    iv[4] = LD128((__m128i *)(ip+ 64));
    iv[5] = LD128((__m128i *)(ip+ 80));
    iv[6] = LD128((__m128i *)(ip+ 96));
    iv[7] = LD128((__m128i *)(ip+112));
    ip += 128;                                                   PREFETCH(ip);
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
    ov[2] = _mm_unpacklo_epi8(iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8(iv[2], iv[3]);
    ov[4] = _mm_unpacklo_epi8(iv[4], iv[5]); ov[5] = _mm_unpackhi_epi8(iv[4], iv[5]);
    ov[6] = _mm_unpacklo_epi8(iv[6], iv[7]); ov[7] = _mm_unpackhi_epi8(iv[6], iv[7]);
    iv[0] = _mm_unpacklo_epi8(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8(ov[0], ov[1]);
    iv[2] = _mm_unpacklo_epi8(ov[2], ov[3]); iv[3] = _mm_unpackhi_epi8(ov[2], ov[3]);
    iv[4] = _mm_unpacklo_epi8(ov[4], ov[5]); iv[5] = _mm_unpackhi_epi8(ov[4], ov[5]);
    iv[6] = _mm_unpacklo_epi8(ov[6], ov[7]); iv[7] = _mm_unpackhi_epi8(ov[6], ov[7]);
    ov[0] = _mm_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[2]);
    ov[2] = _mm_unpacklo_epi32(iv[1], iv[3]); ov[3] = _mm_unpackhi_epi32(iv[1], iv[3]);
    ov[4] = _mm_unpacklo_epi32(iv[4], iv[6]); ov[5] = _mm_unpackhi_epi32(iv[4], iv[6]);
    ov[6] = _mm_unpacklo_epi32(iv[5], iv[7]); ov[7] = _mm_unpackhi_epi32(iv[5], iv[7]);
    iv[0] = _mm_unpacklo_epi64(ov[0], ov[4]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[4]);
    iv[2] = _mm_unpacklo_epi64(ov[1], ov[5]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[5]);
    iv[4] = _mm_unpacklo_epi64(ov[2], ov[6]); iv[5] = _mm_unpackhi_epi64(ov[2], ov[6]);
    iv[6] = _mm_unpacklo_epi64(ov[3], ov[7]); iv[7] = _mm_unpackhi_epi64(ov[3], ov[7]);
      #endif
    #endif
    #if STRIDE <= ESIZE                // byte transpose: store one 16-byte plane per register
    _mm_storeu_si128((__m128i *) p,          iv[0]);
    _mm_storeu_si128((__m128i *)(p+=stride), iv[1]);
      #if ESIZE > 2
    _mm_storeu_si128((__m128i *)(p+=stride), iv[2]);
    _mm_storeu_si128((__m128i *)(p+=stride), iv[3]);
        #if ESIZE > 4
    _mm_storeu_si128((__m128i *)(p+=stride), iv[4]);
    _mm_storeu_si128((__m128i *)(p+=stride), iv[5]);
    _mm_storeu_si128((__m128i *)(p+=stride), iv[6]);
    _mm_storeu_si128((__m128i *)(p+=stride), iv[7]);
        #endif
      #endif
    #else                              // Nibble: split each byte plane into low/high nibble planes
    ov[0] = _mm_and_si128(iv[0], cl);
    ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]), cb);
    ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2));
    ov[1] = _mm_srli_epi16(_mm_and_si128(iv[0], ch), 4);
    ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]), cb);
    ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2));
    ov[2] = _mm_and_si128(iv[1], cl);
    ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]), cb);
    ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2));
    ov[3] = _mm_srli_epi16(_mm_and_si128(iv[1], ch), 4);
    ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]), cb);
    ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2));
    _mm_storel_epi64((__m128i *) p,          ov[0]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
      #if ESIZE > 2
    ov[0] = _mm_and_si128(iv[2], cl);
    ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]), cb);
    ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2));
    ov[1] = _mm_srli_epi16(_mm_and_si128(iv[2], ch), 4);
    ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]), cb);
    ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2));
    ov[2] = _mm_and_si128(iv[3], cl);
    ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]), cb);
    ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2));
    ov[3] = _mm_srli_epi16(_mm_and_si128(iv[3], ch), 4);
    ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]), cb);
    ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2));
    _mm_storel_epi64((__m128i *)(p+=stride), ov[0]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
        #if ESIZE > 4
    ov[0] = _mm_and_si128(iv[4], cl);
    ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]), cb);
    ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2));
    ov[1] = _mm_srli_epi16(_mm_and_si128(iv[4], ch), 4);
    ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]), cb);
    ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2));
    ov[2] = _mm_and_si128(iv[5], cl);
    ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]), cb);
    ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2));
    ov[3] = _mm_srli_epi16(_mm_and_si128(iv[5], ch), 4);
    ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]), cb);
    ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2));
    _mm_storel_epi64((__m128i *)(p+=stride), ov[0]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
    ov[0] = _mm_and_si128(iv[6], cl);
    ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]), cb);
    ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128(ov[0],2));
    ov[1] = _mm_srli_epi16(_mm_and_si128(iv[6], ch), 4);
    ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]), cb);
    ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128(ov[1],2));
    ov[2] = _mm_and_si128(iv[7], cl);
    ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]), cb);
    ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128(ov[2],2));
    ov[3] = _mm_srli_epi16(_mm_and_si128(iv[7], ch), 4);
    ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]), cb);
    ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128(ov[3],2));
    _mm_storel_epi64((__m128i *)(p+=stride), ov[0]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
    _mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
        #endif
      #endif
    #endif
  }
  TEMPLATE2(tpenc, ESIZE)(in+v, n-v, out+v);     // scalar tail for the remaining n-v bytes
}
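/* Nibble packing used above (STRIDE == 2*ESIZE): every transposed byte plane
   iv[k] is split into a low-nibble plane (iv[k] & 0x0f) and a high-nibble
   plane ((iv[k] & 0xf0) >> 4), each then packed two nibbles per byte into an
   8-byte half-store. One 16-bit lane of the low-nibble side, in hex:

     masked lane   : 0b0a    (two nibble-bytes a, b)
     | (lane >> 4) : 0bba
     & 0x00ff      : 00ba    (byte ba = b<<4 | a)
     packus_epi16  : eight such bytes -> one _mm_storel_epi64

   Splitting at nibble granularity helps when each byte plane carries fewer
   than 8 significant bits. */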
void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
  unsigned v = n&~(ESIZE*16-1);
  unsigned stride = v/STRIDE;
  unsigned char *op, *ip;
    #if STRIDE > ESIZE
  __m128i cl = _mm_set1_epi8(0x0f), ch = _mm_set1_epi8(0xf0), cb = _mm_set1_epi16(0xff);
    #endif
  for(op = out, ip = in; op != out+v; ip += ESIZE*16/STRIDE) {
    unsigned char *p = ip;
    __m128i iv[ESIZE], ov[ESIZE];
    #if STRIDE > ESIZE                 // Nibble: merge 2 nibble planes back into each byte plane
    ov[0] = _mm_loadl_epi64((__m128i *) p);
    ov[1] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl);
    ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl);
    iv[0] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]);
    ov[2] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[3] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl);
    ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl);
    iv[1] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
      #if ESIZE > 2
    ov[0] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[1] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl);
    ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl);
    iv[2] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]);
    ov[2] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[3] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl);
    ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl);
    iv[3] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
      #endif
      #if ESIZE > 4
    ov[0] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[1] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl);
    ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl);
    iv[4] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]);
    ov[2] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[3] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl);
    ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl);
    iv[5] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
    ov[0] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[1] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); ov[0] = _mm_and_si128(ov[0], cl);
    ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); ov[1] = _mm_and_si128(ov[1], cl);
    iv[6] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]);
    ov[2] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[3] = _mm_loadl_epi64((__m128i *)(p+=stride));
    ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl);
    ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl);
    iv[7] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
      #endif
    #else
    iv[0] = _mm_loadu_si128((__m128i *) p);
    iv[1] = _mm_loadu_si128((__m128i *)(p+=stride));
      #if ESIZE > 2
    iv[2] = _mm_loadu_si128((__m128i *)(p+=stride));
    iv[3] = _mm_loadu_si128((__m128i *)(p+=stride));
        #if ESIZE > 4
    iv[4] = _mm_loadu_si128((__m128i *)(p+=stride));
    iv[5] = _mm_loadu_si128((__m128i *)(p+=stride));
    iv[6] = _mm_loadu_si128((__m128i *)(p+=stride));
    iv[7] = _mm_loadu_si128((__m128i *)(p+=stride));
        #endif
      #endif
    #endif
    PREFETCH(ip+(ESIZE*16/STRIDE));
    #if ESIZE == 2
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ST128((__m128i *) op,     ov[0]);
    ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]); ST128((__m128i *)(op+16), ov[1]);
    op += 32;
    #elif ESIZE == 4
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
    ov[2] = _mm_unpacklo_epi8(iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8(iv[2], iv[3]);
    iv[0] = _mm_unpacklo_epi16(ov[0], ov[2]); ST128((__m128i *) op,     iv[0]);
    iv[1] = _mm_unpackhi_epi16(ov[0], ov[2]); ST128((__m128i *)(op+16), iv[1]);
    iv[2] = _mm_unpacklo_epi16(ov[1], ov[3]); ST128((__m128i *)(op+32), iv[2]);
    iv[3] = _mm_unpackhi_epi16(ov[1], ov[3]); ST128((__m128i *)(op+48), iv[3]);
    op += 64;
    #else // ESIZE == 8
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
    ov[2] = _mm_unpacklo_epi8(iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8(iv[2], iv[3]);
    iv[0] = _mm_unpacklo_epi16(ov[0], ov[2]); iv[1] = _mm_unpackhi_epi16(ov[0], ov[2]);
    iv[2] = _mm_unpacklo_epi16(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi16(ov[1], ov[3]);
    ov[4] = _mm_unpacklo_epi8(iv[4], iv[5]); ov[5] = _mm_unpackhi_epi8(iv[4], iv[5]);
    ov[6] = _mm_unpacklo_epi8(iv[6], iv[7]); ov[7] = _mm_unpackhi_epi8(iv[6], iv[7]);
    iv[4] = _mm_unpacklo_epi16(ov[4], ov[6]); iv[5] = _mm_unpackhi_epi16(ov[4], ov[6]);
    iv[6] = _mm_unpacklo_epi16(ov[5], ov[7]); iv[7] = _mm_unpackhi_epi16(ov[5], ov[7]);
    ov[0] = _mm_unpacklo_epi32(iv[0], iv[4]); ST128((__m128i *) op,      ov[0]);
    ov[1] = _mm_unpackhi_epi32(iv[0], iv[4]); ST128((__m128i *)(op+ 16), ov[1]);
    ov[2] = _mm_unpacklo_epi32(iv[1], iv[5]); ST128((__m128i *)(op+ 32), ov[2]);
    ov[3] = _mm_unpackhi_epi32(iv[1], iv[5]); ST128((__m128i *)(op+ 48), ov[3]);
    ov[4] = _mm_unpacklo_epi32(iv[2], iv[6]); ST128((__m128i *)(op+ 64), ov[4]);
    ov[5] = _mm_unpackhi_epi32(iv[2], iv[6]); ST128((__m128i *)(op+ 80), ov[5]);
    ov[6] = _mm_unpacklo_epi32(iv[3], iv[7]); ST128((__m128i *)(op+ 96), ov[6]);
    ov[7] = _mm_unpackhi_epi32(iv[3], iv[7]); ST128((__m128i *)(op+112), ov[7]);
    op += 128;
    #endif
  }
  TEMPLATE2(tpdec, ESIZE)(in+v, n-v, out+v);
}
    #endif // __SSE2__ && SSE2_ON
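/* Both 128-bit kernels above are unpack (butterfly) networks: each round of
   punpcklo/punpckhi doubles the run of same-plane bytes, ending with one full
   16-byte plane per register; the decoder runs the same network in reverse.
   Sketch of the principle for 4 planes (bytes labeled plane.element):

     input  : A0 B0 C0 D0  A1 B1 C1 D1 ...
     round 1: A0 A1 B0 B1 ...            (epi8 unpack)
     round 2: A0 A1 A2 A3 ...            (epi16/epi32 unpack)

   On SSSE3 the pshufb pre-shuffle (sv) collapses the first rounds into one
   instruction, shortening the dependency chain. */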
    #if defined(__AVX2__) && defined(AVX2_ON)
void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
  unsigned v = n&~(ESIZE*32-1);
  unsigned stride = v/STRIDE;
  unsigned char *op, *ip;
    #if ESIZE == 2
  __m256i sv  = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
                                15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
  __m256i sv0 = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
                                15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
  __m256i sv1 = _mm256_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1,
                                14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1);
    #else
  __m256i pv = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0),
      #if ESIZE == 4
        /*sv0 = _mm256_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13,  9, 5, 1, 12,  8, 4, 0,
                                15, 11, 7, 3, 14, 10, 6, 2, 13,  9, 5, 1, 12,  8, 4, 0);*/
          sv0 = _mm256_set_epi8(15, 11, 7, 3, 13,  9, 5, 1, 14, 10, 6, 2, 12,  8, 4, 0,
                                15, 11, 7, 3, 13,  9, 5, 1, 14, 10, 6, 2, 12,  8, 4, 0),
          sv1 = _mm256_set_epi8(13,  9, 5, 1, 15, 11, 7, 3, 12,  8, 4, 0, 14, 10, 6, 2,
                                13,  9, 5, 1, 15, 11, 7, 3, 12,  8, 4, 0, 14, 10, 6, 2);
      #else
          sv  = _mm256_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0,
                                15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0),
          tv  = _mm256_set_epi8(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0,
                                15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0);
      #endif
    #endif
    #if STRIDE > ESIZE
  __m256i cl = _mm256_set1_epi8(0x0f), ch = _mm256_set1_epi8(0xf0), cb = _mm256_set1_epi16(0xff);
    #endif
  for(ip = in, op = out; ip != in+v; op += ESIZE*32/STRIDE, ip += ESIZE*32) {
    unsigned char *p = op;
    __m256i iv[ESIZE], ov[ESIZE];
    #if ESIZE == 2
      #if 0
    ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv);
    ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv);   PREFETCH(ip);
    iv[0] = _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(ov[0], ov[1]), _MM_SHUFFLE(3, 1, 2, 0));
    iv[1] = _mm256_permute4x64_epi64(_mm256_unpackhi_epi64(ov[0], ov[1]), _MM_SHUFFLE(3, 1, 2, 0));
      #else
    ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv0);
    ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);  PREFETCH(ip);
    iv[0] = _mm256_permute4x64_epi64(_mm256_blend_epi32(ov[0], ov[1], 0b11001100), _MM_SHUFFLE(3, 1, 2, 0));
    iv[1] = _mm256_blend_epi32(ov[0], ov[1], 0b00110011);
    iv[1] = _mm256_permute4x64_epi64(_mm256_shuffle_epi32(iv[1], _MM_SHUFFLE(1, 0, 3, 2)), _MM_SHUFFLE(3, 1, 2, 0));
      #endif
    #elif ESIZE == 4
    iv[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv0);
    iv[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);
    iv[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv0);
    iv[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv1);  PREFETCH(ip);
      #if 0
    ov[0] = _mm256_unpacklo_epi32(iv[0], iv[1]); ov[1] = _mm256_unpackhi_epi32(iv[0], iv[1]);
    ov[2] = _mm256_unpacklo_epi32(iv[2], iv[3]); ov[3] = _mm256_unpackhi_epi32(iv[2], iv[3]);
    iv[0] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[2]), pv);
    iv[1] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[0], ov[2]), pv);
    iv[2] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[1], ov[3]), pv);
    iv[3] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[3]), pv);
      #else
    ov[0] = _mm256_blend_epi32(iv[0], iv[1], 0b10101010);
    ov[1] = _mm256_shuffle_epi32(_mm256_blend_epi32(iv[0], iv[1], 0b01010101), _MM_SHUFFLE(2, 3, 0, 1));
    ov[2] = _mm256_blend_epi32(iv[2], iv[3], 0b10101010);
    ov[3] = _mm256_shuffle_epi32(_mm256_blend_epi32(iv[2], iv[3], 0b01010101), _MM_SHUFFLE(2, 3, 0, 1));
    iv[0] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[2]), pv);
    iv[1] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[0], ov[2]), pv);
    iv[2] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[1], ov[3]), pv);
    iv[3] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[3]), pv);
    //iv[0] = _mm256_permutevar8x32_epi32(_mm256_blend_epi32(ov[0], _mm256_shuffle_epi32(ov[2], _MM_SHUFFLE(1, 0, 3, 2)), 0b11001100), pv);
    //iv[1] = _mm256_permutevar8x32_epi32(_mm256_blend_epi32(_mm256_shuffle_epi32(ov[0], _MM_SHUFFLE(1, 0, 3, 2)), ov[2], 0b11001100), pv);
    //iv[2] = _mm256_permutevar8x32_epi32(_mm256_blend_epi32(ov[1], _mm256_shuffle_epi32(ov[3], _MM_SHUFFLE(1, 0, 3, 2)), 0b11001100), pv);
    //iv[3] = _mm256_permutevar8x32_epi32(_mm256_blend_epi32(_mm256_shuffle_epi32(ov[1], _MM_SHUFFLE(1, 0, 3, 2)), ov[3], 0b11001100), pv);
      #endif
    #else // ESIZE == 8
    ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv);
    ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv);
    ov[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv);
    ov[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv);
    iv[0] = _mm256_unpacklo_epi16(ov[0], ov[1]); iv[1] = _mm256_unpackhi_epi16(ov[0], ov[1]);
    iv[2] = _mm256_unpacklo_epi16(ov[2], ov[3]); iv[3] = _mm256_unpackhi_epi16(ov[2], ov[3]);
    ov[0] = _mm256_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm256_unpackhi_epi32(iv[0], iv[2]);
    ov[2] = _mm256_unpacklo_epi32(iv[1], iv[3]); ov[3] = _mm256_unpackhi_epi32(iv[1], iv[3]);
    ov[4] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+128)), sv);
    ov[5] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+160)), sv);
    ov[6] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+192)), sv);
    ov[7] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+224)), sv);  PREFETCH(ip);
    iv[4] = _mm256_unpacklo_epi16(ov[4], ov[5]); iv[5] = _mm256_unpackhi_epi16(ov[4], ov[5]);
    iv[6] = _mm256_unpacklo_epi16(ov[6], ov[7]); iv[7] = _mm256_unpackhi_epi16(ov[6], ov[7]);
    ov[4] = _mm256_unpacklo_epi32(iv[4], iv[6]); ov[5] = _mm256_unpackhi_epi32(iv[4], iv[6]);
    ov[6] = _mm256_unpacklo_epi32(iv[5], iv[7]); ov[7] = _mm256_unpackhi_epi32(iv[5], iv[7]);
    iv[0] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[4]), pv), tv);
    iv[1] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[0], ov[4]), pv), tv);
    iv[2] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[1], ov[5]), pv), tv);
    iv[3] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[5]), pv), tv);
    iv[4] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[2], ov[6]), pv), tv);
    iv[5] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[2], ov[6]), pv), tv);
    iv[6] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[3], ov[7]), pv), tv);
    iv[7] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[3], ov[7]), pv), tv);
    #endif
    #if STRIDE <= ESIZE
    _mm256_storeu_si256((__m256i *) p,          iv[0]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[1]);
      #if ESIZE > 2
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[2]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[3]);
        #if ESIZE > 4
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[4]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[5]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[6]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[7]);
        #endif
      #endif
    #else // Nibble Transpose
      #define mm256_packus_epi16(a, b) _mm256_permute4x64_epi64(_mm256_packus_epi16(a, b), _MM_SHUFFLE(3, 1, 2, 0))
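    /* Note: AVX2 _mm256_packus_epi16 packs within each 128-bit lane, so the
       wrapper above re-orders the four 64-bit quarters with
       _mm256_permute4x64_epi64(..., _MM_SHUFFLE(3,1,2,0)) to restore linear
       byte order before the low 128 bits are stored. */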
    ov[0] = _mm256_and_si256(iv[0], cl);
    ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]), cb);
    ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256(ov[0],2));
    ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[0], ch), 4);
    ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]), cb);
    ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256(ov[1],2));
    ov[2] = _mm256_and_si256(iv[1], cl);
    ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]), cb);
    ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256(ov[2],2));
    ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[1], ch), 4);
    ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]), cb);
    ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256(ov[3],2));
    _mm_storeu_si128((__m128i *) p,          _mm256_castsi256_si128(ov[0]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[1]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[2]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[3]));
      #if ESIZE > 2
    ov[0] = _mm256_and_si256(iv[2], cl);
    ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]), cb);
    ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256(ov[0],2));
    ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[2], ch), 4);
    ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]), cb);
    ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256(ov[1],2));
    ov[2] = _mm256_and_si256(iv[3], cl);
    ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]), cb);
    ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256(ov[2],2));
    ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[3], ch), 4);
    ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]), cb);
    ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256(ov[3],2));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[0]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[1]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[2]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[3]));
        #if ESIZE > 4
    ov[0] = _mm256_and_si256(iv[4], cl);
    ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]), cb);
    ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256(ov[0],2));
    ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[4], ch), 4);
    ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]), cb);
    ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256(ov[1],2));
    ov[2] = _mm256_and_si256(iv[5], cl);
    ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]), cb);
    ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256(ov[2],2));
    ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[5], ch), 4);
    ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]), cb);
    ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256(ov[3],2));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[0]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[1]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[2]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[3]));
    ov[0] = _mm256_and_si256(iv[6], cl);
    ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]), cb);
    ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256(ov[0],2));
    ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[6], ch), 4);
    ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]), cb);
    ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256(ov[1],2));
    ov[2] = _mm256_and_si256(iv[7], cl);
    ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]), cb);
    ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256(ov[2],2));
    ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[7], ch), 4);
    ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]), cb);
    ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256(ov[3],2));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[0]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[1]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[2]));
    _mm_storeu_si128((__m128i *)(p+=stride), _mm256_castsi256_si128(ov[3]));
        #endif
      #endif
    #endif
  }
  TEMPLATE2(tpenc128v, ESIZE)(in+v, n-v, out+v);
}

#define NBL0(x,y) ov[x] = _mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(p        ))), _MM_SHUFFLE(3, 1, 2, 0));\
                  ov[y] = _mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(p+=stride))), _MM_SHUFFLE(3, 1, 2, 0));
#define NBL(x,y)  ov[x] = _mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(p+=stride))), _MM_SHUFFLE(3, 1, 2, 0));\
                  ov[y] = _mm256_permute4x64_epi64(_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(p+=stride))), _MM_SHUFFLE(3, 1, 2, 0));
#define NB(x,y,_iv_) {\
  ov[x] = _mm256_and_si256(_mm256_unpacklo_epi8(ov[x], _mm256_srli_epi16(ov[x],4)), cl);\
  ov[y] = _mm256_and_si256(_mm256_unpacklo_epi8(ov[y], _mm256_srli_epi16(ov[y],4)), cl);\
  _iv_  = _mm256_or_si256(_mm256_slli_epi16(ov[y],4), ov[x]);\
}

void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
  unsigned v = n&~(ESIZE*32-1);
  unsigned stride = v/STRIDE;
  unsigned char *op, *ip;
    #if STRIDE > ESIZE
  __m256i cl = _mm256_set1_epi8(0x0f), ch = _mm256_set1_epi8(0xf0), cb = _mm256_set1_epi16(0xff);
    #endif
  for(op = out, ip = in; op != out+v; ip += ESIZE*32/STRIDE, op += ESIZE*32) {
    unsigned char *p = ip;
    __m256i iv[ESIZE], ov[ESIZE];
    #if STRIDE > ESIZE                 // Nibble
    NBL0(0,1); NBL(2,3); NB(0,1, iv[0]); NB(2,3, iv[1]);
      #if ESIZE > 2
    NBL(0,1); NBL(2,3); NB(0,1, iv[2]); NB(2,3, iv[3]);
        #if ESIZE > 4
    NBL(4,5); NBL(6,7); NB(4,5, iv[4]); NB(6,7, iv[5]);
    NBL(4,5); NBL(6,7); NB(4,5, iv[6]); NB(6,7, iv[7]);
        #endif
      #endif
    #else
    iv[0] = _mm256_loadu_si256((__m256i *) p);
    iv[1] = _mm256_loadu_si256((__m256i *)(p+=stride));
      #if ESIZE > 2
    iv[2] = _mm256_loadu_si256((__m256i *)(p+=stride));
    iv[3] = _mm256_loadu_si256((__m256i *)(p+=stride));
        #if ESIZE > 4
    iv[4] = _mm256_loadu_si256((__m256i *)(p+=stride));
    iv[5] = _mm256_loadu_si256((__m256i *)(p+=stride));
    iv[6] = _mm256_loadu_si256((__m256i *)(p+=stride));
    iv[7] = _mm256_loadu_si256((__m256i *)(p+=stride));
        #endif
      #endif
    #endif
    PREFETCH(ip+ESIZE*32/STRIDE);
    #if ESIZE == 2
    ov[0] = _mm256_permute4x64_epi64(iv[0], _MM_SHUFFLE(3, 1, 2, 0));
    ov[1] = _mm256_permute4x64_epi64(iv[1], _MM_SHUFFLE(3, 1, 2, 0));
    _mm256_storeu_si256((__m256i *) op,     _mm256_unpacklo_epi8(ov[0], ov[1]));
    _mm256_storeu_si256((__m256i *)(op+32), _mm256_unpackhi_epi8(ov[0], ov[1]));
    #elif ESIZE == 4
    ov[0] = _mm256_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm256_unpackhi_epi8(iv[0], iv[1]);
    ov[2] = _mm256_unpacklo_epi8(iv[2], iv[3]); ov[3] = _mm256_unpackhi_epi8(iv[2], iv[3]);
    iv[0] = _mm256_unpacklo_epi16(ov[0], ov[2]); iv[1] = _mm256_unpackhi_epi16(ov[0], ov[2]);
    iv[2] = _mm256_unpacklo_epi16(ov[1], ov[3]); iv[3] = _mm256_unpackhi_epi16(ov[1], ov[3]);
    ov[0] = _mm256_permute2x128_si256(iv[0], iv[1], (2 << 4) | 0);
    ov[1] = _mm256_permute2x128_si256(iv[2], iv[3], (2 << 4) | 0);
    ov[2] = _mm256_permute2x128_si256(iv[0], iv[1], (3 << 4) | 1);
    ov[3] = _mm256_permute2x128_si256(iv[2], iv[3], (3 << 4) | 1);
    _mm256_storeu_si256((__m256i *) op,     ov[0]);
    _mm256_storeu_si256((__m256i *)(op+32), ov[1]);
    _mm256_storeu_si256((__m256i *)(op+64), ov[2]);
    _mm256_storeu_si256((__m256i *)(op+96), ov[3]);
    #else // ESIZE == 8
    ov[0] = _mm256_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm256_unpackhi_epi8(iv[0], iv[1]);
    ov[2] = _mm256_unpacklo_epi8(iv[2], iv[3]); ov[3] = _mm256_unpackhi_epi8(iv[2], iv[3]);
    iv[0] = _mm256_permute4x64_epi64(_mm256_unpacklo_epi16(ov[0], ov[2]), _MM_SHUFFLE(3, 1, 2, 0));
    iv[1] = _mm256_permute4x64_epi64(_mm256_unpackhi_epi16(ov[0], ov[2]), _MM_SHUFFLE(3, 1, 2, 0));
    iv[2] = _mm256_permute4x64_epi64(_mm256_unpacklo_epi16(ov[1], ov[3]), _MM_SHUFFLE(3, 1, 2, 0));
    iv[3] = _mm256_permute4x64_epi64(_mm256_unpackhi_epi16(ov[1], ov[3]), _MM_SHUFFLE(3, 1, 2, 0));
    ov[4] = _mm256_unpacklo_epi8(iv[4], iv[5]); ov[5] = _mm256_unpackhi_epi8(iv[4], iv[5]);
    ov[6] = _mm256_unpacklo_epi8(iv[6], iv[7]); ov[7] = _mm256_unpackhi_epi8(iv[6], iv[7]);
    iv[4] = _mm256_permute4x64_epi64(_mm256_unpacklo_epi16(ov[4], ov[6]), _MM_SHUFFLE(3, 1, 2, 0));
    iv[5] = _mm256_permute4x64_epi64(_mm256_unpackhi_epi16(ov[4], ov[6]), _MM_SHUFFLE(3, 1, 2, 0));
    iv[6] = _mm256_permute4x64_epi64(_mm256_unpacklo_epi16(ov[5], ov[7]), _MM_SHUFFLE(3, 1, 2, 0));
    iv[7] = _mm256_permute4x64_epi64(_mm256_unpackhi_epi16(ov[5], ov[7]), _MM_SHUFFLE(3, 1, 2, 0));
    ov[0] = _mm256_unpacklo_epi32(iv[0], iv[4]); ov[1] = _mm256_unpacklo_epi32(iv[1], iv[5]);
    ov[2] = _mm256_unpacklo_epi32(iv[2], iv[6]); ov[3] = _mm256_unpacklo_epi32(iv[3], iv[7]);
    ov[4] = _mm256_unpackhi_epi32(iv[0], iv[4]); ov[5] = _mm256_unpackhi_epi32(iv[1], iv[5]);
    ov[6] = _mm256_unpackhi_epi32(iv[2], iv[6]); ov[7] = _mm256_unpackhi_epi32(iv[3], iv[7]);
    ST256((__m256i *) op,      ov[0]);
    ST256((__m256i *)(op+ 32), ov[1]);
    ST256((__m256i *)(op+ 64), ov[2]);
    ST256((__m256i *)(op+ 96), ov[3]);
    ST256((__m256i *)(op+128), ov[4]);
    ST256((__m256i *)(op+160), ov[5]);
    ST256((__m256i *)(op+192), ov[6]);
    ST256((__m256i *)(op+224), ov[7]);
    #endif
  }
  if(n-v) TEMPLATE2(tpdec128v, ESIZE)(in+v, n-v, out+v);   // 128-bit/scalar tail
}
    #endif // __AVX2__ && AVX2_ON
  #endif   // ESIZE == 2 || 4 || 8
#endif     // ESIZE
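/* Standalone smoke test (hypothetical file test_transpose.c, not part of the
   library). Assumes it is compiled and linked together with transpose.c:

     #include <stdio.h>
     #include <string.h>
     #include "transpose.h"

     int main(void) {
       unsigned char in[1024], tmp[1024], out[1024];
       unsigned esizes[] = { 2, 3, 4, 8, 16 };
       unsigned i, e;
       for(i = 0; i < sizeof(in); i++) in[i] = (unsigned char)(i*2654435761u >> 24);
       for(e = 0; e < 5; e++) {
         tpenc(in,  sizeof(in), tmp, esizes[e]);
         tpdec(tmp, sizeof(in), out, esizes[e]);
         printf("esize %u: %s\n", esizes[e], memcmp(in, out, sizeof(in)) ? "FAIL" : "ok");
       }
       return 0;
     }
*/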