Transform: Byte+Nibble Transpose/Shuffle
transpose.c (429 lines changed)
@@ -75,7 +75,7 @@
#define LD256(ip)   _mm256_loadu_si256(ip)
#define ST256(op,v) _mm256_storeu_si256(op,v)
#define TPENC256V tpenc256v
#define TPDEC256V tpdec256v
#include "transpose.c"
#undef STRIDE

@@ -83,7 +83,7 @@
#define TPENC128V tp4enc128v
#define TPDEC128V tp4dec128v
#define TPENC256V tp4enc256v
#define TPDEC256V tp4dec256v
#include "transpose.c"
#undef ESIZE

@@ -93,7 +93,7 @@
#define TPENC128V tpenc128v
#define TPDEC128V tpdec128v
#define TPENC256V tpenc256v
#define TPDEC256V tpdec256v
#include "transpose.c"
#undef STRIDE

@@ -101,7 +101,7 @@
#define TPENC128V tp4enc128v
#define TPDEC128V tp4dec128v
#define TPENC256V tp4enc256v
#define TPDEC256V tp4dec256v
#include "transpose.c"
#undef ESIZE
#undef STRIDE

@@ -112,7 +112,7 @@
#define TPENC128V tpenc128v
#define TPDEC128V tpdec128v
#define TPENC256V tpenc256v
#define TPDEC256V tpdec256v
#include "transpose.c"
#undef STRIDE

@@ -120,7 +120,7 @@
#define TPENC128V tp4enc128v
#define TPDEC128V tp4dec128v
#define TPENC256V tp4enc256v
#define TPDEC256V tp4dec256v
#include "transpose.c"
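These blocks instantiate one source file many times: transpose.c includes itself with different ESIZE/STRIDE settings, and TEMPLATE2 pastes the element size into each function name (tpenc2, tp4enc128v4, ...). A minimal sketch of the pattern, with a hypothetical file and function name:

    /* selfinc.c -- self-inclusion template sketch (hypothetical, for illustration). */
    #ifndef ESIZE
      #define TEMPLATE2_(f,e) f##e
      #define TEMPLATE2(f,e)  TEMPLATE2_(f,e)

      #define ESIZE 2
      #include "selfinc.c"        /* instantiates work2() */
      #undef  ESIZE
      #define ESIZE 4
      #include "selfinc.c"        /* instantiates work4() */
      #undef  ESIZE
    #else
    void TEMPLATE2(work, ESIZE)(unsigned char *in, unsigned char *out) {
        for(int i = 0; i < ESIZE; i++) out[i] = in[ESIZE-1-i];  /* toy body */
    }
    #endif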

//--------------------- CPU detection -------------------------------------------
@@ -186,12 +186,12 @@ int cpuiset(void) {
    if( c[2] & (1 << 20)) { _cpuiset = 42;
      if((c[2] & (1 << 28)) &&                  // AVX
         (c[2] & (1 << 27)) &&                  // OSXSAVE
         (c[2] & (1 << 26)) &&                  // XSAVE
         (xgetbv(0) & 6) == 6) { _cpuiset = 50; // AVX
        if(c[2] & (1 << 25)) _cpuiset = 51;     // +AES
        cpuid(c, 7);
        if(c[1] & (1 << 5)) _cpuiset = 52;      // AVX2
    }}}}}}}}}
#endif
  return _cpuiset;
}
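The AVX check needs three things: the CPU flag (CPUID.1 ECX bit 28), OSXSAVE (bit 27) so xgetbv may be executed, and XCR0 bits 1 and 2 confirming the OS saves XMM/YMM state. A standalone sketch of the same test, assuming the GCC/Clang builtins __get_cpuid and _xgetbv (an assumption; this file uses its own cpuid/xgetbv wrappers):

    #include <cpuid.h>      /* __get_cpuid (GCC/Clang) */
    #include <immintrin.h>  /* _xgetbv */

    /* Nonzero when AVX is usable: CPU flag set AND OS saves YMM state. */
    static int avx_usable(void) {
        unsigned a, b, c, d;
        if(!__get_cpuid(1, &a, &b, &c, &d)) return 0;
        if(!(c & (1u << 27))) return 0;      /* OSXSAVE: xgetbv allowed   */
        if(!(c & (1u << 28))) return 0;      /* AVX supported by the CPU  */
        return (_xgetbv(0) & 6) == 6;        /* XCR0: XMM(1) + YMM(2) set */
    }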

@@ -200,12 +200,12 @@ int cpuiset(void) {
typedef void (*TPFUNC)( unsigned char *in, unsigned n, unsigned char *out);

//                        0  1  2       3       4       5  6  7  8                             16
static TPFUNC _tpe[]  = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 };
static TPFUNC _tpd[]  = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 };

#ifdef USE_SSE
static TPFUNC _tp4e[] = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // Nibble
static TPFUNC _tp4d[] = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 };
#endif
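The tables are indexed by element size; a zero entry means no specialized kernel (only sizes 2, 3, 4, 8 and 16 are populated). A hedged sketch of the dispatch a caller effectively performs, with tp_encode as a hypothetical wrapper name:

    /* Hypothetical wrapper: use the size-specialized kernel when present,
       otherwise fall back to the generic scalar tpenc(). */
    static void tp_encode(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
        TPFUNC f = esize <= 16 ? _tpe[esize] : 0;
        if(f) f(in, n, out);
        else  tpenc(in, n, out, esize);
    }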

static int tpset;

@@ -227,7 +227,7 @@ void tpini(int id) {
    _tpe[2] = tpenc128v2; _tpd[2] = tpdec128v2; _tp4e[2] = tp4enc128v2; _tp4d[2] = tp4dec128v2;
    _tpe[4] = tpenc128v4; _tpd[4] = tpdec128v4; _tp4e[4] = tp4enc128v4; _tp4d[4] = tp4dec128v4;
    _tpe[8] = tpenc128v8; _tpd[8] = tpdec128v8; _tp4e[8] = tp4enc128v8; _tp4d[8] = tp4dec128v8;
-   if(i == 35) { /*_tpe[2] = tpenc2;*/ _tpd[8] = tpdec8; } // ARM NEON scalar is faster
+   if(i == 35) _tpd[8] = tpdec8;                           // ARM NEON scalar is faster
  }
#endif
}
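Typical call sequence, assuming the public entry points in this file (tpini once to bind CPU-specific kernels, then tpenc/tpdec as the transform pair):

    /* Round-trip sketch: byte-transpose 4-byte elements, then restore them. */
    void tp_roundtrip_demo(void) {
        unsigned char in[1024], tp[1024], out[1024];
        unsigned i;
        for(i = 0; i < sizeof(in); i++) in[i] = (unsigned char)i;
        tpini(0);                       /* detect CPU, install kernels       */
        tpenc(in, sizeof(in), tp,  4);  /* byte k of each element -> plane k */
        tpdec(tp, sizeof(in), out, 4);  /* inverse transform                 */
        /* in[] and out[] now compare equal */
    }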

@@ -241,7 +241,7 @@ void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
  unsigned char *op,*ip;
  for(ip = in,op = out; ip < in+stride*esize; op++)
    for(i = 0; i < esize; i++)
      op[i*stride] = *ip++;
  for(op = out + esize*stride; ip < in+n;)
    *op++ = *ip++;
}
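The loop scatters each element's bytes across esize planes of length stride = n/esize; tpdec below is the mirror-image gather, and the trailing loop copies the n % esize leftover bytes verbatim. A standalone toy run with esize = 4, n = 12:

    #include <stdio.h>

    /* The same scatter loop, concrete: 3 elements x 4 bytes -> 4 planes x 3 bytes. */
    int main(void) {
        unsigned char in[12] = { 'A','a','1','!', 'B','b','2','@', 'C','c','3','#' };
        unsigned char out[12];
        unsigned esize = 4, n = 12, stride = n/esize, i;
        unsigned char *ip = in, *op = out;
        for(; ip < in + stride*esize; op++)
            for(i = 0; i < esize; i++)
                op[i*stride] = *ip++;
        fwrite(out, 1, n, stdout);      /* prints "ABCabc123!@#" */
        putchar('\n');
        return 0;
    }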

@@ -255,10 +255,10 @@ void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
  unsigned i, stride = n/esize;
  unsigned char *op,*ip;
  for(op = out,ip = in; op < out+stride*esize; ip++)
    for(i = 0; i < esize; i++)
      *op++ = ip[i*stride];
  for(ip = in+esize*stride; op < out+n;)
    *op++ = *ip++;
}

@@ -348,39 +348,39 @@ void TEMPLATE2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)

#if powof2(ESIZE)
  e = in+(n&~(ESIZE-1));
#else
  e = in+stride*ESIZE;
#endif

  for(ip = in,op = out; ip < e; op++, ip+=ESIZE) { unsigned char *p = op;
    p[0]       = ip[ 0];
    *SIE(p, 1) = ip[ 1];
    #if ESIZE > 2
    *SIE(p, 2) = ip[ 2];
    #if ESIZE > 3
    *SIE(p, 3) = ip[ 3];
    #if ESIZE > 4
    uint32_t u = ctou32(p);
    *SIE(p, 4) = ip[ 4];
    *SIE(p, 5) = ip[ 5];
    *SIE(p, 6) = ip[ 6];
    *SIE(p, 7) = ip[ 7];
    #if ESIZE > 8
    *SIE(p, 8) = ip[ 8];
    *SIE(p, 9) = ip[ 9];
    *SIE(p,10) = ip[10];
    *SIE(p,11) = ip[11];
    *SIE(p,12) = ip[12];
    *SIE(p,13) = ip[13];
    *SIE(p,14) = ip[14];
    *SIE(p,15) = ip[15];
    #endif
    #endif
    #endif
    #endif
  }
  for(op = out+stride*ESIZE; ip < in+n;)
    *op++ = *ip++;
}
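SIE(p,i) and SID(p,i) address byte plane i at a fixed stride; the commented-out define later in this commit (//#define SIE(_p_,_i_) (_p_+ _i_*stride)) shows the intended expansion:

    /* Strided plane addressing, per the commented-out define in this file:
       plane i of the transposed layout begins i*stride bytes in. */
    #define SIE(_p_,_i_) ((_p_) + (_i_)*stride)   /* encoder output planes */
    #define SID(_p_,_i_) ((_p_) + (_i_)*stride)   /* decoder input planes  */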

void TEMPLATE2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
@@ -389,37 +389,37 @@ void TEMPLATE2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)

#if powof2(ESIZE)
  e = out+(n&~(ESIZE-1));
#else
  e = out+stride*ESIZE;
#endif
  for(op = out,ip = in; op < e; ip++,op+=ESIZE) { unsigned char *p = ip;
    op[ 0] = *p;
    op[ 1] = *SID(p,1);
    #if ESIZE > 2
    op[ 2] = *SID(p,2);
    #if ESIZE > 3
    op[ 3] = *SID(p,3);
    #if ESIZE > 4
    op[ 4] = *SID(p,4);
    op[ 5] = *SID(p,5);
    op[ 6] = *SID(p,6);
    op[ 7] = *SID(p,7);
    #if ESIZE > 8
    op[ 8] = *SID(p,8);
    op[ 9] = *SID(p,9);
    op[10] = *SID(p,10);
    op[11] = *SID(p,11);
    op[12] = *SID(p,12);
    op[13] = *SID(p,13);
    op[14] = *SID(p,14);
    op[15] = *SID(p,15);
    #endif
    #endif
    #endif
    #endif
  }
  for(ip = in+stride*ESIZE; op < out+n; )
    *op++ = *ip++;
}
#endif // STRIDE
#endif

@@ -437,54 +437,54 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
  unsigned char *op,*ip;

#if defined(__SSSE3__) || defined(__ARM_NEON)
  #if ESIZE == 2
  __m128i sv = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                            14, 12, 10, 8, 6, 4, 2, 0);
  #elif ESIZE == 4
  __m128i sv = _mm_set_epi8(15, 11, 7, 3,
                            14, 10, 6, 2,
                            13,  9, 5, 1,
                            12,  8, 4, 0);
  #else
  __m128i sv = _mm_set_epi8(15, 7,
                            14, 6,
                            13, 5,
                            12, 4,
                            11, 3,
                            10, 2,
                             9, 1,
                             8, 0);
  #endif
#endif
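Each sv mask de-interleaves one 16-byte vector so bytes of the same plane become contiguous. For ESIZE == 2, a standalone sketch of one step (SSSE3; helper name hypothetical):

    #include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

    /* 2-byte elements: gather low bytes into the low half, high bytes into
       the high half of the register. */
    static __m128i deinterleave16(__m128i v) {
        const __m128i sv = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                        14, 12, 10, 8, 6, 4, 2, 0);
        return _mm_shuffle_epi8(v, sv);  /* bytes 0,2,4,... then 1,3,5,... */
    }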

#if STRIDE > ESIZE
  __m128i cl = _mm_set1_epi8(0x0f), ch = _mm_set1_epi8(0xf0), cb = _mm_set1_epi16(0xff);
#endif

  for(ip = in, op = out; ip != in+v; ip+=ESIZE*16,op += ESIZE*16/STRIDE) { unsigned char *p = op; PREFETCH(ip+(ESIZE*16)*ESIZE,0);
    __m128i iv[ESIZE],ov[ESIZE];
#if defined(__SSSE3__) || defined(__ARM_NEON)
  #if ESIZE == 2
    #ifdef __ARM_NEON
    uint8x16x2_t w = vld2q_u8(ip);
      #if STRIDE <= ESIZE
      ST0(p,(__m128i)w.val[0]); ST(p,(__m128i)w.val[1],1);
      #else
      iv[0] = (__m128i)w.val[0]; iv[1] = (__m128i)w.val[1];
      #endif
    #else
    ov[0] = LD128(ip);    ov[0] = _mm_shuffle_epi8(ov[0], sv);
    ov[1] = LD128(ip+16); ov[1] = _mm_shuffle_epi8(ov[1], sv);

    iv[0] = _mm_unpacklo_epi64(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[1]);
      #if STRIDE <= ESIZE
      ST0(p,iv[0]); ST(p,iv[1],1);
      #endif
#endif
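On ARM the structure load does this by itself: vld2q_u8 de-interleaves while loading, which is why the NEON branch needs no shuffle constant. A standalone sketch (helper name hypothetical):

    #include <arm_neon.h>

    /* One 32-byte step for 2-byte elements: load de-interleaved, store planes. */
    static void step2_neon(const unsigned char *ip, unsigned char *lo, unsigned char *hi) {
        uint8x16x2_t w = vld2q_u8(ip);   /* val[0]=bytes 0,2,..., val[1]=bytes 1,3,... */
        vst1q_u8(lo, w.val[0]);
        vst1q_u8(hi, w.val[1]);
    }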

  #elif ESIZE == 4
    #ifdef __ARM_NEON
    uint8x16x4_t w = vld4q_u8(ip);
      #if STRIDE <= ESIZE
      ST0(p,(__m128i)w.val[0]); ST(p,(__m128i)w.val[1],1); ST(p,(__m128i)w.val[2],2); ST(p,(__m128i)w.val[3],3);
      #else
@@ -496,7 +496,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    iv[2] = LD128(ip+32); iv[2] = _mm_shuffle_epi8(iv[2], sv);
    iv[3] = LD128(ip+48); iv[3] = _mm_shuffle_epi8(iv[3], sv);

    ov[0] = _mm_unpacklo_epi32(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[1]);
    ov[2] = _mm_unpacklo_epi32(iv[2], iv[3]); ov[3] = _mm_unpackhi_epi32(iv[2], iv[3]);

    iv[0] = _mm_unpacklo_epi64(ov[0], ov[2]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[2]);
@@ -504,10 +504,10 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
      #if STRIDE <= ESIZE
      ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3);
      #endif
    #endif

  #elif ESIZE == 8
    #ifdef __ARM_NEON
    #define vzipl_u16(_a_,_b_) vzip_u16(vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_)))
    #define vziph_u16(_a_,_b_) vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_)))
    //#define VQ
@@ -529,7 +529,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    ov[6] = LD128(ip+ 96); //ov[6] = _mm_shuffle_epi8(ov[6], sv);
    ov[7] = LD128(ip+112); //ov[7] = _mm_shuffle_epi8(ov[7], sv);

    v8[0] = vzipq_u8((uint8x16_t)ov[0], (uint8x16_t)ov[1]);
    v8[1] = vzipq_u8((uint8x16_t)ov[2], (uint8x16_t)ov[3]);
    v8[2] = vzipq_u8((uint8x16_t)ov[4], (uint8x16_t)ov[5]);
    v8[3] = vzipq_u8((uint8x16_t)ov[6], (uint8x16_t)ov[7]);
@@ -548,11 +548,11 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    v32[2] = vzipq_u32(vreinterpretq_u32_u16(v16[1].val[0]), vreinterpretq_u32_u16(v16[3].val[0]));
    v32[3] = vzipq_u32(vreinterpretq_u32_u16(v16[1].val[1]), vreinterpretq_u32_u16(v16[3].val[1]));

    iv[0] = _mm_unpacklo_epi64(v32[0].val[0], v32[2].val[0]); iv[1] = _mm_unpackhi_epi64(v32[0].val[0], v32[2].val[0]);
    iv[2] = _mm_unpacklo_epi64(v32[0].val[1], v32[2].val[1]); iv[3] = _mm_unpackhi_epi64(v32[0].val[1], v32[2].val[1]);
    iv[4] = _mm_unpacklo_epi64(v32[1].val[0], v32[3].val[0]); iv[5] = _mm_unpackhi_epi64(v32[1].val[0], v32[3].val[0]);
    iv[6] = _mm_unpacklo_epi64(v32[1].val[1], v32[3].val[1]); iv[7] = _mm_unpackhi_epi64(v32[1].val[1], v32[3].val[1]);
    #else
    ov[0] = LD128(ip    ); ov[0] = _mm_shuffle_epi8(ov[0], sv);
    ov[1] = LD128(ip+ 16); ov[1] = _mm_shuffle_epi8(ov[1], sv);
    ov[2] = LD128(ip+ 32); ov[2] = _mm_shuffle_epi8(ov[2], sv);
@@ -561,9 +561,9 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    ov[5] = LD128(ip+ 80); ov[5] = _mm_shuffle_epi8(ov[5], sv);
    ov[6] = LD128(ip+ 96); ov[6] = _mm_shuffle_epi8(ov[6], sv);
    ov[7] = LD128(ip+112); ov[7] = _mm_shuffle_epi8(ov[7], sv);
    v16[0] = vzipl_u16(ov[0], ov[1]); v16[1] = vziph_u16(ov[0], ov[1]);
    v16[2] = vzipl_u16(ov[2], ov[3]); v16[3] = vziph_u16(ov[2], ov[3]);
    v16[4] = vzipl_u16(ov[4], ov[5]); v16[5] = vziph_u16(ov[4], ov[5]);
    v16[6] = vzipl_u16(ov[6], ov[7]); v16[7] = vziph_u16(ov[6], ov[7]);

    v32[0] = vzip_u32(vreinterpret_u32_u16(v16[0].val[0]), vreinterpret_u32_u16(v16[2].val[0]) );
@@ -588,13 +588,13 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
      #if STRIDE <= ESIZE
      ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3); ST(p,iv[4],4); ST(p,iv[5],5); ST(p,iv[6],6); ST(p,iv[7],7);
      #endif
    #else // SSE
    ov[0] = LD128(ip   ); ov[0] = _mm_shuffle_epi8(ov[0], sv);
    ov[1] = LD128(ip+16); ov[1] = _mm_shuffle_epi8(ov[1], sv);
    ov[2] = LD128(ip+32); ov[2] = _mm_shuffle_epi8(ov[2], sv);
    ov[3] = LD128(ip+48); ov[3] = _mm_shuffle_epi8(ov[3], sv);

    iv[0] = _mm_unpacklo_epi16(ov[0], ov[1]); iv[1] = _mm_unpackhi_epi16(ov[0], ov[1]);
    iv[2] = _mm_unpacklo_epi16(ov[2], ov[3]); iv[3] = _mm_unpackhi_epi16(ov[2], ov[3]);

    ov[0] = _mm_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[2]);
@@ -605,16 +605,16 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    ov[6] = LD128(ip+ 96); ov[6] = _mm_shuffle_epi8(ov[6], sv);
    ov[7] = LD128(ip+112); ov[7] = _mm_shuffle_epi8(ov[7], sv);

    iv[4] = _mm_unpacklo_epi16(ov[4], ov[5]); iv[5] = _mm_unpackhi_epi16(ov[4], ov[5]);
    iv[6] = _mm_unpacklo_epi16(ov[6], ov[7]); iv[7] = _mm_unpackhi_epi16(ov[6], ov[7]);

    ov[4] = _mm_unpacklo_epi32(iv[4], iv[6]); ov[5] = _mm_unpackhi_epi32(iv[4], iv[6]);
    ov[6] = _mm_unpacklo_epi32(iv[5], iv[7]); ov[7] = _mm_unpackhi_epi32(iv[5], iv[7]);

    iv[0] = _mm_unpacklo_epi64(ov[0], ov[4]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[4]);
    iv[2] = _mm_unpacklo_epi64(ov[1], ov[5]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[5]);

    iv[4] = _mm_unpacklo_epi64(ov[2], ov[6]); iv[5] = _mm_unpackhi_epi64(ov[2], ov[6]);
    iv[6] = _mm_unpacklo_epi64(ov[3], ov[7]); iv[7] = _mm_unpackhi_epi64(ov[3], ov[7]);
      #if STRIDE <= ESIZE
      ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3); ST(p,iv[4],4); ST(p,iv[5],5); ST(p,iv[6],6); ST(p,iv[7],7);
@@ -642,7 +642,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    iv[0] = _mm_unpacklo_epi64(ov[0], ov[2]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[2]);

    ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]);
    iv[2] = _mm_unpacklo_epi8( ov[2], ov[3]); iv[3] = _mm_unpackhi_epi8( ov[2], ov[3]);
    ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]);

    iv[2] = _mm_unpacklo_epi64(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[3]);
@@ -651,14 +651,14 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    iv[0] = LD128(ip   ); iv[1] = LD128(ip+16); iv[2] = LD128(ip+32); iv[3] = LD128(ip+48);
    iv[4] = LD128(ip+64); iv[5] = LD128(ip+80); iv[6] = LD128(ip+96); iv[7] = LD128(ip+112);

    ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]);
    ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]);
    ov[4] = _mm_unpacklo_epi8( iv[4], iv[5]); ov[5] = _mm_unpackhi_epi8( iv[4], iv[5]);
    ov[6] = _mm_unpacklo_epi8( iv[6], iv[7]); ov[7] = _mm_unpackhi_epi8( iv[6], iv[7]);

    iv[0] = _mm_unpacklo_epi8( ov[0], ov[1]); iv[1] = _mm_unpackhi_epi8( ov[0], ov[1]);
    iv[2] = _mm_unpacklo_epi8( ov[2], ov[3]); iv[3] = _mm_unpackhi_epi8( ov[2], ov[3]);
    iv[4] = _mm_unpacklo_epi8( ov[4], ov[5]); iv[5] = _mm_unpackhi_epi8( ov[4], ov[5]);
    iv[6] = _mm_unpacklo_epi8( ov[6], ov[7]); iv[7] = _mm_unpackhi_epi8( ov[6], ov[7]);

    ov[0] = _mm_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm_unpackhi_epi32(iv[0], iv[2]);
@@ -667,12 +667,12 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    ov[6] = _mm_unpacklo_epi32(iv[5], iv[7]); ov[7] = _mm_unpackhi_epi32(iv[5], iv[7]);
    ST0(p,iv[0]); ST(p,iv[1],1); ST(p,iv[2],2); ST(p,iv[3],3);

    iv[0] = _mm_unpacklo_epi64(ov[0], ov[4]); iv[1] = _mm_unpackhi_epi64(ov[0], ov[4]);
    iv[2] = _mm_unpacklo_epi64(ov[1], ov[5]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[5]);
    iv[4] = _mm_unpacklo_epi64(ov[2], ov[6]); iv[5] = _mm_unpackhi_epi64(ov[2], ov[6]);
    iv[6] = _mm_unpacklo_epi64(ov[3], ov[7]); iv[7] = _mm_unpackhi_epi64(ov[3], ov[7]);
    ST(p,iv[4],4); ST(p,iv[5],5); ST(p,iv[6],6); ST(p,iv[7],7);
    #endif
  #endif
#endif
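The unpack cascades above are the classic in-register matrix transpose, here at 8/16-row scale. The principle in miniature, on a 4x4 block of 32-bit values (SSE2; helper name hypothetical):

    #include <emmintrin.h>   /* SSE2 */

    /* Two unpack rounds turn 4 rows of 4x32-bit values into 4 columns. */
    static void transpose4x4_epi32(__m128i r[4]) {
        __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]);  /* a0 b0 a1 b1 */
        __m128i t1 = _mm_unpackhi_epi32(r[0], r[1]);  /* a2 b2 a3 b3 */
        __m128i t2 = _mm_unpacklo_epi32(r[2], r[3]);  /* c0 d0 c1 d1 */
        __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]);  /* c2 d2 c3 d3 */
        r[0] = _mm_unpacklo_epi64(t0, t2);            /* a0 b0 c0 d0 */
        r[1] = _mm_unpackhi_epi64(t0, t2);            /* a1 b1 c1 d1 */
        r[2] = _mm_unpacklo_epi64(t1, t3);            /* a2 b2 c2 d2 */
        r[3] = _mm_unpackhi_epi64(t1, t3);            /* a3 b3 c3 d3 */
    }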

#if STRIDE > ESIZE // ---------------------- Nibble -------------------------------------------
@@ -702,7 +702,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    ov[6] = _mm_and_si128(iv[7], cl);                   ov[6] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[6],4), ov[6]),cb); ov[6] = _mm_packus_epi16(ov[6], _mm_srli_si128(ov[6],2));
    ov[7] = _mm_srli_epi16(_mm_and_si128(iv[7], ch),4); ov[7] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[7],4), ov[7]),cb); ov[7] = _mm_packus_epi16(ov[7], _mm_srli_si128(ov[7],2));
    STL(p,ov[4],12); STL(p,ov[5],13); STL(p,ov[6],14); STL(p,ov[7],15);
    #endif
    #endif
    #endif
#endif
}
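In the nibble variant (STRIDE > ESIZE) every byte plane is further split into a low- and a high-nibble plane, which halves the symbol range the following entropy coder sees. A plain-C sketch of the pack step implemented by the cl/cb/packus chain above, plus its inverse as used by TPDEC128V below (scalar equivalents, not the library's code):

    /* Pack the low nibbles of 16 bytes into 8 bytes (two nibbles per byte). */
    static void nib_pack(const unsigned char in[16], unsigned char out[8]) {
        int i;
        for(i = 0; i < 8; i++)
            out[i] = (unsigned char)((in[2*i] & 0x0f) | ((in[2*i+1] & 0x0f) << 4));
    }

    /* Inverse: expand 8 packed bytes back into 16 low nibbles. */
    static void nib_unpack(const unsigned char in[8], unsigned char out[16]) {
        int i;
        for(i = 0; i < 8; i++) {
            out[2*i]   =  in[i]       & 0x0f;
            out[2*i+1] = (in[i] >> 4) & 0x0f;
        }
    }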

@@ -718,7 +718,8 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
  __m128i cl = _mm_set1_epi8(0x0f), ch = _mm_set1_epi8(0xf0), cb = _mm_set1_epi16(0xff);
#endif

-  for(op = out,ip = in; op != out+v; op+=ESIZE*16,ip += ESIZE*16/STRIDE) { unsigned char *p=ip; PREFETCH(ip+(ESIZE*16/STRIDE)*ESIZE,0);
+  for(op = out,ip = in; op != out+v; op+=ESIZE*16,ip += ESIZE*16/STRIDE) {
+    unsigned char *p=ip; PREFETCH(ip+(ESIZE*16/STRIDE)*ESIZE,0);
    __m128i iv[ESIZE], ov[ESIZE];

#if STRIDE > ESIZE //------------ Nibble transpose -------------------
@@ -734,7 +735,7 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); ov[2] = _mm_and_si128(ov[2], cl); // 2,3->1
    ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); ov[3] = _mm_and_si128(ov[3], cl);
    iv[1] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
    #if ESIZE > 2
    ov[0] = _mm_loadl_epi64((__m128i *)SID(p,4));
    ov[1] = _mm_loadl_epi64((__m128i *)SID(p,5));
    ov[2] = _mm_loadl_epi64((__m128i *)SID(p,6));
@@ -791,27 +792,27 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
  #endif
#endif
  #if ESIZE == 2
    #ifdef __ARM_NEON
    uint8x16x2_t w; w.val[0] = (uint8x16_t)iv[0];
                    w.val[1] = (uint8x16_t)iv[1]; vst2q_u8(op, w);
    #else
    ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]); //i(0,1)->o(0,1)
    ST128(op, ov[0]); ST128(op+16, ov[1]);
    #endif
  #elif ESIZE == 4
    #ifdef __ARM_NEON
    uint8x16x4_t w; w.val[0] = (uint8x16_t)iv[0];
                    w.val[1] = (uint8x16_t)iv[1];
                    w.val[2] = (uint8x16_t)iv[2];
                    w.val[3] = (uint8x16_t)iv[3]; vst4q_u8(op,w);
    #else
    ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]); //i(0,1)->o(0,1)
    ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8(iv[2], iv[3]); //i(2,3)->o(2,3)

    iv[0] = _mm_unpacklo_epi16(ov[0], ov[2]); iv[1] = _mm_unpackhi_epi16(ov[0], ov[2]); //o(0,2)->i(0,1)
    iv[2] = _mm_unpacklo_epi16(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi16(ov[1], ov[3]); //o(1,3)->i(2,3)
    ST128(op, iv[0]); ST128(op+16,iv[1]); ST128(op+32,iv[2]); ST128(op+48,iv[3]);
    #endif
  #else
    ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]); //i(0,1)->o(0,1)
    ov[2] = _mm_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm_unpackhi_epi8( iv[2], iv[3]); //i(2,3)->o(2,3)
@@ -837,9 +838,6 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
#endif

#if defined(__AVX2__) && defined(AVX2_ON)
//#define SIE(_p_,_i_) (_p_+ _i_*stride)
#define ST128(_p_,_v_,_i_) _mm_storeu_si128((__m128i *)SIE(_p_,_i_), _mm256_castsi256_si128(_v_))
#define ST1280(_p_,_v_)    _mm_storeu_si128((__m128i *)(_p_),        _mm256_castsi256_si128(_v_))
void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
  unsigned v = n&~(ESIZE*32-1);
  unsigned stride = v/STRIDE;
@@ -847,122 +845,105 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)

#if ESIZE == 2
  __m256i sv  = _mm256_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                                14, 12, 10, 8, 6, 4, 2, 0,
                                15, 13, 11, 9, 7, 5, 3, 1,
                                14, 12, 10, 8, 6, 4, 2, 0);
  __m256i sv0 = _mm256_set_epi8(15, 13, 11, 9,  7, 5, 3, 1,
                                14, 12, 10, 8,  6, 4, 2, 0,
                                15, 13, 11, 9,  7, 5, 3, 1,
                                14, 12, 10, 8,  6, 4, 2, 0);
  __m256i sv1 = _mm256_set_epi8(14, 12, 10, 8,  6, 4, 2, 0,
                                15, 13, 11, 9,  7, 5, 3, 1,
                                14, 12, 10, 8,  6, 4, 2, 0,
                                15, 13, 11, 9,  7, 5, 3, 1);
#else
  __m256i pv = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0),
  #if ESIZE == 4
          sv0 = _mm256_set_epi8(15, 11, 7, 3,  13,  9, 5, 1,
                                14, 10, 6, 2,  12,  8, 4, 0,
                                15, 11, 7, 3,  13,  9, 5, 1,
                                14, 10, 6, 2,  12,  8, 4, 0),
          sv1 = _mm256_set_epi8(13,  9, 5, 1,  15, 11, 7, 3,
                                12,  8, 4, 0,  14, 10, 6, 2,
                                13,  9, 5, 1,  15, 11, 7, 3,
                                12,  8, 4, 0,  14, 10, 6, 2);
  #else
          sv = _mm256_set_epi8(15, 7, 14, 6, 13, 5, 12, 4,
                               11, 3, 10, 2,  9, 1,  8, 0,
                               15, 7, 14, 6, 13, 5, 12, 4,
                               11, 3, 10, 2,  9, 1,  8, 0),
          tv = _mm256_set_epi8(15, 14, 11, 10, 13, 12, 9, 8,
                                7,  6,  3,  2,  5,  4, 1, 0,
                               15, 14, 11, 10, 13, 12, 9, 8,
                                7,  6,  3,  2,  5,  4, 1, 0);
  #endif
#endif

#if STRIDE > ESIZE // ------------------ byte transpose ----------------------------------
-  __m256i cl = _mm256_set1_epi8(0x0f), ch=_mm256_set1_epi8(0xf0), cb = _mm256_set1_epi16(0xff);
+  __m256i cl = _mm256_set1_epi8( 0x0f),
+          ch = _mm256_set1_epi8( 0xf0),
+          cb = _mm256_set1_epi16(0xff);
#endif

-  for(ip = in,op = out; ip != in+v; ip += ESIZE*32, op += ESIZE*32/STRIDE) { unsigned char *p = op; PREFETCH(ip+ESIZE*192,0);
+  for(ip = in,op = out; ip != in+v; ip += ESIZE*32, op += ESIZE*32/STRIDE) {
+    unsigned char *p = op; PREFETCH(ip+ESIZE*192,0);
    __m256i iv[ESIZE],ov[ESIZE];
#if ESIZE == 2
  #if 0
    ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv);
    ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv);
    iv[0] = _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(ov[0], ov[1]), _MM_SHUFFLE(3, 1, 2, 0));
    iv[1] = _mm256_permute4x64_epi64(_mm256_unpackhi_epi64(ov[0], ov[1]), _MM_SHUFFLE(3, 1, 2, 0));
  #else
    ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv0);
    ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);
    iv[0] = _mm256_permute4x64_epi64(_mm256_blend_epi32(ov[0], ov[1],0b11001100),_MM_SHUFFLE(3, 1, 2, 0));
    iv[1] = _mm256_blend_epi32(ov[0], ov[1],0b00110011);
    iv[1] = _mm256_permute4x64_epi64(_mm256_shuffle_epi32(iv[1],_MM_SHUFFLE(1, 0, 3, 2)),_MM_SHUFFLE(3, 1, 2, 0));
  #endif
#elif ESIZE == 4
    iv[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv0);
    iv[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);
    iv[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv0);
    iv[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv1);
  #if 0
    ov[0] = _mm256_unpacklo_epi32(iv[0], iv[1]);
    ov[1] = _mm256_unpackhi_epi32(iv[0], iv[1]);
    ov[2] = _mm256_unpacklo_epi32(iv[2], iv[3]);
    ov[3] = _mm256_unpackhi_epi32(iv[2], iv[3]);

    iv[0] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[2]), pv);
    iv[1] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[0], ov[2]), pv);
    iv[2] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[1], ov[3]), pv);
    iv[3] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[3]), pv);
  #else
    ov[0] = _mm256_blend_epi32(iv[0], iv[1],0b10101010); ov[1] = _mm256_shuffle_epi32(_mm256_blend_epi32(iv[0], iv[1],0b01010101),_MM_SHUFFLE(2, 3, 0, 1));
    ov[2] = _mm256_blend_epi32(iv[2], iv[3],0b10101010); ov[3] = _mm256_shuffle_epi32(_mm256_blend_epi32(iv[2], iv[3],0b01010101),_MM_SHUFFLE(2, 3, 0, 1));

    iv[0] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[2]), pv);
    iv[1] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[0], ov[2]), pv);
    iv[2] = _mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[1], ov[3]), pv);
    iv[3] = _mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[3]), pv);
    //iv[0] = _mm256_permutevar8x32_epi32(_mm256_blend_epi32(ov[0], _mm256_shuffle_epi32(ov[2],_MM_SHUFFLE(1, 0, 3, 2)), 0b11001100), pv);
    //iv[1] = _mm256_permutevar8x32_epi32(_mm256_blend_epi32( _mm256_shuffle_epi32(ov[0],_MM_SHUFFLE(1, 0, 3, 2)), ov[2],0b11001100), pv);
    //iv[2] = _mm256_permutevar8x32_epi32(_mm256_blend_epi32(ov[1], _mm256_shuffle_epi32(ov[3],_MM_SHUFFLE(1, 0, 3, 2)), 0b11001100), pv);
    //iv[3] = _mm256_permutevar8x32_epi32(_mm256_blend_epi32( _mm256_shuffle_epi32(ov[1],_MM_SHUFFLE(1, 0, 3, 2)), ov[3],0b11001100), pv);
#endif
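AVX2 unpack and shuffle operate within each 128-bit lane, so every round above ends with a cross-lane fix-up (_mm256_permute4x64_epi64, or _mm256_permutevar8x32_epi32 with pv). A minimal sketch of the lane problem and the permute2x128 repair the decoder below also uses ((2<<4)|0 selects the two low lanes, (3<<4)|1 the two high ones); helper name hypothetical:

    #include <immintrin.h>   /* AVX2 */

    /* Full 32-byte interleave of a and b: the per-lane unpack results must be
       recombined across lanes to match what a true 256-bit unpack would give. */
    static void interleave256(__m256i a, __m256i b, __m256i *o0, __m256i *o1) {
        __m256i lo = _mm256_unpacklo_epi8(a, b);  /* a0b0..a7b7   | a16b16..a23b23 */
        __m256i hi = _mm256_unpackhi_epi8(a, b);  /* a8b8..a15b15 | a24b24..a31b31 */
        *o0 = _mm256_permute2x128_si256(lo, hi, (2 << 4) | 0);  /* a0..a15  */
        *o1 = _mm256_permute2x128_si256(lo, hi, (3 << 4) | 1);  /* a16..a31 */
    }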
#else
    ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv);
    ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv);
    ov[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv);
    ov[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv);

    iv[0] = _mm256_unpacklo_epi16(ov[0], ov[1]); iv[1] = _mm256_unpackhi_epi16(ov[0], ov[1]);
    iv[2] = _mm256_unpacklo_epi16(ov[2], ov[3]); iv[3] = _mm256_unpackhi_epi16(ov[2], ov[3]);

    ov[0] = _mm256_unpacklo_epi32(iv[0], iv[2]); ov[1] = _mm256_unpackhi_epi32(iv[0], iv[2]);
@@ -974,30 +955,30 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    ov[6] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+192)), sv);
    ov[7] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+224)), sv);

    iv[4] = _mm256_unpacklo_epi16(ov[4], ov[5]); iv[5] = _mm256_unpackhi_epi16(ov[4], ov[5]);
    iv[6] = _mm256_unpacklo_epi16(ov[6], ov[7]); iv[7] = _mm256_unpackhi_epi16(ov[6], ov[7]);

    ov[4] = _mm256_unpacklo_epi32(iv[4], iv[6]); ov[5] = _mm256_unpackhi_epi32(iv[4], iv[6]);
    ov[6] = _mm256_unpacklo_epi32(iv[5], iv[7]); ov[7] = _mm256_unpackhi_epi32(iv[5], iv[7]);

    iv[0] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[0], ov[4]), pv), tv);
    iv[1] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[0], ov[4]), pv), tv);
    iv[2] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[1], ov[5]), pv), tv);
    iv[3] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[1], ov[5]), pv), tv);

    iv[4] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[2], ov[6]), pv), tv);
    iv[5] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[2], ov[6]), pv), tv);
    iv[6] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpacklo_epi64(ov[3], ov[7]), pv), tv);
    iv[7] = _mm256_shuffle_epi8(_mm256_permutevar8x32_epi32(_mm256_unpackhi_epi64(ov[3], ov[7]), pv), tv);
#endif

#if STRIDE <= ESIZE
    _mm256_storeu_si256((__m256i *) p,          iv[0]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[1]);
  #if ESIZE > 2
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[2]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[3]);
  #if ESIZE > 4
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[4]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[5]);
    _mm256_storeu_si256((__m256i *)(p+=stride), iv[6]);
|
||||
|
||||
#else //---------------------- Nibble Transpose ------------------------
|
||||
#define mm256_packus_epi16(a, b) _mm256_permute4x64_epi64(_mm256_packus_epi16(a, b), _MM_SHUFFLE(3, 1, 2, 0))
|
||||
#define ST128(_p_,_v_,_i_) _mm_storeu_si128((__m256i *)SIE(_p_,_i_), _mm256_castsi256_si128(_v_))
|
||||
#define ST1280(_p_,_v_) _mm_storeu_si128((__m256i *)(_p_), _mm256_castsi256_si128(_v_))
|
||||
|
||||
ov[0] = _mm256_and_si256(iv[0], cl); ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2));
|
||||
ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[0], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2));
|
||||
@@ -1020,10 +1003,10 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[3], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2));
    ST128(p,ov[0],4); ST128(p,ov[1],5); ST128(p,ov[2],6); ST128(p,ov[3],7);
  #if ESIZE > 4
    ov[0] = _mm256_and_si256(iv[4], cl);                      ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2));
    ov[1] = _mm256_srli_epi16(_mm256_and_si256(iv[4], ch),4); ov[1] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[1],4), ov[1]),cb); ov[1] = mm256_packus_epi16(ov[1], _mm256_srli_si256( ov[1],2));
    ov[2] = _mm256_and_si256(iv[5], cl);                      ov[2] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[2],4), ov[2]),cb); ov[2] = mm256_packus_epi16(ov[2], _mm256_srli_si256( ov[2],2));
    ov[3] = _mm256_srli_epi16(_mm256_and_si256(iv[5], ch),4); ov[3] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[3],4), ov[3]),cb); ov[3] = mm256_packus_epi16(ov[3], _mm256_srli_si256( ov[3],2));
    ST128(p,ov[0],8); ST128(p,ov[1],9); ST128(p,ov[2],10); ST128(p,ov[3],11);

    ov[0] = _mm256_and_si256(iv[6], cl);                      ov[0] = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi16(ov[0],4), ov[0]),cb); ov[0] = mm256_packus_epi16(ov[0], _mm256_srli_si256( ov[0],2));
@@ -1059,18 +1042,18 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
  __m256i cl = _mm256_set1_epi8(0x0f), ch = _mm256_set1_epi8(0xf0), cb = _mm256_set1_epi16(0xff);
#endif

  for(op = out,ip = in; op != out+v; ip += ESIZE*32/STRIDE, op += ESIZE*32) { unsigned char *p = ip; PREFETCH(ip+ESIZE*192,0);
    __m256i iv[ESIZE], ov[ESIZE];

#if STRIDE > ESIZE
    NBL0(0,1); NBL( 2,3); NB(0,1,iv[0]); NB(2,3,iv[1]);
  #if ESIZE > 2
    NBL( 0,1); NBL( 2,3); NB(0,1,iv[2]); NB(2,3,iv[3]);
  #if ESIZE > 4
    NBL( 4,5); NBL( 6,7); NB(4,5,iv[4]); NB(6,7,iv[5]);
    NBL( 4,5); NBL( 6,7); NB(4,5,iv[6]); NB(6,7,iv[7]);
  #endif
  #endif
#else
    iv[0] = _mm256_loadu_si256((__m256i *) p );
    iv[1] = _mm256_loadu_si256((__m256i *)(p+=stride));
@@ -1087,10 +1070,10 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
#endif

#if ESIZE == 2
    ov[0] = _mm256_permute4x64_epi64(iv[0], _MM_SHUFFLE(3, 1, 2, 0));
    ov[1] = _mm256_permute4x64_epi64(iv[1], _MM_SHUFFLE(3, 1, 2, 0));
    _mm256_storeu_si256((__m256i *)op,      _mm256_unpacklo_epi8(ov[0], ov[1]));
    _mm256_storeu_si256((__m256i *)(op+32), _mm256_unpackhi_epi8(ov[0], ov[1]));
#elif ESIZE == 4
    ov[0] = _mm256_unpacklo_epi8( iv[0], iv[1]); ov[1] = _mm256_unpackhi_epi8( iv[0], iv[1]);
    ov[2] = _mm256_unpacklo_epi8( iv[2], iv[3]); ov[3] = _mm256_unpackhi_epi8( iv[2], iv[3]);
@@ -1098,11 +1081,11 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
    iv[0] = _mm256_unpacklo_epi16(ov[0], ov[2]); iv[1] = _mm256_unpackhi_epi16(ov[0], ov[2]);
    iv[2] = _mm256_unpacklo_epi16(ov[1], ov[3]); iv[3] = _mm256_unpackhi_epi16(ov[1], ov[3]);

    ov[0] = _mm256_permute2x128_si256(iv[0], iv[1], (2 << 4) | 0);
    ov[1] = _mm256_permute2x128_si256(iv[2], iv[3], (2 << 4) | 0);
    ov[2] = _mm256_permute2x128_si256(iv[0], iv[1], (3 << 4) | 1);
    ov[3] = _mm256_permute2x128_si256(iv[2], iv[3], (3 << 4) | 1);
    _mm256_storeu_si256((__m256i *) op,     ov[0]);
    _mm256_storeu_si256((__m256i *)(op+32), ov[1]);
    _mm256_storeu_si256((__m256i *)(op+64), ov[2]);
    _mm256_storeu_si256((__m256i *)(op+96), ov[3]);