Transform: Byte+Nibble Transpose/Shuffle

2019-10-21 21:10:58 +02:00
parent 47ce7a4e8b
commit fb4535e1b1

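Note (not part of the commit itself): the scalar encode/decode templates previously stepped the destination pointer serially with `*(p+=stride)`; this commit switches them to the indexed `SI(p,i)` / `SE(p,i)` macros and adds ARM NEON paths (`vld2q_u8`/`vst2q_u8`, `vzip*`) to the 128-bit vector transpose. The sketch below is a minimal, hypothetical illustration of the byte-transpose layout and the stride-indexed addressing the macros implement, for the plain byte case where STRIDE equals ESIZE (here 4); the function name `tp4enc_scalar` is made up and the real templates handle more element sizes and the nibble path.

```c
#include <stdio.h>

/* Stride-indexed addressing, mirroring the SI()/SE() macros in the diff:
   SI(p,i) yields the address in byte plane i, SE(p,i) advances p past i planes. */
#define SI(_p_,_i_) ((_p_) + (_i_)*stride)
#define SE(_p_,_i_) ((_p_) += (_i_)*stride)

/* Hypothetical scalar 4-byte transpose encoder: splits an array of 4-byte
   elements into 4 contiguous byte planes (plane k holds byte k of every element). */
static void tp4enc_scalar(const unsigned char *in, unsigned n, unsigned char *out) {
  unsigned stride = n / 4;                      /* plane size in bytes (STRIDE == ESIZE case) */
  const unsigned char *ip, *e = in + n;
  unsigned char *op;
  for (ip = in, op = out; ip < e; ip += 4, op++) {
    unsigned char *p = op;
    p[0]      = ip[0];
    *SI(p, 1) = ip[1];
    *SI(p, 2) = ip[2];
    *SI(p, 3) = ip[3];
  }
}

int main(void) {
  /* Two little-endian 32-bit values: 0x04030201 and 0x08070605. */
  unsigned char in[8]  = {1,2,3,4, 5,6,7,8};
  unsigned char out[8] = {0};
  tp4enc_scalar(in, sizeof in, out);
  for (int i = 0; i < 8; i++) printf("%u ", out[i]);   /* prints: 1 5 2 6 3 7 4 8 */
  printf("\n");
  return 0;
}
```

Presumably the motivation for `SI(p,i)` over `*(p+=stride)` is that the indexed form removes the serial dependency between consecutive plane stores, letting them issue independently; the nibble path (STRIDE > ESIZE) uses the same indexing with `SE(p,8)`/`SE(p,16)` to skip past the packed 4-bit planes.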

@@ -31,6 +31,8 @@
#include <smmintrin.h>
#elif defined(__SSSE3__)
#include <tmmintrin.h>
#elif defined(__SSE3__)
#include <pmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#elif defined(__ARM_NEON)
@@ -151,6 +153,7 @@ static inline uint64_t xgetbv (int ctr) {
return (uint64_t)d << 32 | a;
#endif
}
#endif
static int _cpuiset;
int cpuini(int cpuiset) { if(cpuiset) _cpuiset = cpuiset; return _cpuiset; }
@@ -159,6 +162,7 @@ char *cpustr(int cpuiset) {
if(cpuiset >= 52) return "avx2";
else if(cpuiset >= 50) return "avx";
else if(cpuiset >= 41) return "sse4.1";
else if(cpuiset >= 35) return "arm_neon";
else if(cpuiset >= 31) return "sse3";
else if(cpuiset >= 20) return "sse2";
else return "none";
@@ -168,6 +172,9 @@ int cpuiset(void) {
int c[4] = {0};
if(_cpuiset) return _cpuiset;
_cpuiset++;
#ifdef __ARM_NEON
_cpuiset = 35; // ARM_NEON
#elif defined(__i386__) || defined(__x86_64__)
cpuid(c, 0);
if(c[0]) {
cpuid(c, 1);
@@ -186,12 +193,10 @@ int cpuiset(void) {
cpuid(c, 7);
if(c[1] & (1 << 5)) _cpuiset = 52; // AVX2
}}}}}}}}}
#endif
return _cpuiset;
}
#else
int cpuini(int cpuiset) { return 0; }
int cpuiset(void) { return 0; }
#endif
//---------------------------------------------------------------------------------
typedef void (*TPFUNC)( unsigned char *in, unsigned n, unsigned char *out);
@@ -224,8 +229,10 @@ void tpini(int id) {
_tpe[4] = tpenc128v4; _tpd[4] = tpdec128v4; _tp4e[4] = tp4enc128v4; _tp4d[4] = tp4dec128v4;
_tpe[8] = tpenc128v8; _tpd[8] = tpdec128v8; _tp4e[8] = tp4enc128v8; _tp4d[8] = tp4dec128v8;
}
if(i >= 35) {
_tpd[8] = tpdec8;
}
#endif
;
}
void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
@@ -270,7 +277,7 @@ void tp2ddec(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsi
unsigned _x,_y; int e; uint8_t *op=out,*ip=in;
for(e = esize-1; e >= 0; e--)
for( _x = 0; _x < x; _x++)
for(_y = 0; _y < y; _y++) *op++/*[_x * y * z + _y * z + _z]*/ = ip[ODX2];
for(_y = 0; _y < y; _y++) *op++ = ip[ODX2];
}
#define ODX3 e + (_x + _y * x + _z * y * x) * esize
@@ -287,7 +294,7 @@ void tp3ddec(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned cha
for(e = esize-1; e >= 0; e--)
for(_x = 0; _x < x; ++_x)
for(_y = 0; _y < y; ++_y)
for(_z = 0; _z < z; ++_z) *op++= ip[ODX3]; /*[_x * y * z + _y * z + _z]*/
for(_z = 0; _z < z; ++_z) *op++= ip[ODX3];
}
#define ODX4 e + (_w + _x * w + _y * x * w + _z * x * y * w) * esize
@@ -306,7 +313,7 @@ void tp4ddec(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z,
for( _w = 0; _w < w; _w++)
for( _x = 0; _x < x; ++_x)
for( _y = 0; _y < y; ++_y)
for(_z = 0; _z < z; ++_z) *op++= ip[ODX4]; /*[_x * y * z + _y * z + _z]*/
for(_z = 0; _z < z; ++_z) *op++= ip[ODX4];
}
#ifdef USE_SSE
@@ -326,9 +333,17 @@ void tp4dec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
#endif
#endif
#else
//#define SI(p,i) (p+=stride)
//#define SE(p,i)
#define SI(_p_,_i_) (_p_+ _i_*stride)
#define SE(_p_,_i_) _p_+=_i_*stride
#if !defined(SSE2_ON) && !defined(AVX2_ON)
#if STRIDE == ESIZE
void TEMPLATE2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
unsigned char *op,*ip,*e;
@@ -342,25 +357,26 @@ void TEMPLATE2(TPENC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
for(ip = in,op = out; ip < e; op++, ip+=ESIZE) { unsigned char *p = op;
p[0] = ip[ 0];
*(p+=stride) = ip[ 1];
*SI(p, 1) = ip[ 1];
#if ESIZE > 2
*(p+=stride) = ip[ 2];
*SI(p, 2) = ip[ 2];
#if ESIZE > 3
*(p+=stride) = ip[ 3];
*SI(p, 3) = ip[ 3];
#if ESIZE > 4
*(p+=stride) = ip[ 4];
*(p+=stride) = ip[ 5];
*(p+=stride) = ip[ 6];
*(p+=stride) = ip[ 7];
uint32_t u = ctou32(p);
*SI(p, 4) = ip[ 4];
*SI(p, 5) = ip[ 5];
*SI(p, 6) = ip[ 6];
*SI(p, 7) = ip[ 7];
#if ESIZE > 8
*(p+=stride) = ip[ 8];
*(p+=stride) = ip[ 9];
*(p+=stride) = ip[10];
*(p+=stride) = ip[11];
*(p+=stride) = ip[12];
*(p+=stride) = ip[13];
*(p+=stride) = ip[14];
*(p+=stride) = ip[15];
*SI(p, 8) = ip[ 8];
*SI(p, 9) = ip[ 9];
*SI(p,10) = ip[10];
*SI(p,11) = ip[11];
*SI(p,12) = ip[12];
*SI(p,13) = ip[13];
*SI(p,14) = ip[14];
*SI(p,15) = ip[15];
#endif
#endif
#endif
@@ -381,25 +397,25 @@ void TEMPLATE2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
#endif
for(op = out,ip = in; op < e; ip++,op+=ESIZE) { unsigned char *p = ip;
op[ 0] = *p;
op[ 1] = *(p+=stride);
op[ 1] = *SI(p,1);
#if ESIZE > 2
op[ 2] = *(p+=stride);
op[ 2] = *SI(p,2);
#if ESIZE > 3
op[ 3] = *(p+=stride);
op[ 3] = *SI(p,3);
#if ESIZE > 4
op[ 4] = *(p+=stride);
op[ 5] = *(p+=stride);
op[ 6] = *(p+=stride);
op[ 7] = *(p+=stride);
op[ 4] = *SI(p,4);
op[ 5] = *SI(p,5);
op[ 6] = *SI(p,6);
op[ 7] = *SI(p,7);
#if ESIZE > 8
op[ 8] = *(p+=stride);
op[ 9] = *(p+=stride);
op[10] = *(p+=stride);
op[11] = *(p+=stride);
op[12] = *(p+=stride);
op[13] = *(p+=stride);
op[14] = *(p+=stride);
op[15] = *(p+=stride);
op[ 8] = *SI(p,8);
op[ 9] = *SI(p,9);
op[10] = *SI(p,10);
op[11] = *SI(p,11);
op[12] = *SI(p,12);
op[13] = *SI(p,13);
op[14] = *SI(p,14);
op[15] = *SI(p,15);
#endif
#endif
#endif
@@ -413,6 +429,7 @@ void TEMPLATE2(TPDEC, ESIZE)(unsigned char *in, unsigned n, unsigned char *out)
#if ESIZE == 2 || ESIZE == 4 || ESIZE == 8
#if (defined(__SSE2__) || defined(__ARM_NEON)) && defined(SSE2_ON)
void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
unsigned v = n&~(ESIZE*16-1);
@@ -448,15 +465,24 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
__m128i iv[ESIZE],ov[ESIZE];
#if defined(__SSSE3__) || defined(__ARM_NEON)
#if ESIZE == 2
#ifdef __ARM_NEON
uint8x16x2_t w = vld2q_u8(ip); iv[0] = (__m128i)w.val[0]; iv[1] = (__m128i)w.val[1];
#else
ov[0] = LD128((__m128i *)ip); ov[0] = _mm_shuffle_epi8(ov[0], sv);
ov[1] = LD128((__m128i *)(ip+16)); ov[1] = _mm_shuffle_epi8(ov[1], sv); ip+= 32; PREFETCH(ip+512,0);
ov[1] = LD128((__m128i *)(ip+16)); ov[1] = _mm_shuffle_epi8(ov[1], sv);
iv[0] = _mm_unpacklo_epi64(ov[0], ov[1]);
iv[1] = _mm_unpackhi_epi64(ov[0], ov[1]);
#endif
ip+= 32; PREFETCH(ip+512,0);
#elif ESIZE == 4
#ifdef __ARM_NEON
uint8x16x4_t w = vld4q_u8(ip); iv[0] = (__m128i)w.val[0]; iv[1] = (__m128i)w.val[1];
iv[2] = (__m128i)w.val[2]; iv[3] = (__m128i)w.val[3];
#else
iv[0] = LD128((__m128i *) ip ); iv[0] = _mm_shuffle_epi8(iv[0], sv);
iv[1] = LD128((__m128i *)(ip+16)); iv[1] = _mm_shuffle_epi8(iv[1], sv);
iv[2] = LD128((__m128i *)(ip+32)); iv[2] = _mm_shuffle_epi8(iv[2], sv);
iv[3] = LD128((__m128i *)(ip+48)); iv[3] = _mm_shuffle_epi8(iv[3], sv); ip += 64; PREFETCH(ip+512,0);
iv[3] = LD128((__m128i *)(ip+48)); iv[3] = _mm_shuffle_epi8(iv[3], sv);
ov[0] = _mm_unpacklo_epi32(iv[0], iv[1]);
ov[1] = _mm_unpackhi_epi32(iv[0], iv[1]);
@@ -467,7 +493,101 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
iv[1] = _mm_unpackhi_epi64(ov[0], ov[2]);
iv[2] = _mm_unpacklo_epi64(ov[1], ov[3]);
iv[3] = _mm_unpackhi_epi64(ov[1], ov[3]);
#endif
ip += 64; PREFETCH(ip+512,0);
#elif ESIZE == 8
#ifdef __ARM_NEON
#define vzipl_u16(_a_,_b_) vzip_u16(vget_low_u16((uint16x8_t)(_a_)), vget_low_u16((uint16x8_t)(_b_)))
#define vziph_u16(_a_,_b_) vzip_u16(vget_high_u16((uint16x8_t)(_a_)), vget_high_u16((uint16x8_t)(_b_)))
//#define VQ
#ifndef VQ
uint16x4x2_t v16[8];
uint32x2x2_t v32[8];
#else
uint8x16x2_t v8[4];
uint16x8x2_t v16[4];
uint32x4x2_t v32[4]; //uint64x2x2_t v64[4];
#endif
#ifdef VQ
ov[0] = LD128((__m128i *) ip ); //ov[0] = _mm_shuffle_epi8(ov[0], sv);
ov[1] = LD128((__m128i *)(ip+16)); //ov[1] = _mm_shuffle_epi8(ov[1], sv);
ov[2] = LD128((__m128i *)(ip+32)); //ov[2] = _mm_shuffle_epi8(ov[2], sv);
ov[3] = LD128((__m128i *)(ip+48)); //ov[3] = _mm_shuffle_epi8(ov[3], sv);
ip += 64;
ov[4] = LD128((__m128i *) ip ); //ov[4] = _mm_shuffle_epi8(ov[4], sv);
ov[5] = LD128((__m128i *)(ip+16)); //ov[5] = _mm_shuffle_epi8(ov[5], sv);
ov[6] = LD128((__m128i *)(ip+32)); //ov[6] = _mm_shuffle_epi8(ov[6], sv);
ov[7] = LD128((__m128i *)(ip+48)); //ov[7] = _mm_shuffle_epi8(ov[7], sv);
ip += 64; PREFETCH(ip+512,0);
v8[0] = vzipq_u8((uint8x16_t)ov[0], (uint8x16_t)ov[1]);
v8[1] = vzipq_u8((uint8x16_t)ov[2], (uint8x16_t)ov[3]);
v8[2] = vzipq_u8((uint8x16_t)ov[4], (uint8x16_t)ov[5]);
v8[3] = vzipq_u8((uint8x16_t)ov[6], (uint8x16_t)ov[7]);
/* v16[0] = vzipq_u16((uint16x8_t)ov[0], (uint16x8_t)ov[1]);
v16[1] = vzipq_u16((uint16x8_t)ov[2], (uint16x8_t)ov[3]);
v16[2] = vzipq_u16((uint16x8_t)ov[4], (uint16x8_t)ov[5]);
v16[3] = vzipq_u16((uint16x8_t)ov[6], (uint16x8_t)ov[7]);*/
v16[0] = vzipq_u16(vreinterpretq_u16_u8( v8[0].val[0]), vreinterpretq_u16_u8(v8[1].val[0]));
v16[1] = vzipq_u16(vreinterpretq_u16_u8( v8[0].val[1]), vreinterpretq_u16_u8(v8[1].val[1]));
v16[2] = vzipq_u16(vreinterpretq_u16_u8( v8[2].val[0]), vreinterpretq_u16_u8(v8[3].val[0]));
v16[3] = vzipq_u16(vreinterpretq_u16_u8( v8[2].val[1]), vreinterpretq_u16_u8(v8[3].val[1]));
v32[0] = vzipq_u32(vreinterpretq_u32_u16(v16[0].val[0]), vreinterpretq_u32_u16(v16[2].val[0]));
v32[1] = vzipq_u32(vreinterpretq_u32_u16(v16[0].val[1]), vreinterpretq_u32_u16(v16[2].val[1]));
v32[2] = vzipq_u32(vreinterpretq_u32_u16(v16[1].val[0]), vreinterpretq_u32_u16(v16[3].val[0]));
v32[3] = vzipq_u32(vreinterpretq_u32_u16(v16[1].val[1]), vreinterpretq_u32_u16(v16[3].val[1]));
iv[0] = _mm_unpacklo_epi64(v32[0].val[0], v32[2].val[0]);
iv[1] = _mm_unpackhi_epi64(v32[0].val[0], v32[2].val[0]);
iv[2] = _mm_unpacklo_epi64(v32[0].val[1], v32[2].val[1]);
iv[3] = _mm_unpackhi_epi64(v32[0].val[1], v32[2].val[1]);
iv[4] = _mm_unpacklo_epi64(v32[1].val[0], v32[3].val[0]);
iv[5] = _mm_unpackhi_epi64(v32[1].val[0], v32[3].val[0]);
iv[6] = _mm_unpacklo_epi64(v32[1].val[1], v32[3].val[1]);
iv[7] = _mm_unpackhi_epi64(v32[1].val[1], v32[3].val[1]);
#else
ov[0] = LD128((__m128i *) ip ); ov[0] = _mm_shuffle_epi8(ov[0], sv);
ov[1] = LD128((__m128i *)(ip+16)); ov[1] = _mm_shuffle_epi8(ov[1], sv);
ov[2] = LD128((__m128i *)(ip+32)); ov[2] = _mm_shuffle_epi8(ov[2], sv);
ov[3] = LD128((__m128i *)(ip+48)); ov[3] = _mm_shuffle_epi8(ov[3], sv); ip += 64;
ov[4] = LD128((__m128i *) ip ); ov[4] = _mm_shuffle_epi8(ov[4], sv);
ov[5] = LD128((__m128i *)(ip+16)); ov[5] = _mm_shuffle_epi8(ov[5], sv);
ov[6] = LD128((__m128i *)(ip+32)); ov[6] = _mm_shuffle_epi8(ov[6], sv);
ov[7] = LD128((__m128i *)(ip+48)); ov[7] = _mm_shuffle_epi8(ov[7], sv); ip += 64; PREFETCH(ip+512,0);
v16[0] = vzipl_u16(ov[0], ov[1]);
v16[1] = vziph_u16(ov[0], ov[1]);
v16[2] = vzipl_u16(ov[2], ov[3]);
v16[3] = vziph_u16(ov[2], ov[3]);
v16[4] = vzipl_u16(ov[4], ov[5]);
v16[5] = vziph_u16(ov[4], ov[5]);
v16[6] = vzipl_u16(ov[6], ov[7]);
v16[7] = vziph_u16(ov[6], ov[7]);
v32[0] = vzip_u32(vreinterpret_u32_u16(v16[0].val[0]), vreinterpret_u32_u16(v16[2].val[0]) );
v32[1] = vzip_u32(vreinterpret_u32_u16(v16[0].val[1]), vreinterpret_u32_u16(v16[2].val[1]) );
v32[2] = vzip_u32(vreinterpret_u32_u16(v16[1].val[0]), vreinterpret_u32_u16(v16[3].val[0]) );
v32[3] = vzip_u32(vreinterpret_u32_u16(v16[1].val[1]), vreinterpret_u32_u16(v16[3].val[1]) );
v32[4] = vzip_u32(vreinterpret_u32_u16(v16[4].val[0]), vreinterpret_u32_u16(v16[6].val[0]) );
v32[5] = vzip_u32(vreinterpret_u32_u16(v16[4].val[1]), vreinterpret_u32_u16(v16[6].val[1]) );
v32[6] = vzip_u32(vreinterpret_u32_u16(v16[5].val[0]), vreinterpret_u32_u16(v16[7].val[0]) );
v32[7] = vzip_u32(vreinterpret_u32_u16(v16[5].val[1]), vreinterpret_u32_u16(v16[7].val[1]) );
iv[0] = (__m128i)vcombine_u64(vreinterpret_u64_u32(v32[0].val[0]), vreinterpret_u64_u32(v32[4].val[0]) );
iv[1] = (__m128i)vcombine_u64(vreinterpret_u64_u32(v32[0].val[1]), vreinterpret_u64_u32(v32[4].val[1]) );
iv[2] = (__m128i)vcombine_u64(vreinterpret_u64_u32(v32[1].val[0]), vreinterpret_u64_u32(v32[5].val[0]) );
iv[3] = (__m128i)vcombine_u64(vreinterpret_u64_u32(v32[1].val[1]), vreinterpret_u64_u32(v32[5].val[1]) );
iv[4] = (__m128i)vcombine_u64(vreinterpret_u64_u32(v32[2].val[0]), vreinterpret_u64_u32(v32[6].val[0]) );
iv[5] = (__m128i)vcombine_u64(vreinterpret_u64_u32(v32[2].val[1]), vreinterpret_u64_u32(v32[6].val[1]) );
iv[6] = (__m128i)vcombine_u64(vreinterpret_u64_u32(v32[3].val[0]), vreinterpret_u64_u32(v32[7].val[0]) );
iv[7] = (__m128i)vcombine_u64(vreinterpret_u64_u32(v32[3].val[1]), vreinterpret_u64_u32(v32[7].val[1]) );
#endif
#else
ov[0] = LD128((__m128i *) ip ); ov[0] = _mm_shuffle_epi8(ov[0], sv);
ov[1] = LD128((__m128i *)(ip+16)); ov[1] = _mm_shuffle_epi8(ov[1], sv);
ov[2] = LD128((__m128i *)(ip+32)); ov[2] = _mm_shuffle_epi8(ov[2], sv);
@@ -509,6 +629,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
iv[6] = _mm_unpacklo_epi64(ov[3], ov[7]);
iv[7] = _mm_unpackhi_epi64(ov[3], ov[7]);
#endif
#endif
#elif defined(__SSE2__) || defined(__ARM_NEON)
#if ESIZE == 2
@@ -601,16 +722,20 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
#if STRIDE <= ESIZE
_mm_storeu_si128((__m128i *) p, iv[0]);
_mm_storeu_si128((__m128i *)(p+=stride), iv[1]);
_mm_storeu_si128((__m128i *)SI(p,1), iv[1]);
#if ESIZE > 2
_mm_storeu_si128((__m128i *)(p+=stride), iv[2]);
_mm_storeu_si128((__m128i *)(p+=stride), iv[3]);
_mm_storeu_si128((__m128i *)SI(p,2), iv[2]);
_mm_storeu_si128((__m128i *)SI(p,3), iv[3]);
#if ESIZE > 4
_mm_storeu_si128((__m128i *)(p+=stride), iv[4]);
_mm_storeu_si128((__m128i *)(p+=stride), iv[5]);
_mm_storeu_si128((__m128i *)(p+=stride), iv[6]);
_mm_storeu_si128((__m128i *)(p+=stride), iv[7]);
_mm_storeu_si128((__m128i *)SI(p,4), iv[4]);
_mm_storeu_si128((__m128i *)SI(p,5), iv[5]);
_mm_storeu_si128((__m128i *)SI(p,6), iv[6]);
_mm_storeu_si128((__m128i *)SI(p,7), iv[7]); SE(p,8);
#else
SE(p,4);
#endif
#else
SE(p,2);
#endif
#else // Nibble
@@ -631,9 +756,9 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128( ov[3],2));
_mm_storel_epi64((__m128i *) p, ov[0]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
_mm_storel_epi64((__m128i *)SI(p,1), ov[1]);
_mm_storel_epi64((__m128i *)SI(p,2), ov[2]);
_mm_storel_epi64((__m128i *)SI(p,3), ov[3]);
#if ESIZE > 2
ov[0] = _mm_and_si128(iv[2], cl);
ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb);
@@ -651,10 +776,10 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb);
ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128( ov[3],2));
_mm_storel_epi64((__m128i *)(p+=stride), ov[0]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
_mm_storel_epi64((__m128i *)SI(p,4), ov[0]);
_mm_storel_epi64((__m128i *)SI(p,5), ov[1]);
_mm_storel_epi64((__m128i *)SI(p,6), ov[2]);
_mm_storel_epi64((__m128i *)SI(p,7), ov[3]);
#if ESIZE > 4
ov[0] = _mm_and_si128(iv[4], cl);
ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb);
@@ -672,10 +797,10 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb);
ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128( ov[3],2));
_mm_storel_epi64((__m128i *)(p+=stride), ov[0]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
_mm_storel_epi64((__m128i *)SI(p, 8), ov[0]);
_mm_storel_epi64((__m128i *)SI(p, 9), ov[1]);
_mm_storel_epi64((__m128i *)SI(p,10), ov[2]);
_mm_storel_epi64((__m128i *)SI(p,11), ov[3]);
ov[0] = _mm_and_si128(iv[6], cl);
ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb);
@@ -693,11 +818,13 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb);
ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128( ov[3],2));
_mm_storel_epi64((__m128i *)(p+=stride), ov[0]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
_mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
_mm_storel_epi64((__m128i *)SI(p,12), ov[0]);
_mm_storel_epi64((__m128i *)SI(p,13), ov[1]);
_mm_storel_epi64((__m128i *)SI(p,14), ov[2]);
_mm_storel_epi64((__m128i *)SI(p,15), ov[3]); SE(p,16);
#endif
#else
SE(p, 8);
#endif
#endif
}
@@ -718,7 +845,7 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
#if STRIDE > ESIZE
ov[0] = _mm_loadl_epi64((__m128i *) p );
ov[1] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[1] = _mm_loadl_epi64((__m128i *)SI(p,1));
ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4));
ov[0] = _mm_and_si128(ov[0], cl);
@@ -728,8 +855,8 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
iv[0] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]);
ov[2] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[3] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[2] = _mm_loadl_epi64((__m128i *)SI(p,2));
ov[3] = _mm_loadl_epi64((__m128i *)SI(p,3));
ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4));
ov[2] = _mm_and_si128(ov[2], cl);
@@ -739,8 +866,8 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
iv[1] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
#if ESIZE > 2
ov[0] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[1] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[0] = _mm_loadl_epi64((__m128i *)SI(p,4));
ov[1] = _mm_loadl_epi64((__m128i *)SI(p,5));
ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4));
ov[0] = _mm_and_si128(ov[0], cl);
@@ -750,19 +877,19 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
iv[2] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]);
ov[2] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[3] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[2] = _mm_loadl_epi64((__m128i *)SI(p,6));
ov[3] = _mm_loadl_epi64((__m128i *)SI(p,7));
ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4));
ov[2] = _mm_and_si128(ov[2], cl);
ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4));
ov[3] = _mm_and_si128(ov[3], cl);
iv[3] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
iv[3] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); SE(p,8);
#endif
#if ESIZE > 4
ov[0] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[1] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[0] = _mm_loadl_epi64((__m128i *)SI(p,0));
ov[1] = _mm_loadl_epi64((__m128i *)SI(p,1));
ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4));
ov[0] = _mm_and_si128(ov[0], cl);
@@ -772,8 +899,8 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
iv[4] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]);
ov[2] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[3] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[2] = _mm_loadl_epi64((__m128i *)SI(p,2));
ov[3] = _mm_loadl_epi64((__m128i *)SI(p,3));
ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4));
ov[2] = _mm_and_si128(ov[2], cl);
@@ -783,8 +910,8 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
iv[5] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
ov[0] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[1] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[0] = _mm_loadl_epi64((__m128i *)SI(p,4));
ov[1] = _mm_loadl_epi64((__m128i *)SI(p,5));
ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4));
ov[0] = _mm_and_si128(ov[0], cl);
@@ -794,43 +921,59 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
iv[6] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]);
ov[2] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[3] = _mm_loadl_epi64((__m128i *)(p+=stride));
ov[2] = _mm_loadl_epi64((__m128i *)SI(p,6));
ov[3] = _mm_loadl_epi64((__m128i *)SI(p,7));
ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4));
ov[2] = _mm_and_si128(ov[2], cl);
ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4));
ov[3] = _mm_and_si128(ov[3], cl);
iv[7] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]);
iv[7] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); SE(p,8);
#endif
#else
iv[0] = _mm_loadu_si128((__m128i *) p );
iv[1] = _mm_loadu_si128((__m128i *)(p+=stride));
iv[1] = _mm_loadu_si128((__m128i *)SI(p,1));
#if ESIZE > 2
iv[2] = _mm_loadu_si128((__m128i *)(p+=stride));
iv[3] = _mm_loadu_si128((__m128i *)(p+=stride));
iv[2] = _mm_loadu_si128((__m128i *)SI(p,2));
iv[3] = _mm_loadu_si128((__m128i *)SI(p,3));
#if ESIZE > 4
iv[4] = _mm_loadu_si128((__m128i *)(p+=stride));
iv[5] = _mm_loadu_si128((__m128i *)(p+=stride));
iv[6] = _mm_loadu_si128((__m128i *)(p+=stride));
iv[7] = _mm_loadu_si128((__m128i *)(p+=stride));
iv[4] = _mm_loadu_si128((__m128i *)SI(p,4));
iv[5] = _mm_loadu_si128((__m128i *)SI(p,5));
iv[6] = _mm_loadu_si128((__m128i *)SI(p,6));
iv[7] = _mm_loadu_si128((__m128i *)SI(p,7));
#endif
#endif
#endif
PREFETCH(ip+(ESIZE*16/STRIDE),0);
#if ESIZE == 2
#ifdef __ARM_NEON
uint8x16x2_t w; w.val[0] = (uint8x16_t)iv[0];
w.val[1] = (uint8x16_t)iv[1];
vst2q_u8(op, w);
#else
ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ST128((__m128i *)op, ov[0]);
ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]); ST128((__m128i *)(op+16), ov[1]); op += 32;
ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]); ST128((__m128i *)(op+16), ov[1]);
#endif
op += 32;
#elif ESIZE == 4
#ifdef __ARM_NEON
uint8x16x4_t w; w.val[0] = (uint8x16_t)iv[0];
w.val[1] = (uint8x16_t)iv[1];
w.val[2] = (uint8x16_t)iv[2];
w.val[3] = (uint8x16_t)iv[3]; vst4q_u8(op,w);
#else
ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]);
ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
ov[2] = _mm_unpacklo_epi8(iv[2], iv[3]);
ov[3] = _mm_unpackhi_epi8(iv[2], iv[3]);
iv[0] = _mm_unpacklo_epi16(ov[0], ov[2]); ST128((__m128i *) op, iv[0]);
iv[1] = _mm_unpackhi_epi16(ov[0], ov[2]); ST128((__m128i *)(op+16),iv[1]);
iv[2] = _mm_unpacklo_epi16(ov[1], ov[3]); ST128((__m128i *)(op+32),iv[2]);
iv[3] = _mm_unpackhi_epi16(ov[1], ov[3]); ST128((__m128i *)(op+48),iv[3]); op += 64;
iv[3] = _mm_unpackhi_epi16(ov[1], ov[3]); ST128((__m128i *)(op+48),iv[3]);
#endif
op += 64;
#else
ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]);
ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);