Transform: Byte+Nibble Transpose/Shuffle
This commit is contained in:
92
transpose.c
92
transpose.c
@ -42,8 +42,11 @@
|
||||
#include "conf.h"
|
||||
#include "transpose.h"
|
||||
|
||||
#define PREFETCH(_ip_) __builtin_prefetch(_ip_+512,0)
|
||||
//#define PREFETCH(ip)
|
||||
#ifdef __ARM_NEON
|
||||
#define PREFETCH(_ip_,_rw_)
|
||||
#else
|
||||
#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
|
||||
#endif
|
||||
|
||||
#define powof2(n) !((n)&((n)-1))
|
||||
|
||||
@ -120,9 +123,7 @@
|
||||
#include "transpose.c"
|
||||
|
||||
//--------------------- CPU detection -------------------------------------------
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||
#include <intrin.h>
|
||||
#elif defined(__INTEL_COMPILER)
|
||||
#if (_MSC_VER >=1300) || defined (__INTEL_COMPILER)
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
@ -245,7 +246,7 @@ void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
|
||||
if(!tpset) tpini(0);
|
||||
if(esize <= 16 && (f = _tpd[esize])) f(in,n,out);
|
||||
else {
|
||||
unsigned i,stride=n/esize;
|
||||
unsigned i, stride = n/esize;
|
||||
unsigned char *op,*ip;
|
||||
for(op = out,ip = in; op < out+stride*esize; ip++)
|
||||
for(i = 0; i < esize; i++)
|
||||
@ -255,6 +256,57 @@ void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
|
||||
}
|
||||
}
|
||||
|
||||
#define ODX2 e + (_x + _y * x) * esize //_x + e + _y * x //+ e * y * x
|
||||
void tp2denc(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize) {
|
||||
unsigned _x,_y; int e; uint8_t *op = out, *ip = in;
|
||||
for(e = esize-1; e >= 0; e--)
|
||||
for( _x = 0; _x < x; _x++)
|
||||
for(_y = 0; _y < y; _y++) op[ODX2] = *ip++;
|
||||
}
|
||||
|
||||
void tp2ddec(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize) {
|
||||
unsigned _x,_y; int e; uint8_t *op=out,*ip=in;
|
||||
for(e = esize-1; e >= 0; e--)
|
||||
for( _x = 0; _x < x; _x++)
|
||||
for(_y = 0; _y < y; _y++) *op++/*[_x * y * z + _y * z + _z]*/ = ip[ODX2];
|
||||
}
|
||||
|
||||
#define ODX3 e + (_x + _y * x + _z * y * x) * esize
|
||||
void tp3denc(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { //memcpy(out,in, x*y*z*esize); return;
|
||||
unsigned _x,_y,_z; int e; uint8_t *op = out, *ip=in;
|
||||
for(e = esize-1; e >= 0; e--)
|
||||
for( _x = 0; _x < x; _x++)
|
||||
for( _y = 0; _y < y; _y++)
|
||||
for(_z = 0; _z < z; _z++) op[ODX3] = *ip++;
|
||||
}
|
||||
|
||||
void tp3ddec(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { //memcpy(out,in, x*y*z*esize); return;
|
||||
unsigned _x,_y,_z; int e; uint8_t *op=out,*ip=in;
|
||||
for(e = esize-1; e >= 0; e--)
|
||||
for(_x = 0; _x < x; ++_x)
|
||||
for(_y = 0; _y < y; ++_y)
|
||||
for(_z = 0; _z < z; ++_z) *op++= ip[ODX3]; /*[_x * y * z + _y * z + _z]*/
|
||||
}
|
||||
|
||||
#define ODX4 e + (_w + _x * w + _y * x * w + _z * x * y * w) * esize
|
||||
void tp4denc(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { //memcpy(out,in, x*y*z*esize); return;
|
||||
unsigned _w,_x,_y,_z; int e; uint8_t *op = out, *ip=in;
|
||||
for(e = esize-1; e >= 0; e--)
|
||||
for( _w = 0; _w < w; _w++)
|
||||
for( _x = 0; _x < x; _x++)
|
||||
for( _y = 0; _y < y; _y++)
|
||||
for(_z = 0; _z < z; _z++) op[ODX4] = *ip++;
|
||||
}
|
||||
|
||||
void tp4ddec(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize) { //memcpy(out,in, x*y*z*esize); return;
|
||||
unsigned _w,_x,_y,_z; int e; uint8_t *op=out,*ip=in;
|
||||
for(e = esize-1; e >= 0; e--)
|
||||
for( _w = 0; _w < w; _w++)
|
||||
for( _x = 0; _x < x; ++_x)
|
||||
for( _y = 0; _y < y; ++_y)
|
||||
for(_z = 0; _z < z; ++_z) *op++= ip[ODX4]; /*[_x * y * z + _y * z + _z]*/
|
||||
}
|
||||
|
||||
#ifdef USE_SSE
|
||||
void tp4enc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
|
||||
TPFUNC f;
|
||||
@ -395,14 +447,14 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
#if ESIZE == 2
|
||||
ov[0] = LD128((__m128i *)ip); ov[0] = _mm_shuffle_epi8(ov[0], sv);
|
||||
ov[1] = LD128((__m128i *)(ip+16)); ov[1] = _mm_shuffle_epi8(ov[1], sv); ip+= 32; PREFETCH(ip);
|
||||
ov[1] = LD128((__m128i *)(ip+16)); ov[1] = _mm_shuffle_epi8(ov[1], sv); ip+= 32; PREFETCH(ip+512,0);
|
||||
iv[0] = _mm_unpacklo_epi64(ov[0], ov[1]);
|
||||
iv[1] = _mm_unpackhi_epi64(ov[0], ov[1]);
|
||||
#elif ESIZE == 4
|
||||
iv[0] = LD128((__m128i *) ip ); iv[0] = _mm_shuffle_epi8(iv[0], sv);
|
||||
iv[1] = LD128((__m128i *)(ip+16)); iv[1] = _mm_shuffle_epi8(iv[1], sv);
|
||||
iv[2] = LD128((__m128i *)(ip+32)); iv[2] = _mm_shuffle_epi8(iv[2], sv);
|
||||
iv[3] = LD128((__m128i *)(ip+48)); iv[3] = _mm_shuffle_epi8(iv[3], sv); ip += 64; PREFETCH(ip);
|
||||
iv[3] = LD128((__m128i *)(ip+48)); iv[3] = _mm_shuffle_epi8(iv[3], sv); ip += 64; PREFETCH(ip+512,0);
|
||||
|
||||
ov[0] = _mm_unpacklo_epi32(iv[0], iv[1]);
|
||||
ov[1] = _mm_unpackhi_epi32(iv[0], iv[1]);
|
||||
@ -433,7 +485,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
ov[4] = LD128((__m128i *) ip ); ov[4] = _mm_shuffle_epi8(ov[4], sv);
|
||||
ov[5] = LD128((__m128i *)(ip+16)); ov[5] = _mm_shuffle_epi8(ov[5], sv);
|
||||
ov[6] = LD128((__m128i *)(ip+32)); ov[6] = _mm_shuffle_epi8(ov[6], sv);
|
||||
ov[7] = LD128((__m128i *)(ip+48)); ov[7] = _mm_shuffle_epi8(ov[7], sv); ip += 64; PREFETCH(ip);
|
||||
ov[7] = LD128((__m128i *)(ip+48)); ov[7] = _mm_shuffle_epi8(ov[7], sv); ip += 64; PREFETCH(ip+512,0);
|
||||
|
||||
iv[4] = _mm_unpacklo_epi16(ov[4], ov[5]);
|
||||
iv[5] = _mm_unpackhi_epi16(ov[4], ov[5]);
|
||||
@ -459,7 +511,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#if ESIZE == 2
|
||||
iv[0] = LD128((__m128i *)ip); ip += 16;
|
||||
iv[1] = LD128((__m128i *)ip); ip += 16; PREFETCH(ip);
|
||||
iv[1] = LD128((__m128i *)ip); ip += 16; PREFETCH(ip+512,0);
|
||||
|
||||
ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]);
|
||||
ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
|
||||
@ -476,7 +528,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
iv[0] = LD128((__m128i *) ip );
|
||||
iv[1] = LD128((__m128i *)(ip+16));
|
||||
iv[2] = LD128((__m128i *)(ip+32));
|
||||
iv[3] = LD128((__m128i *)(ip+48)); ip += 64; PREFETCH(ip);
|
||||
iv[3] = LD128((__m128i *)(ip+48)); ip += 64; PREFETCH(ip+512,0);
|
||||
|
||||
ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]);
|
||||
ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
|
||||
@ -505,7 +557,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
iv[4] = LD128((__m128i *)(ip+ 64));
|
||||
iv[5] = LD128((__m128i *)(ip+ 80));
|
||||
iv[6] = LD128((__m128i *)(ip+ 96));
|
||||
iv[7] = LD128((__m128i *)(ip+112)); ip += 128; PREFETCH(ip);
|
||||
iv[7] = LD128((__m128i *)(ip+112)); ip += 128; PREFETCH(ip+512,0);
|
||||
|
||||
ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]);
|
||||
ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]);
|
||||
@ -546,7 +598,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
#endif
|
||||
|
||||
#if STRIDE <= ESIZE
|
||||
_mm_storeu_si128((__m128i *) p, iv[0]);
|
||||
_mm_storeu_si128((__m128i *) p, iv[0]);
|
||||
_mm_storeu_si128((__m128i *)(p+=stride), iv[1]);
|
||||
#if ESIZE > 2
|
||||
_mm_storeu_si128((__m128i *)(p+=stride), iv[2]);
|
||||
@ -576,7 +628,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb);
|
||||
ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128( ov[3],2));
|
||||
|
||||
_mm_storel_epi64((__m128i *) p, ov[0]);
|
||||
_mm_storel_epi64((__m128i *) p, ov[0]);
|
||||
_mm_storel_epi64((__m128i *)(p+=stride), ov[1]);
|
||||
_mm_storel_epi64((__m128i *)(p+=stride), ov[2]);
|
||||
_mm_storel_epi64((__m128i *)(p+=stride), ov[3]);
|
||||
@ -764,7 +816,7 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
PREFETCH(ip+(ESIZE*16/STRIDE));
|
||||
PREFETCH(ip+(ESIZE*16/STRIDE),0);
|
||||
#if ESIZE == 2
|
||||
ov[0] = _mm_unpacklo_epi8( iv[0], iv[1]); ST128((__m128i *)op, ov[0]);
|
||||
ov[1] = _mm_unpackhi_epi8( iv[0], iv[1]); ST128((__m128i *)(op+16), ov[1]); op += 32;
|
||||
@ -903,12 +955,12 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
#if ESIZE == 2
|
||||
#if 0
|
||||
ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv);
|
||||
ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv); PREFETCH(ip);
|
||||
ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv); PREFETCH(ip+512,0);
|
||||
iv[0] = _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(ov[0], ov[1]), _MM_SHUFFLE(3, 1, 2, 0));
|
||||
iv[1] = _mm256_permute4x64_epi64(_mm256_unpackhi_epi64(ov[0], ov[1]), _MM_SHUFFLE(3, 1, 2, 0));
|
||||
#else
|
||||
ov[0] = _mm256_shuffle_epi8(LD256((__m256i *)ip), sv0);
|
||||
ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)),sv1); PREFETCH(ip);
|
||||
ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)),sv1); PREFETCH(ip+512,0);
|
||||
iv[0] = _mm256_permute4x64_epi64(_mm256_blend_epi32(ov[0], ov[1],0b11001100),_MM_SHUFFLE(3, 1, 2, 0));
|
||||
iv[1] = _mm256_blend_epi32(ov[0], ov[1],0b00110011);
|
||||
iv[1] = _mm256_permute4x64_epi64(_mm256_shuffle_epi32(iv[1],_MM_SHUFFLE(1, 0, 3, 2)),_MM_SHUFFLE(3, 1, 2, 0));
|
||||
@ -917,7 +969,7 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
iv[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv0);
|
||||
iv[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);
|
||||
iv[2] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+64)), sv0);
|
||||
iv[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv1); PREFETCH(ip);
|
||||
iv[3] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+96)), sv1); PREFETCH(ip+512,0);
|
||||
#if 0
|
||||
ov[0] = _mm256_unpacklo_epi32(iv[0], iv[1]);
|
||||
ov[1] = _mm256_unpackhi_epi32(iv[0], iv[1]);
|
||||
@ -963,7 +1015,7 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
ov[4] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+128)), sv);
|
||||
ov[5] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+160)), sv);
|
||||
ov[6] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+192)), sv);
|
||||
ov[7] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+224)), sv); PREFETCH(ip);
|
||||
ov[7] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+224)), sv); PREFETCH(ip+512,0);
|
||||
|
||||
iv[4] = _mm256_unpacklo_epi16(ov[4], ov[5]);
|
||||
iv[5] = _mm256_unpackhi_epi16(ov[4], ov[5]);
|
||||
@ -1143,7 +1195,7 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
PREFETCH(ip+ESIZE*32/STRIDE);
|
||||
PREFETCH(ip+ESIZE*32/STRIDE,0);
|
||||
#if ESIZE == 2
|
||||
ov[0] = _mm256_permute4x64_epi64(iv[0], _MM_SHUFFLE(3, 1, 2, 0));
|
||||
ov[1] = _mm256_permute4x64_epi64(iv[1], _MM_SHUFFLE(3, 1, 2, 0));
|
||||
|
||||
Reference in New Issue
Block a user