From 99ae5cd7872e1e9ed424b89efc976a9d373691e4 Mon Sep 17 00:00:00 2001 From: powturbo Date: Sun, 17 Apr 2016 16:21:23 +0200 Subject: [PATCH] Byte+Nibble Transpose/Shuffle --- transpose.c | 421 ++++++++++++++++++++++++++++++++++------------------ transpose.h | 35 +++-- 2 files changed, 294 insertions(+), 162 deletions(-) diff --git a/transpose.c b/transpose.c index f293ce7..d3f0e48 100644 --- a/transpose.c +++ b/transpose.c @@ -21,47 +21,196 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// transpose.c - byte transpose +// transpose.c - nibble/byte transpose - #ifndef ESIZE +#if !defined(TRANSPOSE) && !defined(TRANSPOSEV) #include + #ifdef __SSSE3__ +#include + #elif defined(__SSE2__) +#include + #endif + +#include "conf.h" #include "transpose.h" - + +#define TRANSPOSE transpose +#define UNTRANSPOSE untranspose + #define ESIZE 2 -#include __FILE__ +#include "transpose.c" #undef ESIZE #define ESIZE 3 -#include __FILE__ -#undef ESIZE - -#define ESIZE 4 -#include __FILE__ +#include "transpose.c" #undef ESIZE #define ESIZE 8 -#include __FILE__ +#include "transpose.c" #undef ESIZE #define ESIZE 16 -#include __FILE__ +#include "transpose.c" #undef ESIZE -#ifdef __SSSE3__ -#include -#elif defined(__SSE2__) -#include -#endif +#undef TRANSPOSE +#undef UNTRANSPOSE -void transposev4(unsigned char *in, unsigned n, unsigned char *out) { +#define ESIZE 4 +#define TRANSPOSE _transpose +#define UNTRANSPOSE _untranspose + +#define STRIDE 4 +#define TRANSPOSEV transpose +#define UNTRANSPOSEV untranspose +#include "transpose.c" +#undef ESIZE +#undef TRANSPOSE +#undef UNTRANSPOSE + +#undef STRIDE +#undef TRANSPOSEV +#undef UNTRANSPOSEV + +#define STRIDE 8 +#define TRANSPOSEV transposen +#define UNTRANSPOSEV untransposen +#include "transpose.c" + +void transpose(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { + switch(esize) { + case 2: transpose2 (in,n,out); break; + case 3: transpose3 (in,n,out); break; + case 4: transpose4( in,n,out); break; + case 8: transpose8 (in,n,out); break; + case 16: transpose16(in,n,out); break; + default: { + unsigned bsize = n/esize,i; + unsigned char *op,*ip; + for(ip = in,op = out; ip < in+bsize*esize; op++) + for(i = 0; i < esize; i++) + op[i*bsize] = *ip++; + for(op = out + esize*bsize; ip < in+n;) + *op++ = *ip++; + } + } +} + +void untranspose(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { + switch(esize) { + case 2: untranspose2 (in,n,out); break; + case 3: untranspose3 (in,n,out); break; + case 4: untranspose4( in,n,out); break; + case 8: untranspose8 (in,n,out); break; + case 16: untranspose16(in,n,out); break; + default: { + unsigned bsize = n/esize,i; + unsigned char *op,*ip; + for(op = out,ip = in; op < out+bsize*esize; ip++) + for(i = 0; i < esize; i++) + *op++ = ip[i*bsize]; + for(ip = in+esize*bsize; op < out+n;) + *op++ = *ip++; + } + } +} +#else + + #ifdef TRANSPOSE +#define powof2(n) !((n)&((n)-1)) + +void TEMPLATE2(TRANSPOSE, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { + unsigned bsize = n/ESIZE,i; + unsigned char *op,*ip,*e; + #if powof2(ESIZE) + e = in+(n&~(ESIZE-1)); + #else + e = in+bsize*ESIZE; + #endif + + for(ip = in,op = out; ip < e; op++) { + op[0] = *ip++; + op[i =bsize] = *ip++; + #if ESIZE > 2 + op[i+=bsize] = *ip++; + #if ESIZE > 3 + op[i+=bsize] = *ip++; + #if ESIZE > 4 + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + #if ESIZE > 8 + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + op[i+=bsize] = *ip++; + #endif + #endif + #endif + #endif + } + op = out+bsize*ESIZE; while(ip < in+n) *op++ = *ip++; +} + +void TEMPLATE2(UNTRANSPOSE, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { + unsigned bsize = n/ESIZE,i; + unsigned char *op,*ip,*e; + + #if powof2(ESIZE) + e = out+(n&~(ESIZE-1)); + #else + e = out+bsize*ESIZE; + #endif + for(op = out,ip = in; op < e; ip++) { + *op++ = ip[0]; + *op++ = ip[i =bsize]; + #if ESIZE > 2 + *op++ = ip[i+=bsize]; + #if ESIZE > 3 + *op++ = ip[i+=bsize]; + #if ESIZE > 4 + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + #if ESIZE > 8 + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + *op++ = ip[i+=bsize]; + #endif + #endif + #endif + #endif + } + ip = in+bsize*ESIZE; + while(op < out+n) + *op++ = *ip++; +} + #endif + + #ifdef TRANSPOSEV +void TEMPLATE2(TRANSPOSEV, 4)(unsigned char *in, unsigned n, unsigned char *out) { #ifdef __SSE2__ - unsigned v = n&~(64-1), bsize = v/4,i; + unsigned v = n&~(64-1), bsize = v/STRIDE,i; unsigned char *op,*ip; #ifdef __SSE3__ static unsigned char s[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; __m128i sv = _mm_loadu_si128((__m128i *)s); #endif - for(ip = in,op = out; ip != in+v; op += 16) { + #if STRIDE == 8 + __m128i cl = _mm_set1_epi8(0x0f),ch=_mm_set1_epi8(0xf0), cb = _mm_set1_epi16(0xff); + #endif + for(ip = in,op = out; ip != in+v; op += 64/STRIDE) { __m128i iv[4],ov[4]; #ifdef __SSSE3__ @@ -105,27 +254,124 @@ void transposev4(unsigned char *in, unsigned n, unsigned char *out) { iv[2] = _mm_unpacklo_epi64(ov[1], ov[3]); iv[3] = _mm_unpackhi_epi64(ov[1], ov[3]); #endif + + #if STRIDE == 8 + ov[0] = _mm_and_si128(iv[0], cl); + ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb); + ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128( ov[0],2)); + + ov[1] = _mm_srli_epi16(_mm_and_si128(iv[0], ch),4); + ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]),cb); + ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128( ov[1],2)); + + ov[2] = _mm_and_si128(iv[1], cl); + ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]),cb); + ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128( ov[2],2)); + + ov[3] = _mm_srli_epi16(_mm_and_si128(iv[1], ch),4); + ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb); + ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128( ov[3],2)); + + _mm_storel_epi64((__m128i *)op, ov[0]); + _mm_storel_epi64((__m128i *)(op+(i =bsize)), ov[1]); + _mm_storel_epi64((__m128i *)(op+(i+=bsize)), ov[2]); + _mm_storel_epi64((__m128i *)(op+(i+=bsize)), ov[3]); + + ov[0] = _mm_and_si128(iv[2], cl); + ov[0] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[0],4), ov[0]),cb); + ov[0] = _mm_packus_epi16(ov[0], _mm_srli_si128( ov[0],2)); + + ov[1] = _mm_srli_epi16(_mm_and_si128(iv[2], ch),4); + ov[1] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[1],4), ov[1]),cb); + ov[1] = _mm_packus_epi16(ov[1], _mm_srli_si128( ov[1],2)); + + ov[2] = _mm_and_si128(iv[3], cl); + ov[2] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[2],4), ov[2]),cb); + ov[2] = _mm_packus_epi16(ov[2], _mm_srli_si128( ov[2],2)); + + ov[3] = _mm_srli_epi16(_mm_and_si128(iv[3], ch),4); + ov[3] = _mm_and_si128(_mm_or_si128(_mm_srli_epi16(ov[3],4), ov[3]),cb); + ov[3] = _mm_packus_epi16(ov[3], _mm_srli_si128( ov[3],2)); + + _mm_storel_epi64((__m128i *)(op+(i+=bsize)), ov[0]); + _mm_storel_epi64((__m128i *)(op+(i+=bsize)), ov[1]); + _mm_storel_epi64((__m128i *)(op+(i+=bsize)), ov[2]); + _mm_storel_epi64((__m128i *)(op+(i+=bsize)), ov[3]); + + #else _mm_storeu_si128((__m128i *)op, iv[0]); _mm_storeu_si128((__m128i *)(op+(i =bsize)), iv[1]); _mm_storeu_si128((__m128i *)(op+(i+=bsize)), iv[2]); _mm_storeu_si128((__m128i *)(op+(i+=bsize)), iv[3]); + #endif } - transpose4(in+v, n-v, out+v); + _transpose4(in+v, n-v, out+v); #else - transpose4(in, n, out); + _transpose4(in, n, out); #endif } + #endif -void untransposev4(unsigned char *in, unsigned n, unsigned char *out) { + #ifdef UNTRANSPOSEV +void TEMPLATE2(UNTRANSPOSEV, 4)(unsigned char *in, unsigned n, unsigned char *out) { #ifdef __SSE2__ - unsigned v = n&~(64-1), bsize = v/4,i; + #if STRIDE == 8 + __m128i cl = _mm_set1_epi8(0x0f),ch=_mm_set1_epi8(0xf0), cb = _mm_set1_epi16(0xff); + #endif + unsigned v = n&~(64-1), bsize = v/STRIDE,i; unsigned char *op,*ip; - for(op = out,ip = in; op != out+v; ip += 16) { + for(op = out,ip = in; op != out+v; ip += 64/STRIDE) { __m128i iv[4], ov[4]; + + #if STRIDE == 8 + ov[0] = _mm_loadl_epi64((__m128i *) ip) ; + ov[1] = _mm_loadl_epi64((__m128i *)(ip+(i =bsize))); + + ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); + ov[0] = _mm_and_si128(ov[0], cl); + + ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); + ov[1] = _mm_and_si128(ov[1], cl); + iv[0] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); + + + ov[2] = _mm_loadl_epi64((__m128i *)(ip+(i+=bsize))); + ov[3] = _mm_loadl_epi64((__m128i *)(ip+(i+=bsize))); + + ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); + ov[2] = _mm_and_si128(ov[2], cl); + + ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); + ov[3] = _mm_and_si128(ov[3], cl); + iv[1] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); + + + ov[0] = _mm_loadl_epi64((__m128i *)(ip+(i+=bsize))); + ov[1] = _mm_loadl_epi64((__m128i *)(ip+(i+=bsize))); + + ov[0] = _mm_unpacklo_epi8(ov[0], _mm_srli_epi16(ov[0],4)); + ov[0] = _mm_and_si128(ov[0], cl); + + ov[1] = _mm_unpacklo_epi8(ov[1], _mm_srli_epi16(ov[1],4)); + ov[1] = _mm_and_si128(ov[1], cl); + iv[2] = _mm_or_si128(_mm_slli_epi16(ov[1],4), ov[0]); + + + ov[2] = _mm_loadl_epi64((__m128i *)(ip+(i+=bsize))); + ov[3] = _mm_loadl_epi64((__m128i *)(ip+(i+=bsize))); + + ov[2] = _mm_unpacklo_epi8(ov[2], _mm_srli_epi16(ov[2],4)); + ov[2] = _mm_and_si128(ov[2], cl); + + ov[3] = _mm_unpacklo_epi8(ov[3], _mm_srli_epi16(ov[3],4)); + ov[3] = _mm_and_si128(ov[3], cl); + iv[3] = _mm_or_si128(_mm_slli_epi16(ov[3],4), ov[2]); + #else iv[0] = _mm_loadu_si128((__m128i *) ip) ; iv[1] = _mm_loadu_si128((__m128i *)(ip+(i =bsize))); iv[2] = _mm_loadu_si128((__m128i *)(ip+(i+=bsize))); iv[3] = _mm_loadu_si128((__m128i *)(ip+(i+=bsize))); + #endif ov[0] = _mm_unpacklo_epi8(iv[0], iv[1]); ov[1] = _mm_unpackhi_epi8(iv[0], iv[1]); @@ -137,130 +383,11 @@ void untransposev4(unsigned char *in, unsigned n, unsigned char *out) { _mm_storeu_si128((__m128i *)op, _mm_unpacklo_epi16(ov[1], ov[3])); op += 16; _mm_storeu_si128((__m128i *)op, _mm_unpackhi_epi16(ov[1], ov[3])); op += 16; } - untranspose4(in+v, n-v, out+v); + _untranspose4(in+v, n-v, out+v); #else - transpose4(in, n, out); + _untranspose4(in, n, out); #endif } - -void transpose(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - switch(esize) { - case 2: transpose2 (in,n,out); break; - case 3: transpose3 (in,n,out); break; - case 4: transpose4 (in,n,out); break; - case 8: transpose8 (in,n,out); break; - case 16: transpose16(in,n,out); break; - default: { - unsigned bsize = n/esize,i; - unsigned char *op,*ip; - for(ip = in,op = out; ip < in+bsize*esize; op++) - for(i = 0; i < esize; i++) - op[i*bsize] = *ip++; - for(op += esize*bsize; ip < in+n;) - *op++ = *ip++; - } - } -} - -void untranspose(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) { - switch(esize) { - case 2: untranspose2 (in,n,out); break; - case 3: untranspose3 (in,n,out); break; - case 4: untranspose4 (in,n,out); break; - case 8: untranspose8 (in,n,out); break; - case 16: untranspose16(in,n,out); break; - default: { - unsigned bsize = n/esize,i; - unsigned char *op,*ip; - for(op = out,ip = in; op < out+bsize*esize; ip++) - for(i = 0; i < esize; i++) - *op++ = ip[i*bsize]; - for(ip += esize*bsize; op < out+n;) - *op++ = *ip++; - } - } -} - - #else - -#include "conf.h" -#define powof2(n) !((n)&((n)-1)) - -void TEMPLATE2(transpose, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned bsize = n/ESIZE,i; - unsigned char *op,*ip,*e; - #if powof2(ESIZE) - e = in+(n&~(ESIZE-1)); - #else - e = in+bsize*ESIZE; - #endif - - for(ip = in,op = out; ip < e; op++) { - op[0] = *ip++; - op[i =bsize] = *ip++; - #if ESIZE > 2 - op[i+=bsize] = *ip++; - #if ESIZE > 3 - op[i+=bsize] = *ip++; - #if ESIZE > 4 - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - #if ESIZE > 8 - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - op[i+=bsize] = *ip++; - #endif - #endif - #endif - #endif - } - op = out+bsize*ESIZE; while(ip < in+n) *op++ = *ip++; -} - -void TEMPLATE2(untranspose, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { - unsigned bsize = n/ESIZE,i; - unsigned char *op,*ip,*e; - - #if powof2(ESIZE) - e = out+(n&~(ESIZE-1)); - #else - e = out+bsize*ESIZE; - #endif - for(op = out,ip = in; op < e; ip++) { - *op++ = ip[0]; - *op++ = ip[i =bsize]; - #if ESIZE > 2 - *op++ = ip[i+=bsize]; - #if ESIZE > 3 - *op++ = ip[i+=bsize]; - #if ESIZE > 4 - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - #if ESIZE > 8 - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - *op++ = ip[i+=bsize]; - #endif - #endif - #endif - #endif - } - ip = in+bsize*ESIZE; - while(op < out+n) - *op++ = *ip++; -} #endif + +#endif diff --git a/transpose.h b/transpose.h index bd49b68..449ee8f 100644 --- a/transpose.h +++ b/transpose.h @@ -21,27 +21,32 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// transpose.h - byte transpose +// transpose.h - byte transpose #ifdef __cplusplus extern "C" { #endif +// Transpose/Shuffle block for further compressing with lz77 or other compressors +void transpose2 (unsigned char *in, unsigned n, unsigned char *out); +void transpose3 (unsigned char *in, unsigned n, unsigned char *out); +void transpose4 (unsigned char *in, unsigned n, unsigned char *out); +void transpose8 (unsigned char *in, unsigned n, unsigned char *out); +void transpose16 (unsigned char *in, unsigned n, unsigned char *out); +void transpose (unsigned char *in, unsigned n, unsigned char *out, unsigned esize); -void transpose2 (unsigned char *in, unsigned n, unsigned char *out); -void transpose3 (unsigned char *in, unsigned n, unsigned char *out); -void transpose4 (unsigned char *in, unsigned n, unsigned char *out); -void transpose8 (unsigned char *in, unsigned n, unsigned char *out); -void transpose16 (unsigned char *in, unsigned n, unsigned char *out); -void transpose (unsigned char *in, unsigned n, unsigned char *out, unsigned esize); +void untranspose2 (unsigned char *in, unsigned n, unsigned char *out); +void untranspose3 (unsigned char *in, unsigned n, unsigned char *out); +void untranspose4 (unsigned char *in, unsigned n, unsigned char *out); +void untranspose8 (unsigned char *in, unsigned n, unsigned char *out); +void untranspose16 (unsigned char *in, unsigned n, unsigned char *out); +void untranspose (unsigned char *in, unsigned n, unsigned char *out, unsigned esize); -void untranspose2 (unsigned char *in, unsigned n, unsigned char *out); -void untranspose3 (unsigned char *in, unsigned n, unsigned char *out); -void untranspose4 (unsigned char *in, unsigned n, unsigned char *out); -void untranspose8 (unsigned char *in, unsigned n, unsigned char *out); -void untranspose16 (unsigned char *in, unsigned n, unsigned char *out); -void untranspose (unsigned char *in, unsigned n, unsigned char *out, unsigned esize); +// scalar transpose +void _transpose4 (unsigned char *in, unsigned n, unsigned char *out); +void _untranspose4 (unsigned char *in, unsigned n, unsigned char *out); -void transposev4 (unsigned char *in, unsigned n, unsigned char *out); -void untransposev4(unsigned char *in, unsigned n, unsigned char *out); +// Nibble transpose +void transposen4 (unsigned char *in, unsigned n, unsigned char *out); +void untransposen4 (unsigned char *in, unsigned n, unsigned char *out); #ifdef __cplusplus }