TurboPFor: Bit Unpacking SIMD/AVX2
This commit is contained in:
479
bitunpackv.c
479
bitunpackv.c
@ -22,21 +22,18 @@
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" SIMD Bit Packing
|
||||
#ifndef VSTO
|
||||
#include <stdio.h>
|
||||
|
||||
#include <strings.h>
|
||||
#include <stdio.h>
|
||||
#include <emmintrin.h>
|
||||
#include "conf.h"
|
||||
#include "bitutil.h"
|
||||
#include "bitunpack.h"
|
||||
#include "bitpack.h"
|
||||
|
||||
#define PAD8(__x) (((__x)+7)/8)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO( _op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, ov)
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#include "bitunpack128v.c"
|
||||
#include "bitunpack128v_.h"
|
||||
|
||||
#define BITUNBLK128V32_0(ip, _i_, _op_, _parm_) {__m128i ov;\
|
||||
VSTO0(_op_, 0, ov, _parm_);\
|
||||
@ -110,7 +107,7 @@ static ALIGNED(char, shuffles[16][16], 16) = {
|
||||
#define VSTO( _op_, _i_, _ov_, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m)
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m)
|
||||
#define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128()
|
||||
#include "bitunpack128v.c"
|
||||
#include "bitunpack128v_.h"
|
||||
|
||||
unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
const unsigned char *ip = in+PAD8(128*b); unsigned m;
|
||||
@ -126,7 +123,7 @@ unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG128x32(__ov); SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv)
|
||||
#include "bitunpack128v.c"
|
||||
#include "bitunpack128v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
@ -141,7 +138,7 @@ unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO(__op, i, __ov, __sv) SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv)
|
||||
#include "bitunpack128v.c"
|
||||
#include "bitunpack128v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
@ -163,7 +160,7 @@ unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
#define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m)
|
||||
#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack128v.c"
|
||||
#include "bitunpack128v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
@ -180,7 +177,7 @@ unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO(__op, i, __ov, __sv) SCANI128x32(__ov,__sv,cv); _mm_storeu_si128(__op++, __sv);
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv)
|
||||
#include "bitunpack128v.c"
|
||||
#include "bitunpack128v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4)
|
||||
|
||||
@ -201,7 +198,7 @@ unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n
|
||||
#define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m)
|
||||
#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCANI128x32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack128v.c"
|
||||
#include "bitunpack128v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_) mv = _mm_set1_epi32(0) //_parm_ = _mm_setzero_si128()
|
||||
|
||||
@ -216,47 +213,425 @@ unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned
|
||||
#undef BITUNPACK0
|
||||
#endif
|
||||
|
||||
#else
|
||||
#include "bitunpack128v_.h"
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
|
||||
#define BITUNPACK128V32(__ip, __nbits, __op, _parm_) { __m128i mv,*_ov=(__m128i *)__op,*_iv=(__m128i *)__ip; \
|
||||
switch(__nbits&0x3f) {\
|
||||
case 0: BITUNPACK0(_parm_); BITUNPACK128V32_0( _iv, _ov, _parm_); break;\
|
||||
case 1: mv = _mm_set1_epi32((1u<< 1)-1); BITUNPACK128V32_1( _iv, _ov, _parm_); break;\
|
||||
case 2: mv = _mm_set1_epi32((1u<< 2)-1); BITUNPACK128V32_2( _iv, _ov, _parm_); break;\
|
||||
case 3: mv = _mm_set1_epi32((1u<< 3)-1); BITUNPACK128V32_3( _iv, _ov, _parm_); break;\
|
||||
case 4: mv = _mm_set1_epi32((1u<< 4)-1); BITUNPACK128V32_4( _iv, _ov, _parm_); break;\
|
||||
case 5: mv = _mm_set1_epi32((1u<< 5)-1); BITUNPACK128V32_5( _iv, _ov, _parm_); break;\
|
||||
case 6: mv = _mm_set1_epi32((1u<< 6)-1); BITUNPACK128V32_6( _iv, _ov, _parm_); break;\
|
||||
case 7: mv = _mm_set1_epi32((1u<< 7)-1); BITUNPACK128V32_7( _iv, _ov, _parm_); break;\
|
||||
case 8: mv = _mm_set1_epi32((1u<< 8)-1); BITUNPACK128V32_8( _iv, _ov, _parm_); break;\
|
||||
case 9: mv = _mm_set1_epi32((1u<< 9)-1); BITUNPACK128V32_9( _iv, _ov, _parm_); break;\
|
||||
case 10: mv = _mm_set1_epi32((1u<<10)-1); BITUNPACK128V32_10(_iv, _ov, _parm_); break;\
|
||||
case 11: mv = _mm_set1_epi32((1u<<11)-1); BITUNPACK128V32_11(_iv, _ov, _parm_); break;\
|
||||
case 12: mv = _mm_set1_epi32((1u<<12)-1); BITUNPACK128V32_12(_iv, _ov, _parm_); break;\
|
||||
case 13: mv = _mm_set1_epi32((1u<<13)-1); BITUNPACK128V32_13(_iv, _ov, _parm_); break;\
|
||||
case 14: mv = _mm_set1_epi32((1u<<14)-1); BITUNPACK128V32_14(_iv, _ov, _parm_); break;\
|
||||
case 15: mv = _mm_set1_epi32((1u<<15)-1); BITUNPACK128V32_15(_iv, _ov, _parm_); break;\
|
||||
case 16: mv = _mm_set1_epi32((1u<<16)-1); BITUNPACK128V32_16(_iv, _ov, _parm_); break;\
|
||||
case 17: mv = _mm_set1_epi32((1u<<17)-1); BITUNPACK128V32_17(_iv, _ov, _parm_); break;\
|
||||
case 18: mv = _mm_set1_epi32((1u<<18)-1); BITUNPACK128V32_18(_iv, _ov, _parm_); break;\
|
||||
case 19: mv = _mm_set1_epi32((1u<<19)-1); BITUNPACK128V32_19(_iv, _ov, _parm_); break;\
|
||||
case 20: mv = _mm_set1_epi32((1u<<20)-1); BITUNPACK128V32_20(_iv, _ov, _parm_); break;\
|
||||
case 21: mv = _mm_set1_epi32((1u<<21)-1); BITUNPACK128V32_21(_iv, _ov, _parm_); break;\
|
||||
case 22: mv = _mm_set1_epi32((1u<<22)-1); BITUNPACK128V32_22(_iv, _ov, _parm_); break;\
|
||||
case 23: mv = _mm_set1_epi32((1u<<23)-1); BITUNPACK128V32_23(_iv, _ov, _parm_); break;\
|
||||
case 24: mv = _mm_set1_epi32((1u<<24)-1); BITUNPACK128V32_24(_iv, _ov, _parm_); break;\
|
||||
case 25: mv = _mm_set1_epi32((1u<<25)-1); BITUNPACK128V32_25(_iv, _ov, _parm_); break;\
|
||||
case 26: mv = _mm_set1_epi32((1u<<26)-1); BITUNPACK128V32_26(_iv, _ov, _parm_); break;\
|
||||
case 27: mv = _mm_set1_epi32((1u<<27)-1); BITUNPACK128V32_27(_iv, _ov, _parm_); break;\
|
||||
case 28: mv = _mm_set1_epi32((1u<<28)-1); BITUNPACK128V32_28(_iv, _ov, _parm_); break;\
|
||||
case 29: mv = _mm_set1_epi32((1u<<29)-1); BITUNPACK128V32_29(_iv, _ov, _parm_); break;\
|
||||
case 30: mv = _mm_set1_epi32((1u<<30)-1); BITUNPACK128V32_30(_iv, _ov, _parm_); break;\
|
||||
case 31: mv = _mm_set1_epi32((1u<<31)-1); BITUNPACK128V32_31(_iv, _ov, _parm_); break;\
|
||||
case 32: mv = _mm_set1_epi32((1ull<<32)-1);BITUNPACK128V32_32(_iv, _ov, _parm_); break;\
|
||||
case 33 ... 63: break;\
|
||||
}\
|
||||
}
|
||||
#ifdef __AVX512F__
|
||||
#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
|
||||
#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
|
||||
#else
|
||||
static unsigned char permv[256][8] __attribute__((aligned(32))) = {
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,1,1,1,1,
|
||||
1,0,1,1,1,1,1,1,
|
||||
0,1,2,2,2,2,2,2,
|
||||
1,1,0,1,1,1,1,1,
|
||||
0,2,1,2,2,2,2,2,
|
||||
2,0,1,2,2,2,2,2,
|
||||
0,1,2,3,3,3,3,3,
|
||||
1,1,1,0,1,1,1,1,
|
||||
0,2,2,1,2,2,2,2,
|
||||
2,0,2,1,2,2,2,2,
|
||||
0,1,3,2,3,3,3,3,
|
||||
2,2,0,1,2,2,2,2,
|
||||
0,3,1,2,3,3,3,3,
|
||||
3,0,1,2,3,3,3,3,
|
||||
0,1,2,3,4,4,4,4,
|
||||
1,1,1,1,0,1,1,1,
|
||||
0,2,2,2,1,2,2,2,
|
||||
2,0,2,2,1,2,2,2,
|
||||
0,1,3,3,2,3,3,3,
|
||||
2,2,0,2,1,2,2,2,
|
||||
0,3,1,3,2,3,3,3,
|
||||
3,0,1,3,2,3,3,3,
|
||||
0,1,2,4,3,4,4,4,
|
||||
2,2,2,0,1,2,2,2,
|
||||
0,3,3,1,2,3,3,3,
|
||||
3,0,3,1,2,3,3,3,
|
||||
0,1,4,2,3,4,4,4,
|
||||
3,3,0,1,2,3,3,3,
|
||||
0,4,1,2,3,4,4,4,
|
||||
4,0,1,2,3,4,4,4,
|
||||
0,1,2,3,4,5,5,5,
|
||||
1,1,1,1,1,0,1,1,
|
||||
0,2,2,2,2,1,2,2,
|
||||
2,0,2,2,2,1,2,2,
|
||||
0,1,3,3,3,2,3,3,
|
||||
2,2,0,2,2,1,2,2,
|
||||
0,3,1,3,3,2,3,3,
|
||||
3,0,1,3,3,2,3,3,
|
||||
0,1,2,4,4,3,4,4,
|
||||
2,2,2,0,2,1,2,2,
|
||||
0,3,3,1,3,2,3,3,
|
||||
3,0,3,1,3,2,3,3,
|
||||
0,1,4,2,4,3,4,4,
|
||||
3,3,0,1,3,2,3,3,
|
||||
0,4,1,2,4,3,4,4,
|
||||
4,0,1,2,4,3,4,4,
|
||||
0,1,2,3,5,4,5,5,
|
||||
2,2,2,2,0,1,2,2,
|
||||
0,3,3,3,1,2,3,3,
|
||||
3,0,3,3,1,2,3,3,
|
||||
0,1,4,4,2,3,4,4,
|
||||
3,3,0,3,1,2,3,3,
|
||||
0,4,1,4,2,3,4,4,
|
||||
4,0,1,4,2,3,4,4,
|
||||
0,1,2,5,3,4,5,5,
|
||||
3,3,3,0,1,2,3,3,
|
||||
0,4,4,1,2,3,4,4,
|
||||
4,0,4,1,2,3,4,4,
|
||||
0,1,5,2,3,4,5,5,
|
||||
4,4,0,1,2,3,4,4,
|
||||
0,5,1,2,3,4,5,5,
|
||||
5,0,1,2,3,4,5,5,
|
||||
0,1,2,3,4,5,6,6,
|
||||
1,1,1,1,1,1,0,1,
|
||||
0,2,2,2,2,2,1,2,
|
||||
2,0,2,2,2,2,1,2,
|
||||
0,1,3,3,3,3,2,3,
|
||||
2,2,0,2,2,2,1,2,
|
||||
0,3,1,3,3,3,2,3,
|
||||
3,0,1,3,3,3,2,3,
|
||||
0,1,2,4,4,4,3,4,
|
||||
2,2,2,0,2,2,1,2,
|
||||
0,3,3,1,3,3,2,3,
|
||||
3,0,3,1,3,3,2,3,
|
||||
0,1,4,2,4,4,3,4,
|
||||
3,3,0,1,3,3,2,3,
|
||||
0,4,1,2,4,4,3,4,
|
||||
4,0,1,2,4,4,3,4,
|
||||
0,1,2,3,5,5,4,5,
|
||||
2,2,2,2,0,2,1,2,
|
||||
0,3,3,3,1,3,2,3,
|
||||
3,0,3,3,1,3,2,3,
|
||||
0,1,4,4,2,4,3,4,
|
||||
3,3,0,3,1,3,2,3,
|
||||
0,4,1,4,2,4,3,4,
|
||||
4,0,1,4,2,4,3,4,
|
||||
0,1,2,5,3,5,4,5,
|
||||
3,3,3,0,1,3,2,3,
|
||||
0,4,4,1,2,4,3,4,
|
||||
4,0,4,1,2,4,3,4,
|
||||
0,1,5,2,3,5,4,5,
|
||||
4,4,0,1,2,4,3,4,
|
||||
0,5,1,2,3,5,4,5,
|
||||
5,0,1,2,3,5,4,5,
|
||||
0,1,2,3,4,6,5,6,
|
||||
2,2,2,2,2,0,1,2,
|
||||
0,3,3,3,3,1,2,3,
|
||||
3,0,3,3,3,1,2,3,
|
||||
0,1,4,4,4,2,3,4,
|
||||
3,3,0,3,3,1,2,3,
|
||||
0,4,1,4,4,2,3,4,
|
||||
4,0,1,4,4,2,3,4,
|
||||
0,1,2,5,5,3,4,5,
|
||||
3,3,3,0,3,1,2,3,
|
||||
0,4,4,1,4,2,3,4,
|
||||
4,0,4,1,4,2,3,4,
|
||||
0,1,5,2,5,3,4,5,
|
||||
4,4,0,1,4,2,3,4,
|
||||
0,5,1,2,5,3,4,5,
|
||||
5,0,1,2,5,3,4,5,
|
||||
0,1,2,3,6,4,5,6,
|
||||
3,3,3,3,0,1,2,3,
|
||||
0,4,4,4,1,2,3,4,
|
||||
4,0,4,4,1,2,3,4,
|
||||
0,1,5,5,2,3,4,5,
|
||||
4,4,0,4,1,2,3,4,
|
||||
0,5,1,5,2,3,4,5,
|
||||
5,0,1,5,2,3,4,5,
|
||||
0,1,2,6,3,4,5,6,
|
||||
4,4,4,0,1,2,3,4,
|
||||
0,5,5,1,2,3,4,5,
|
||||
5,0,5,1,2,3,4,5,
|
||||
0,1,6,2,3,4,5,6,
|
||||
5,5,0,1,2,3,4,5,
|
||||
0,6,1,2,3,4,5,6,
|
||||
6,0,1,2,3,4,5,6,
|
||||
0,1,2,3,4,5,6,7,
|
||||
1,1,1,1,1,1,1,0,
|
||||
0,2,2,2,2,2,2,1,
|
||||
2,0,2,2,2,2,2,1,
|
||||
0,1,3,3,3,3,3,2,
|
||||
2,2,0,2,2,2,2,1,
|
||||
0,3,1,3,3,3,3,2,
|
||||
3,0,1,3,3,3,3,2,
|
||||
0,1,2,4,4,4,4,3,
|
||||
2,2,2,0,2,2,2,1,
|
||||
0,3,3,1,3,3,3,2,
|
||||
3,0,3,1,3,3,3,2,
|
||||
0,1,4,2,4,4,4,3,
|
||||
3,3,0,1,3,3,3,2,
|
||||
0,4,1,2,4,4,4,3,
|
||||
4,0,1,2,4,4,4,3,
|
||||
0,1,2,3,5,5,5,4,
|
||||
2,2,2,2,0,2,2,1,
|
||||
0,3,3,3,1,3,3,2,
|
||||
3,0,3,3,1,3,3,2,
|
||||
0,1,4,4,2,4,4,3,
|
||||
3,3,0,3,1,3,3,2,
|
||||
0,4,1,4,2,4,4,3,
|
||||
4,0,1,4,2,4,4,3,
|
||||
0,1,2,5,3,5,5,4,
|
||||
3,3,3,0,1,3,3,2,
|
||||
0,4,4,1,2,4,4,3,
|
||||
4,0,4,1,2,4,4,3,
|
||||
0,1,5,2,3,5,5,4,
|
||||
4,4,0,1,2,4,4,3,
|
||||
0,5,1,2,3,5,5,4,
|
||||
5,0,1,2,3,5,5,4,
|
||||
0,1,2,3,4,6,6,5,
|
||||
2,2,2,2,2,0,2,1,
|
||||
0,3,3,3,3,1,3,2,
|
||||
3,0,3,3,3,1,3,2,
|
||||
0,1,4,4,4,2,4,3,
|
||||
3,3,0,3,3,1,3,2,
|
||||
0,4,1,4,4,2,4,3,
|
||||
4,0,1,4,4,2,4,3,
|
||||
0,1,2,5,5,3,5,4,
|
||||
3,3,3,0,3,1,3,2,
|
||||
0,4,4,1,4,2,4,3,
|
||||
4,0,4,1,4,2,4,3,
|
||||
0,1,5,2,5,3,5,4,
|
||||
4,4,0,1,4,2,4,3,
|
||||
0,5,1,2,5,3,5,4,
|
||||
5,0,1,2,5,3,5,4,
|
||||
0,1,2,3,6,4,6,5,
|
||||
3,3,3,3,0,1,3,2,
|
||||
0,4,4,4,1,2,4,3,
|
||||
4,0,4,4,1,2,4,3,
|
||||
0,1,5,5,2,3,5,4,
|
||||
4,4,0,4,1,2,4,3,
|
||||
0,5,1,5,2,3,5,4,
|
||||
5,0,1,5,2,3,5,4,
|
||||
0,1,2,6,3,4,6,5,
|
||||
4,4,4,0,1,2,4,3,
|
||||
0,5,5,1,2,3,5,4,
|
||||
5,0,5,1,2,3,5,4,
|
||||
0,1,6,2,3,4,6,5,
|
||||
5,5,0,1,2,3,5,4,
|
||||
0,6,1,2,3,4,6,5,
|
||||
6,0,1,2,3,4,6,5,
|
||||
0,1,2,3,4,5,7,6,
|
||||
2,2,2,2,2,2,0,1,
|
||||
0,3,3,3,3,3,1,2,
|
||||
3,0,3,3,3,3,1,2,
|
||||
0,1,4,4,4,4,2,3,
|
||||
3,3,0,3,3,3,1,2,
|
||||
0,4,1,4,4,4,2,3,
|
||||
4,0,1,4,4,4,2,3,
|
||||
0,1,2,5,5,5,3,4,
|
||||
3,3,3,0,3,3,1,2,
|
||||
0,4,4,1,4,4,2,3,
|
||||
4,0,4,1,4,4,2,3,
|
||||
0,1,5,2,5,5,3,4,
|
||||
4,4,0,1,4,4,2,3,
|
||||
0,5,1,2,5,5,3,4,
|
||||
5,0,1,2,5,5,3,4,
|
||||
0,1,2,3,6,6,4,5,
|
||||
3,3,3,3,0,3,1,2,
|
||||
0,4,4,4,1,4,2,3,
|
||||
4,0,4,4,1,4,2,3,
|
||||
0,1,5,5,2,5,3,4,
|
||||
4,4,0,4,1,4,2,3,
|
||||
0,5,1,5,2,5,3,4,
|
||||
5,0,1,5,2,5,3,4,
|
||||
0,1,2,6,3,6,4,5,
|
||||
4,4,4,0,1,4,2,3,
|
||||
0,5,5,1,2,5,3,4,
|
||||
5,0,5,1,2,5,3,4,
|
||||
0,1,6,2,3,6,4,5,
|
||||
5,5,0,1,2,5,3,4,
|
||||
0,6,1,2,3,6,4,5,
|
||||
6,0,1,2,3,6,4,5,
|
||||
0,1,2,3,4,7,5,6,
|
||||
3,3,3,3,3,0,1,2,
|
||||
0,4,4,4,4,1,2,3,
|
||||
4,0,4,4,4,1,2,3,
|
||||
0,1,5,5,5,2,3,4,
|
||||
4,4,0,4,4,1,2,3,
|
||||
0,5,1,5,5,2,3,4,
|
||||
5,0,1,5,5,2,3,4,
|
||||
0,1,2,6,6,3,4,5,
|
||||
4,4,4,0,4,1,2,3,
|
||||
0,5,5,1,5,2,3,4,
|
||||
5,0,5,1,5,2,3,4,
|
||||
0,1,6,2,6,3,4,5,
|
||||
5,5,0,1,5,2,3,4,
|
||||
0,6,1,2,6,3,4,5,
|
||||
6,0,1,2,6,3,4,5,
|
||||
0,1,2,3,7,4,5,6,
|
||||
4,4,4,4,0,1,2,3,
|
||||
0,5,5,5,1,2,3,4,
|
||||
5,0,5,5,1,2,3,4,
|
||||
0,1,6,6,2,3,4,5,
|
||||
5,5,0,5,1,2,3,4,
|
||||
0,6,1,6,2,3,4,5,
|
||||
6,0,1,6,2,3,4,5,
|
||||
0,1,2,7,3,4,5,6,
|
||||
5,5,5,0,1,2,3,4,
|
||||
0,6,6,1,2,3,4,5,
|
||||
6,0,6,1,2,3,4,5,
|
||||
0,1,7,2,3,4,5,6,
|
||||
6,6,0,1,2,3,4,5,
|
||||
0,7,1,2,3,4,5,6,
|
||||
7,0,1,2,3,4,5,6,
|
||||
0,1,2,3,4,5,6,7
|
||||
};
|
||||
#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_)
|
||||
#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(permv[_m_]))) )
|
||||
#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(xm, _mm256_loadu_si256((__m256i*)pex)), u2vmask(xm,tv))
|
||||
#endif
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO( _op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, ov)
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_)
|
||||
#include "bitunpack256v_.h"
|
||||
|
||||
#define BITUNBLK256V32_0(ip, _i_, _op_, _parm_) {__m256i ov;\
|
||||
VSTO0(_op_, 0, ov, _parm_);\
|
||||
VSTO0(_op_, 1, ov, _parm_);\
|
||||
VSTO0(_op_, 2, ov, _parm_);\
|
||||
VSTO0(_op_, 3, ov, _parm_);\
|
||||
VSTO0(_op_, 4, ov, _parm_);\
|
||||
VSTO0(_op_, 5, ov, _parm_);\
|
||||
VSTO0(_op_, 6, ov, _parm_);\
|
||||
VSTO0(_op_, 7, ov, _parm_);\
|
||||
VSTO0(_op_, 8, ov, _parm_);\
|
||||
VSTO0(_op_, 9, ov, _parm_);\
|
||||
VSTO0(_op_, 10, ov, _parm_);\
|
||||
VSTO0(_op_, 11, ov, _parm_);\
|
||||
VSTO0(_op_, 12, ov, _parm_);\
|
||||
VSTO0(_op_, 13, ov, _parm_);\
|
||||
VSTO0(_op_, 14, ov, _parm_);\
|
||||
VSTO0(_op_, 15, ov, _parm_);\
|
||||
VSTO0(_op_, 16, ov, _parm_);\
|
||||
VSTO0(_op_, 17, ov, _parm_);\
|
||||
VSTO0(_op_, 18, ov, _parm_);\
|
||||
VSTO0(_op_, 19, ov, _parm_);\
|
||||
VSTO0(_op_, 20, ov, _parm_);\
|
||||
VSTO0(_op_, 21, ov, _parm_);\
|
||||
VSTO0(_op_, 22, ov, _parm_);\
|
||||
VSTO0(_op_, 23, ov, _parm_);\
|
||||
VSTO0(_op_, 24, ov, _parm_);\
|
||||
VSTO0(_op_, 25, ov, _parm_);\
|
||||
VSTO0(_op_, 26, ov, _parm_);\
|
||||
VSTO0(_op_, 27, ov, _parm_);\
|
||||
VSTO0(_op_, 28, ov, _parm_);\
|
||||
VSTO0(_op_, 29, ov, _parm_);\
|
||||
VSTO0(_op_, 30, ov, _parm_);\
|
||||
VSTO0(_op_, 31, ov, _parm_);\
|
||||
}
|
||||
#define BITUNPACK0(_parm_) _parm_ = _mm256_setzero_si256()
|
||||
|
||||
unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) {
|
||||
const unsigned char *ip = in+PAD8(256*b);
|
||||
__m256i sv;
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
|
||||
//--------------------------------------- zeromask unpack for TurboPFor vp4d.c --------------------------------------
|
||||
#define VSTO(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) )); pex += popcnt32(xm)
|
||||
#define VSTO0(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm)
|
||||
#define BITUNPACK0(_parm_)
|
||||
#include "bitunpack256v_.h"
|
||||
|
||||
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
//--------------------------------
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_)
|
||||
#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG256x32(__ov); SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv)
|
||||
#include "bitunpack256v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
const unsigned char *ip = in+PAD8(256*b);
|
||||
__m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256();
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
#undef VSTO
|
||||
#undef BITUNPACK0
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO(__op, i, __ov, __sv) SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv)
|
||||
#include "bitunpack256v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
const unsigned char *ip = in+PAD8(256*b);
|
||||
__m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256();
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VEXP(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm)
|
||||
#define VEXP0(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
|
||||
|
||||
#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack256v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
const unsigned char *ip = in+PAD8(256*b); unsigned xm;
|
||||
__m256i sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO(__op, i, __ov, __sv) SCANI256x32(__ov,__sv,cv); _mm256_storeu_si256(__op++, __sv);
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv)
|
||||
#include "bitunpack256v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8)
|
||||
|
||||
unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
const unsigned char *ip = in+PAD8(256*b);
|
||||
__m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256();
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack256v_.h"
|
||||
|
||||
#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128()
|
||||
|
||||
unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
const unsigned char *ip = in+PAD8(256*b); unsigned xm;
|
||||
__m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(),tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user