diff --git a/bitunpackv.c b/bitunpackv.c
index 2e20117..d03ffdf 100644
--- a/bitunpackv.c
+++ b/bitunpackv.c
@@ -22,21 +22,18 @@
-    email    : powturbo [_AT_] gmail [_DOT_] com
 **/
 //    "Integer Compression" SIMD Bit Packing
-
   #ifndef VSTO
-#include
-
-#include
+#include
 #include
 #include "conf.h"
 #include "bitutil.h"
-#include "bitunpack.h"
+#include "bitpack.h"
 #define PAD8(__x) (((__x)+7)/8)
 //-----------------------------------------------------------------------------
 #define VSTO( _op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, ov)
 #define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_)
-#include "bitunpack128v.c"
+#include "bitunpack128v_.h"
 #define BITUNBLK128V32_0(ip, _i_, _op_, _parm_) {__m128i ov;\
   VSTO0(_op_, 0, ov, _parm_);\
@@ -110,7 +107,7 @@ static ALIGNED(char, shuffles[16][16], 16) = {
 #define VSTO( _op_, _i_, _ov_, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m)
 #define VSTO0(_op_, _i_, ov, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m)
 #define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128()
-#include "bitunpack128v.c"
+#include "bitunpack128v_.h"
 unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
   const unsigned char *ip = in+PAD8(128*b); unsigned m;
@@ -126,7 +123,7 @@ unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n,
 //-----------------------------------------------------------------------------
 #define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_)
 #define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG128x32(__ov); SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv)
-#include "bitunpack128v.c"
+#include "bitunpack128v_.h"
 #define BITUNPACK0(_parm_)
@@ -141,7 +138,7 @@ unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n,
 //-----------------------------------------------------------------------------
 #define VSTO(__op, i, __ov, __sv) SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv)
-#include "bitunpack128v.c"
+#include "bitunpack128v_.h"
 #define BITUNPACK0(_parm_)
@@ -163,7 +160,7 @@ unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n,
 #define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m)
 #define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
-#include "bitunpack128v.c"
+#include "bitunpack128v_.h"
 #define BITUNPACK0(_parm_)
@@ -180,7 +177,7 @@ unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n
 //-----------------------------------------------------------------------------
 #define VSTO(__op, i, __ov, __sv) SCANI128x32(__ov,__sv,cv); _mm_storeu_si128(__op++, __sv);
 #define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv)
-#include "bitunpack128v.c"
+#include "bitunpack128v_.h"
 #define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4)
@@ -201,7 +198,7 @@ unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n
 #define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m)
 #define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCANI128x32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
-#include "bitunpack128v.c"
+#include "bitunpack128v_.h"
 #define BITUNPACK0(_parm_) mv = _mm_set1_epi32(0) //_parm_ = _mm_setzero_si128()
@@ -216,47 +213,425 @@ unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned
 #undef BITUNPACK0
 #endif
-  #else
-#include "bitunpack128v_.h"
+  #ifdef __AVX2__
+#include
-#define BITUNPACK128V32(__ip, __nbits, __op, _parm_) { __m128i mv,*_ov=(__m128i *)__op,*_iv=(__m128i *)__ip; \
-  switch(__nbits&0x3f) {\
-    case  0: BITUNPACK0(_parm_); BITUNPACK128V32_0( _iv, _ov, _parm_); break;\
-    case  1: mv = _mm_set1_epi32((1u<< 1)-1); BITUNPACK128V32_1( _iv, _ov, _parm_); break;\
-    case  2: mv = _mm_set1_epi32((1u<< 2)-1); BITUNPACK128V32_2( _iv, _ov, _parm_); break;\
-    case  3: mv = _mm_set1_epi32((1u<< 3)-1); BITUNPACK128V32_3( _iv, _ov, _parm_); break;\
-    case  4: mv = _mm_set1_epi32((1u<< 4)-1); BITUNPACK128V32_4( _iv, _ov, _parm_); break;\
-    case  5: mv = _mm_set1_epi32((1u<< 5)-1); BITUNPACK128V32_5( _iv, _ov, _parm_); break;\
-    case  6: mv = _mm_set1_epi32((1u<< 6)-1); BITUNPACK128V32_6( _iv, _ov, _parm_); break;\
-    case  7: mv = _mm_set1_epi32((1u<< 7)-1); BITUNPACK128V32_7( _iv, _ov, _parm_); break;\
-    case  8: mv = _mm_set1_epi32((1u<< 8)-1); BITUNPACK128V32_8( _iv, _ov, _parm_); break;\
-    case  9: mv = _mm_set1_epi32((1u<< 9)-1); BITUNPACK128V32_9( _iv, _ov, _parm_); break;\
-    case 10: mv = _mm_set1_epi32((1u<<10)-1); BITUNPACK128V32_10(_iv, _ov, _parm_); break;\
-    case 11: mv = _mm_set1_epi32((1u<<11)-1); BITUNPACK128V32_11(_iv, _ov, _parm_); break;\
-    case 12: mv = _mm_set1_epi32((1u<<12)-1); BITUNPACK128V32_12(_iv, _ov, _parm_); break;\
-    case 13: mv = _mm_set1_epi32((1u<<13)-1); BITUNPACK128V32_13(_iv, _ov, _parm_); break;\
-    case 14: mv = _mm_set1_epi32((1u<<14)-1); BITUNPACK128V32_14(_iv, _ov, _parm_); break;\
-    case 15: mv = _mm_set1_epi32((1u<<15)-1); BITUNPACK128V32_15(_iv, _ov, _parm_); break;\
-    case 16: mv = _mm_set1_epi32((1u<<16)-1); BITUNPACK128V32_16(_iv, _ov, _parm_); break;\
-    case 17: mv = _mm_set1_epi32((1u<<17)-1); BITUNPACK128V32_17(_iv, _ov, _parm_); break;\
-    case 18: mv = _mm_set1_epi32((1u<<18)-1); BITUNPACK128V32_18(_iv, _ov, _parm_); break;\
-    case 19: mv = _mm_set1_epi32((1u<<19)-1); BITUNPACK128V32_19(_iv, _ov, _parm_); break;\
-    case 20: mv = _mm_set1_epi32((1u<<20)-1); BITUNPACK128V32_20(_iv, _ov, _parm_); break;\
-    case 21: mv = _mm_set1_epi32((1u<<21)-1); BITUNPACK128V32_21(_iv, _ov, _parm_); break;\
-    case 22: mv = _mm_set1_epi32((1u<<22)-1); BITUNPACK128V32_22(_iv, _ov, _parm_); break;\
-    case 23: mv = _mm_set1_epi32((1u<<23)-1); BITUNPACK128V32_23(_iv, _ov, _parm_); break;\
-    case 24: mv = _mm_set1_epi32((1u<<24)-1); BITUNPACK128V32_24(_iv, _ov, _parm_); break;\
-    case 25: mv = _mm_set1_epi32((1u<<25)-1); BITUNPACK128V32_25(_iv, _ov, _parm_); break;\
-    case 26: mv = _mm_set1_epi32((1u<<26)-1); BITUNPACK128V32_26(_iv, _ov, _parm_); break;\
-    case 27: mv = _mm_set1_epi32((1u<<27)-1); BITUNPACK128V32_27(_iv, _ov, _parm_); break;\
-    case 28: mv = _mm_set1_epi32((1u<<28)-1); BITUNPACK128V32_28(_iv, _ov, _parm_); break;\
-    case 29: mv = _mm_set1_epi32((1u<<29)-1); BITUNPACK128V32_29(_iv, _ov, _parm_); break;\
-    case 30: mv = _mm_set1_epi32((1u<<30)-1); BITUNPACK128V32_30(_iv,
_ov, _parm_); break;\ - case 31: mv = _mm_set1_epi32((1u<<31)-1); BITUNPACK128V32_31(_iv, _ov, _parm_); break;\ - case 32: mv = _mm_set1_epi32((1ull<<32)-1);BITUNPACK128V32_32(_iv, _ov, _parm_); break;\ - case 33 ... 63: break;\ - }\ -} + #ifdef __AVX512F__ +#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_) +#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_) + #else +static unsigned char permv[256][8] __attribute__((aligned(32))) = { +0,0,0,0,0,0,0,0, +0,1,1,1,1,1,1,1, +1,0,1,1,1,1,1,1, +0,1,2,2,2,2,2,2, +1,1,0,1,1,1,1,1, +0,2,1,2,2,2,2,2, +2,0,1,2,2,2,2,2, +0,1,2,3,3,3,3,3, +1,1,1,0,1,1,1,1, +0,2,2,1,2,2,2,2, +2,0,2,1,2,2,2,2, +0,1,3,2,3,3,3,3, +2,2,0,1,2,2,2,2, +0,3,1,2,3,3,3,3, +3,0,1,2,3,3,3,3, +0,1,2,3,4,4,4,4, +1,1,1,1,0,1,1,1, +0,2,2,2,1,2,2,2, +2,0,2,2,1,2,2,2, +0,1,3,3,2,3,3,3, +2,2,0,2,1,2,2,2, +0,3,1,3,2,3,3,3, +3,0,1,3,2,3,3,3, +0,1,2,4,3,4,4,4, +2,2,2,0,1,2,2,2, +0,3,3,1,2,3,3,3, +3,0,3,1,2,3,3,3, +0,1,4,2,3,4,4,4, +3,3,0,1,2,3,3,3, +0,4,1,2,3,4,4,4, +4,0,1,2,3,4,4,4, +0,1,2,3,4,5,5,5, +1,1,1,1,1,0,1,1, +0,2,2,2,2,1,2,2, +2,0,2,2,2,1,2,2, +0,1,3,3,3,2,3,3, +2,2,0,2,2,1,2,2, +0,3,1,3,3,2,3,3, +3,0,1,3,3,2,3,3, +0,1,2,4,4,3,4,4, +2,2,2,0,2,1,2,2, +0,3,3,1,3,2,3,3, +3,0,3,1,3,2,3,3, +0,1,4,2,4,3,4,4, +3,3,0,1,3,2,3,3, +0,4,1,2,4,3,4,4, +4,0,1,2,4,3,4,4, +0,1,2,3,5,4,5,5, +2,2,2,2,0,1,2,2, +0,3,3,3,1,2,3,3, +3,0,3,3,1,2,3,3, +0,1,4,4,2,3,4,4, +3,3,0,3,1,2,3,3, +0,4,1,4,2,3,4,4, +4,0,1,4,2,3,4,4, +0,1,2,5,3,4,5,5, +3,3,3,0,1,2,3,3, +0,4,4,1,2,3,4,4, +4,0,4,1,2,3,4,4, +0,1,5,2,3,4,5,5, +4,4,0,1,2,3,4,4, +0,5,1,2,3,4,5,5, +5,0,1,2,3,4,5,5, +0,1,2,3,4,5,6,6, +1,1,1,1,1,1,0,1, +0,2,2,2,2,2,1,2, +2,0,2,2,2,2,1,2, +0,1,3,3,3,3,2,3, +2,2,0,2,2,2,1,2, +0,3,1,3,3,3,2,3, +3,0,1,3,3,3,2,3, +0,1,2,4,4,4,3,4, +2,2,2,0,2,2,1,2, +0,3,3,1,3,3,2,3, +3,0,3,1,3,3,2,3, +0,1,4,2,4,4,3,4, +3,3,0,1,3,3,2,3, +0,4,1,2,4,4,3,4, +4,0,1,2,4,4,3,4, +0,1,2,3,5,5,4,5, +2,2,2,2,0,2,1,2, +0,3,3,3,1,3,2,3, +3,0,3,3,1,3,2,3, +0,1,4,4,2,4,3,4, +3,3,0,3,1,3,2,3, +0,4,1,4,2,4,3,4, +4,0,1,4,2,4,3,4, +0,1,2,5,3,5,4,5, +3,3,3,0,1,3,2,3, +0,4,4,1,2,4,3,4, +4,0,4,1,2,4,3,4, +0,1,5,2,3,5,4,5, +4,4,0,1,2,4,3,4, +0,5,1,2,3,5,4,5, +5,0,1,2,3,5,4,5, +0,1,2,3,4,6,5,6, +2,2,2,2,2,0,1,2, +0,3,3,3,3,1,2,3, +3,0,3,3,3,1,2,3, +0,1,4,4,4,2,3,4, +3,3,0,3,3,1,2,3, +0,4,1,4,4,2,3,4, +4,0,1,4,4,2,3,4, +0,1,2,5,5,3,4,5, +3,3,3,0,3,1,2,3, +0,4,4,1,4,2,3,4, +4,0,4,1,4,2,3,4, +0,1,5,2,5,3,4,5, +4,4,0,1,4,2,3,4, +0,5,1,2,5,3,4,5, +5,0,1,2,5,3,4,5, +0,1,2,3,6,4,5,6, +3,3,3,3,0,1,2,3, +0,4,4,4,1,2,3,4, +4,0,4,4,1,2,3,4, +0,1,5,5,2,3,4,5, +4,4,0,4,1,2,3,4, +0,5,1,5,2,3,4,5, +5,0,1,5,2,3,4,5, +0,1,2,6,3,4,5,6, +4,4,4,0,1,2,3,4, +0,5,5,1,2,3,4,5, +5,0,5,1,2,3,4,5, +0,1,6,2,3,4,5,6, +5,5,0,1,2,3,4,5, +0,6,1,2,3,4,5,6, +6,0,1,2,3,4,5,6, +0,1,2,3,4,5,6,7, +1,1,1,1,1,1,1,0, +0,2,2,2,2,2,2,1, +2,0,2,2,2,2,2,1, +0,1,3,3,3,3,3,2, +2,2,0,2,2,2,2,1, +0,3,1,3,3,3,3,2, +3,0,1,3,3,3,3,2, +0,1,2,4,4,4,4,3, +2,2,2,0,2,2,2,1, +0,3,3,1,3,3,3,2, +3,0,3,1,3,3,3,2, +0,1,4,2,4,4,4,3, +3,3,0,1,3,3,3,2, +0,4,1,2,4,4,4,3, +4,0,1,2,4,4,4,3, +0,1,2,3,5,5,5,4, +2,2,2,2,0,2,2,1, +0,3,3,3,1,3,3,2, +3,0,3,3,1,3,3,2, +0,1,4,4,2,4,4,3, +3,3,0,3,1,3,3,2, +0,4,1,4,2,4,4,3, +4,0,1,4,2,4,4,3, +0,1,2,5,3,5,5,4, +3,3,3,0,1,3,3,2, +0,4,4,1,2,4,4,3, +4,0,4,1,2,4,4,3, +0,1,5,2,3,5,5,4, +4,4,0,1,2,4,4,3, +0,5,1,2,3,5,5,4, +5,0,1,2,3,5,5,4, +0,1,2,3,4,6,6,5, +2,2,2,2,2,0,2,1, +0,3,3,3,3,1,3,2, +3,0,3,3,3,1,3,2, +0,1,4,4,4,2,4,3, +3,3,0,3,3,1,3,2, +0,4,1,4,4,2,4,3, +4,0,1,4,4,2,4,3, +0,1,2,5,5,3,5,4, +3,3,3,0,3,1,3,2, +0,4,4,1,4,2,4,3, 
+4,0,4,1,4,2,4,3, +0,1,5,2,5,3,5,4, +4,4,0,1,4,2,4,3, +0,5,1,2,5,3,5,4, +5,0,1,2,5,3,5,4, +0,1,2,3,6,4,6,5, +3,3,3,3,0,1,3,2, +0,4,4,4,1,2,4,3, +4,0,4,4,1,2,4,3, +0,1,5,5,2,3,5,4, +4,4,0,4,1,2,4,3, +0,5,1,5,2,3,5,4, +5,0,1,5,2,3,5,4, +0,1,2,6,3,4,6,5, +4,4,4,0,1,2,4,3, +0,5,5,1,2,3,5,4, +5,0,5,1,2,3,5,4, +0,1,6,2,3,4,6,5, +5,5,0,1,2,3,5,4, +0,6,1,2,3,4,6,5, +6,0,1,2,3,4,6,5, +0,1,2,3,4,5,7,6, +2,2,2,2,2,2,0,1, +0,3,3,3,3,3,1,2, +3,0,3,3,3,3,1,2, +0,1,4,4,4,4,2,3, +3,3,0,3,3,3,1,2, +0,4,1,4,4,4,2,3, +4,0,1,4,4,4,2,3, +0,1,2,5,5,5,3,4, +3,3,3,0,3,3,1,2, +0,4,4,1,4,4,2,3, +4,0,4,1,4,4,2,3, +0,1,5,2,5,5,3,4, +4,4,0,1,4,4,2,3, +0,5,1,2,5,5,3,4, +5,0,1,2,5,5,3,4, +0,1,2,3,6,6,4,5, +3,3,3,3,0,3,1,2, +0,4,4,4,1,4,2,3, +4,0,4,4,1,4,2,3, +0,1,5,5,2,5,3,4, +4,4,0,4,1,4,2,3, +0,5,1,5,2,5,3,4, +5,0,1,5,2,5,3,4, +0,1,2,6,3,6,4,5, +4,4,4,0,1,4,2,3, +0,5,5,1,2,5,3,4, +5,0,5,1,2,5,3,4, +0,1,6,2,3,6,4,5, +5,5,0,1,2,5,3,4, +0,6,1,2,3,6,4,5, +6,0,1,2,3,6,4,5, +0,1,2,3,4,7,5,6, +3,3,3,3,3,0,1,2, +0,4,4,4,4,1,2,3, +4,0,4,4,4,1,2,3, +0,1,5,5,5,2,3,4, +4,4,0,4,4,1,2,3, +0,5,1,5,5,2,3,4, +5,0,1,5,5,2,3,4, +0,1,2,6,6,3,4,5, +4,4,4,0,4,1,2,3, +0,5,5,1,5,2,3,4, +5,0,5,1,5,2,3,4, +0,1,6,2,6,3,4,5, +5,5,0,1,5,2,3,4, +0,6,1,2,6,3,4,5, +6,0,1,2,6,3,4,5, +0,1,2,3,7,4,5,6, +4,4,4,4,0,1,2,3, +0,5,5,5,1,2,3,4, +5,0,5,5,1,2,3,4, +0,1,6,6,2,3,4,5, +5,5,0,5,1,2,3,4, +0,6,1,6,2,3,4,5, +6,0,1,6,2,3,4,5, +0,1,2,7,3,4,5,6, +5,5,5,0,1,2,3,4, +0,6,6,1,2,3,4,5, +6,0,6,1,2,3,4,5, +0,1,7,2,3,4,5,6, +6,6,0,1,2,3,4,5, +0,7,1,2,3,4,5,6, +7,0,1,2,3,4,5,6, +0,1,2,3,4,5,6,7 +}; +#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_) +#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(permv[_m_]))) ) +#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(xm, _mm256_loadu_si256((__m256i*)pex)), u2vmask(xm,tv)) #endif +//----------------------------------------------------------------------------- +#define VSTO( _op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, ov) +#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) +#include "bitunpack256v_.h" +#define BITUNBLK256V32_0(ip, _i_, _op_, _parm_) {__m256i ov;\ + VSTO0(_op_, 0, ov, _parm_);\ + VSTO0(_op_, 1, ov, _parm_);\ + VSTO0(_op_, 2, ov, _parm_);\ + VSTO0(_op_, 3, ov, _parm_);\ + VSTO0(_op_, 4, ov, _parm_);\ + VSTO0(_op_, 5, ov, _parm_);\ + VSTO0(_op_, 6, ov, _parm_);\ + VSTO0(_op_, 7, ov, _parm_);\ + VSTO0(_op_, 8, ov, _parm_);\ + VSTO0(_op_, 9, ov, _parm_);\ + VSTO0(_op_, 10, ov, _parm_);\ + VSTO0(_op_, 11, ov, _parm_);\ + VSTO0(_op_, 12, ov, _parm_);\ + VSTO0(_op_, 13, ov, _parm_);\ + VSTO0(_op_, 14, ov, _parm_);\ + VSTO0(_op_, 15, ov, _parm_);\ + VSTO0(_op_, 16, ov, _parm_);\ + VSTO0(_op_, 17, ov, _parm_);\ + VSTO0(_op_, 18, ov, _parm_);\ + VSTO0(_op_, 19, ov, _parm_);\ + VSTO0(_op_, 20, ov, _parm_);\ + VSTO0(_op_, 21, ov, _parm_);\ + VSTO0(_op_, 22, ov, _parm_);\ + VSTO0(_op_, 23, ov, _parm_);\ + VSTO0(_op_, 24, ov, _parm_);\ + VSTO0(_op_, 25, ov, _parm_);\ + VSTO0(_op_, 26, ov, _parm_);\ + VSTO0(_op_, 27, ov, _parm_);\ + VSTO0(_op_, 28, ov, _parm_);\ + VSTO0(_op_, 29, ov, _parm_);\ + VSTO0(_op_, 30, ov, _parm_);\ + VSTO0(_op_, 31, ov, _parm_);\ +} +#define BITUNPACK0(_parm_) _parm_ = _mm256_setzero_si256() + +unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv; + BITUNPACK256V32(in, b, out, sv); + return (unsigned char 
*)ip;
+}
+#undef VSTO
+#undef VSTO0
+#undef BITUNPACK0
+
+//--------------------------------------- zeromask unpack for TurboPFor vp4d.c --------------------------------------
+#define VSTO(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) )); pex += popcnt32(xm)
+#define VSTO0(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm)
+#define BITUNPACK0(_parm_)
+#include "bitunpack256v_.h"
+
+unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
+  const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+  BITUNPACK256V32(in, b, out, sv);
+  return (unsigned char *)ip;
+}
+#undef VSTO
+#undef VSTO0
+#undef BITUNPACK0
+//--------------------------------
+#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_)
+#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG256x32(__ov); SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv)
+#include "bitunpack256v_.h"
+
+#define BITUNPACK0(_parm_)
+
+unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
+  const unsigned char *ip = in+PAD8(256*b);
+  __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256();
+  BITUNPACK256V32(in, b, out, sv);
+  return (unsigned char *)ip;
+}
+#undef VSTO
+#undef BITUNPACK0
+
+//-----------------------------------------------------------------------------
+#define VSTO(__op, i, __ov, __sv) SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv)
+#include "bitunpack256v_.h"
+
+#define BITUNPACK0(_parm_)
+
+unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
+  const unsigned char *ip = in+PAD8(256*b);
+  __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256();
+  BITUNPACK256V32(in, b, out, sv);
+  return (unsigned char *)ip;
+}
+#undef VSTO
+#undef VSTO0
+#undef BITUNPACK0
+
+//-----------------------------------------------------------------------------
+#define VEXP(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm)
+#define VEXP0(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
+
+#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
+#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
+
+#include "bitunpack256v_.h"
+
+#define BITUNPACK0(_parm_)
+
+unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
+  const unsigned char *ip = in+PAD8(256*b); unsigned xm;
+  __m256i sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+  BITUNPACK256V32(in, b, out, sv);
+  return (unsigned char *)ip;
+}
+#undef VSTO
+#undef VSTO0
+#undef BITUNPACK0
+
+//-----------------------------------------------------------------------------
+#define VSTO(__op, i, __ov, __sv) SCANI256x32(__ov,__sv,cv); _mm256_storeu_si256(__op++, __sv);
+#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv)
+#include "bitunpack256v_.h"
+
+#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8)
+
+unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
+  const unsigned char *ip = in+PAD8(256*b);
+  __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256();
+  BITUNPACK256V32(in, b, out, sv);
+  return (unsigned char *)ip;
+}
+#undef VSTO
+#undef VSTO0
+#undef BITUNPACK0
+//-----------------------------------------------------------------------------
+#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
+#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
+
+#include "bitunpack256v_.h"
+
+#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128()
+
+unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
+  const unsigned char *ip = in+PAD8(256*b); unsigned xm;
+  __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(),tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
+  BITUNPACK256V32(in, b, out, sv);
+  return (unsigned char *)ip;
+}
+#undef VSTO
+#undef VSTO0
+#undef BITUNPACK0
+#endif
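
The kernels in this patch all share one contract: a block of 128 (SSE) or 256 (AVX2) 32-bit values is stored as b bits per value in exactly PAD8(128*b) resp. PAD8(256*b) input bytes, and each unpack routine returns the input pointer advanced by that amount. A minimal scalar sketch of that contract, for reference only (the SIMD kernels interleave the values lane-wise, so the actual bit layout differs; bitunpack_scalar32() is an illustrative name, not a library function):

#include <stdint.h>

#define PAD8(x) (((x)+7)/8)

/* Unpack n values of b bits each (LSB first) from in[] into out[];
   returns in advanced by PAD8(n*b) bytes, mirroring the SIMD kernels. */
static const unsigned char *bitunpack_scalar32(const unsigned char *in, unsigned n,
                                               uint32_t *out, unsigned b) {
  uint64_t acc  = 0;                       /* bit accumulator, at most 39 bits used */
  unsigned bits = 0;                       /* number of valid bits in acc */
  uint32_t mask = b == 32 ? 0xffffffffu : (1u << b) - 1;
  for (unsigned i = 0; i < n; i++) {
    while (bits < b) { acc |= (uint64_t)*in++ << bits; bits += 8; }
    out[i] = (uint32_t)acc & mask;
    acc >>= b; bits -= b;
  }
  return in;
}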
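
The zeromask variants (_bitunpack256v32, _bitdunpack256v32, _bitd1unpack256v32) layer TurboPFor's patched exceptions on top of the plain unpack, as used by vp4d.c: for every group of 8 outputs one byte of bb is a bit map, and each set bit consumes the next value from pex and adds it, shifted left by b, onto the b-bit base value. A scalar sketch of just that patch step, applied after a plain unpack; patch_exceptions() is an illustrative name and assumes n is a multiple of 8 and b < 32:

#include <stdint.h>

/* out[]: n base values already unpacked with b bits each;
   bb[]:  one exception bit map byte per 8 values;
   pex[]: the exception values, consumed in order. */
static void patch_exceptions(uint32_t *out, unsigned n, unsigned b,
                             const uint32_t *pex, const unsigned char *bb) {
  for (unsigned i = 0; i < n; i += 8) {
    unsigned m = bb[i / 8];
    for (unsigned k = 0; k < 8; k++)
      if (m & (1u << k))
        out[i + k] += *pex++ << b;         /* high part of the exception value */
  }
}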
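
When AVX-512 is available, the patch maps mm256_maskz_expand_epi32 straight to the intrinsic; the permv[256][8] table together with _mm256_permutevar8x32_epi32 is its AVX2 emulation. The sketch below shows the same idea with the permutation indices computed at run time instead of looked up, and with the unselected lanes zeroed through a compare-derived mask rather than the u2vmask/_mm256_blendv_epi8 trick of the patch, so it is a simplified variant, not the library's exact code; maskz_expand_avx2() is an illustrative name. Compile with -mavx2.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* For an 8-bit mask m, scatter the first popcount(m) values of src to the
   lanes whose mask bit is set; the remaining lanes become zero. */
static __m256i maskz_expand_avx2(uint8_t m, const uint32_t *src) {
  uint32_t idx[8]; unsigned k = 0;
  for (unsigned i = 0; i < 8; i++)          /* lane i takes the k-th source value */
    idx[i] = (m >> i) & 1 ? k++ : 0;        /* if its mask bit is set */
  __m256i perm = _mm256_loadu_si256((const __m256i *)idx);
  __m256i v    = _mm256_loadu_si256((const __m256i *)src); /* reads 8 lanes; keep src padded */
  __m256i ex   = _mm256_permutevar8x32_epi32(v, perm);     /* gather the values into place */
  const __m256i bit = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
  __m256i keep = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_set1_epi32(m), bit), bit);
  return _mm256_and_si256(ex, keep);        /* zero the lanes whose mask bit is clear */
}

int main(void) {
  uint32_t ex[8] = { 7, 9, 11, 0, 0, 0, 0, 0 };                     /* three exception values */
  uint32_t out[8];
  _mm256_storeu_si256((__m256i *)out, maskz_expand_avx2(0x29, ex)); /* bits 0, 3, 5 set */
  for (int i = 0; i < 8; i++) printf("%u ", out[i]);                /* prints: 7 0 0 9 0 11 0 0 */
  printf("\n");
  return 0;
}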
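
bitzunpack256v32 undoes a zigzag transform (UNZIGZAG256x32) before the prefix-sum step: zigzag stores signed deltas as small unsigned values (0, -1, 1, -2, ... become 0, 1, 2, 3, ...), and decoding is (u >> 1) ^ -(u & 1). An 8-lane AVX2 version of just that decoding step; unzigzag8_epi32() is an illustrative name:

#include <immintrin.h>

/* Per-lane zigzag decode: (u >> 1) ^ -(u & 1). */
static inline __m256i unzigzag8_epi32(__m256i u) {
  __m256i sign = _mm256_sub_epi32(_mm256_setzero_si256(),
                                  _mm256_and_si256(u, _mm256_set1_epi32(1)));
  return _mm256_xor_si256(_mm256_srli_epi32(u, 1), sign);
}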
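
The d/d1 variants rebuild the original sequence from deltas with a SIMD prefix sum (SCAN256x32, SCANI256x32) and a running carry sv; in the d1 form the stored values are the deltas minus one, which is what the cv vector of per-lane increments 1..8 in bitd1unpack256v32 accounts for. A stand-alone sketch of that reconstruction under those assumptions; the library fuses the scan into the unpack macros, so this shows the idea rather than its exact code, and scan8_epi32()/delta1_decode256() are illustrative names (n is assumed to be a multiple of 8; compile with -mavx2):

#include <immintrin.h>
#include <stdint.h>

/* Inclusive prefix sum of the 8 32-bit lanes of x. */
static inline __m256i scan8_epi32(__m256i x) {
  x = _mm256_add_epi32(x, _mm256_slli_si256(x, 4));                 /* scan inside each 128-bit half */
  x = _mm256_add_epi32(x, _mm256_slli_si256(x, 8));
  __m256i t = _mm256_permutevar8x32_epi32(x, _mm256_set1_epi32(3)); /* total of the low half */
  t = _mm256_blend_epi32(_mm256_setzero_si256(), t, 0xF0);          /* add it to the high half only */
  return _mm256_add_epi32(x, t);
}

/* out[i] = out[i-1] + delta[i] + 1 with out[-1] = start, 8 values per step. */
static void delta1_decode256(const uint32_t *delta, unsigned n, uint32_t *out, uint32_t start) {
  __m256i sv = _mm256_set1_epi32(start);
  const __m256i cv = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);     /* the "+1 per position" part */
  for (unsigned i = 0; i + 8 <= n; i += 8) {
    __m256i d = _mm256_loadu_si256((const __m256i *)(delta + i));
    __m256i s = _mm256_add_epi32(_mm256_add_epi32(scan8_epi32(d), cv), sv);
    _mm256_storeu_si256((__m256i *)(out + i), s);
    sv = _mm256_permutevar8x32_epi32(s, _mm256_set1_epi32(7));      /* carry the last output forward */
  }
}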