TurboPFor: Bit Unpacking
This commit is contained in:
32
bitunpack.c
32
bitunpack.c
@ -529,13 +529,34 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
|
||||
#define VX32(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm)
|
||||
#define VXZ32(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define SCAN32x8( _v_, _sv_) {\
|
||||
_v_ = _mm256_add_epi32(_v_, _mm256_slli_si256(_v_, 4));\
|
||||
_v_ = _mm256_add_epi32(_v_, _mm256_slli_si256(_v_, 8));\
|
||||
_sv_ = _mm256_add_epi32( _mm256_permute2x128_si256( _mm256_shuffle_epi32(_sv_,_MM_SHUFFLE(3, 3, 3, 3)), _sv_, 0x11), \
|
||||
_mm256_add_epi32(_v_, _mm256_permute2x128_si256(zv,_mm256_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));\
|
||||
}
|
||||
#define SCANI32x8(_v_, _sv_, _vi_) SCAN32x8(_v_, _sv_); _sv_ = _mm256_add_epi32(_sv_, _vi_)
|
||||
#define ZIGZAG32x8(_v_) _mm256_xor_si256(_mm256_slli_epi32(_v_,1), _mm256_srai_epi32(_v_,31))
|
||||
#define UNZIGZAG32x8(_v_) _mm256_xor_si256(_mm256_srli_epi32(_v_,1), _mm256_srai_epi32(_mm256_slli_epi32(_v_,31),31) )
|
||||
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _ov_ = UNZIGZAG32x8(_ov_); SCAN32x8(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _ov_ = UNZIGZAG32x8(_ov_); SCAN32x8(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#else
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#endif
|
||||
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip;
|
||||
const unsigned char *ip = in+PAD8(256*b);
|
||||
unsigned xm;
|
||||
const __m256i zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
|
||||
__m256i sv = _mm256_set1_epi32(start);
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VO32(_op_, i, _ov_, _sv_) _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
@ -544,7 +565,8 @@ unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8)
|
||||
unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
const unsigned char *ip = in+PAD8(256*b);
|
||||
__m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256();
|
||||
const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256();
|
||||
__m256i sv = _mm256_set1_epi32(start);
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
@ -555,7 +577,8 @@ unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
const unsigned char *ip = in+PAD8(256*b);
|
||||
__m256i sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm256_set1_epi32(8);
|
||||
const __m256i cv = _mm256_set1_epi32(8);
|
||||
__m256i sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1);
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
@ -566,7 +589,8 @@ unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128()
|
||||
unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
const unsigned char *ip = in+PAD8(256*b); unsigned xm;
|
||||
__m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(),tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
|
||||
const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1), zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
|
||||
__m256i sv = _mm256_set1_epi32(start);
|
||||
BITUNPACK256V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user