TurboPFor: Bit Unpacking

This commit is contained in:
x
2019-12-22 13:57:39 +01:00
parent b18e553fdd
commit 2bea29b1b3

View File

@ -529,13 +529,34 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n
#define VX32(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm)
#define VXZ32(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
#ifdef _MSC_VER
#define SCAN32x8( _v_, _sv_) {\
_v_ = _mm256_add_epi32(_v_, _mm256_slli_si256(_v_, 4));\
_v_ = _mm256_add_epi32(_v_, _mm256_slli_si256(_v_, 8));\
_sv_ = _mm256_add_epi32( _mm256_permute2x128_si256( _mm256_shuffle_epi32(_sv_,_MM_SHUFFLE(3, 3, 3, 3)), _sv_, 0x11), \
_mm256_add_epi32(_v_, _mm256_permute2x128_si256(zv,_mm256_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));\
}
#define SCANI32x8(_v_, _sv_, _vi_) SCAN32x8(_v_, _sv_); _sv_ = _mm256_add_epi32(_sv_, _vi_)
#define ZIGZAG32x8(_v_) _mm256_xor_si256(_mm256_slli_epi32(_v_,1), _mm256_srai_epi32(_v_,31))
#define UNZIGZAG32x8(_v_) _mm256_xor_si256(_mm256_srli_epi32(_v_,1), _mm256_srai_epi32(_mm256_slli_epi32(_v_,31),31) )
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _ov_ = UNZIGZAG32x8(_ov_); SCAN32x8(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _ov_ = UNZIGZAG32x8(_ov_); SCAN32x8(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
#else
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
#endif
#include "bitunpack_.h"
#define BITUNPACK0(_parm_)
unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip;
const unsigned char *ip = in+PAD8(256*b);
unsigned xm;
const __m256i zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
__m256i sv = _mm256_set1_epi32(start);
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
#define VO32(_op_, i, _ov_, _sv_) _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
@ -544,7 +565,8 @@ unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n
#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8)
unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
const unsigned char *ip = in+PAD8(256*b);
__m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256();
const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256();
__m256i sv = _mm256_set1_epi32(start);
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
@ -555,7 +577,8 @@ unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n
#define BITUNPACK0(_parm_)
unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
const unsigned char *ip = in+PAD8(256*b);
__m256i sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm256_set1_epi32(8);
const __m256i cv = _mm256_set1_epi32(8);
__m256i sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1);
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}
@ -566,7 +589,8 @@ unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n
#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128()
unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
const unsigned char *ip = in+PAD8(256*b); unsigned xm;
__m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(),tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1), zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);
__m256i sv = _mm256_set1_epi32(start);
BITUNPACK256V32(in, b, out, sv);
return (unsigned char *)ip;
}