TurboPFor: Bit Unpacking
This commit is contained in:
33
bitunpack.c
33
bitunpack.c
@ -43,7 +43,9 @@ typedef unsigned char *(*BITUNPACK_F64)(const unsigned char *__restrict in, unsi
|
||||
typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
#define PREFETCH(_ip_) __builtin_prefetch(_ip_+512,0)//#define PREFETCH(ip)
|
||||
|
||||
|
||||
#if !defined(SSE2_ON) && !defined(AVX2_ON)
|
||||
|
||||
#if 0
|
||||
#define OP(_op_, _x_) *_op_++
|
||||
#define OPX(_op_)
|
||||
@ -120,9 +122,9 @@ size_t bitnzunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restri
|
||||
size_t bitnzunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitzunpacka); }
|
||||
size_t bitnzunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitzunpacka); }
|
||||
size_t bitnzunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitzunpacka); }
|
||||
#endif
|
||||
|
||||
//--------------------------------------------------------------------------------------------------------------------------------------
|
||||
#ifdef __SSE2__
|
||||
#if defined(__SSE2__) && defined(SSE2_ON)
|
||||
#include <emmintrin.h>
|
||||
|
||||
#define VSTO( _op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, ov)
|
||||
@ -171,6 +173,13 @@ unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
BITUNPACK128V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
unsigned char *bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) {
|
||||
const unsigned char *_in=in; unsigned *_out=out;
|
||||
__m128i sv;
|
||||
BITUNPACK128V32(in, b, out, sv); out = _out+128; in=_in+PAD8(128*b);
|
||||
BITUNPACK128V32(in, b, out, sv);
|
||||
return (unsigned char *)_in+PAD8(256*b);
|
||||
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
@ -198,7 +207,8 @@ static char shuffles[16][16] = {
|
||||
{ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 },
|
||||
#undef _
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#define VSTO( _op_, _i_, _ov_, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m)
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m)
|
||||
#define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128()
|
||||
@ -210,10 +220,17 @@ unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
BITUNPACK128V32(in, b, out, sv);
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
const unsigned char *_in=in; unsigned *_out=out;
|
||||
unsigned m;
|
||||
__m128i sv;
|
||||
BITUNPACK128V32(in, b, out, sv); out = _out+128; in=_in+PAD8(128*b);
|
||||
BITUNPACK128V32(in, b, out, sv);
|
||||
return (unsigned char *)_in+PAD8(256*b);
|
||||
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
#endif
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
@ -249,7 +266,6 @@ unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
|
||||
//---------------------------- SSE ----------------------------------------------
|
||||
#ifdef __SSSE3__
|
||||
|
||||
#define VEXP(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m)
|
||||
#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m)
|
||||
@ -318,9 +334,8 @@ unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned
|
||||
}
|
||||
#endif
|
||||
#endif // __SSE2__
|
||||
|
||||
//----------------------------------- AVX2 -----------------------------------------------
|
||||
#ifdef __AVX2__
|
||||
|
||||
#if defined(__AVX2__) && defined(AVX2_ON)
|
||||
#include <immintrin.h>
|
||||
|
||||
#ifdef __AVX512F__
|
||||
|
Reference in New Issue
Block a user