TurboPFor: Bit Unpacking
This commit is contained in:
390
bitunpack.c
Normal file → Executable file
390
bitunpack.c
Normal file → Executable file
@ -58,31 +58,31 @@ typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsi
|
||||
#define OPX(_op_) _op_ += 32
|
||||
#endif
|
||||
|
||||
#define OPI(_op_,_parm_) OPX(_op_)
|
||||
#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = _w_
|
||||
// Plain (non-delta) scalar instantiation of the bitunpack_.h template.
//   OPI: advances the output cursor through OPX; _nb_ and _parm_ are unused.
//   OUT: stores the unpacked word _w_ unchanged at position _x_ (via OP, defined elsewhere).
// _BITUNPACK_ supplies the name prefix; the generated functions live in bitunpack_.h,
// which is not visible here - presumably one function per element size.
#define OPI(_op_,_nb_,_parm_) OPX(_op_)
|
||||
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = _w_
|
||||
#define _BITUNPACK_ bitunpack
|
||||
#include "bitunpack_.h"
|
||||
|
||||
#define DELTA
|
||||
|
||||
#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = (_parm_ += (_w_))
|
||||
// Delta instantiation: every unpacked value _w_ is added into the running
// accumulator _parm_, and the updated accumulator is the value stored.
// _parm_ must therefore be a modifiable lvalue; _nb_ is unused.
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += (_w_))
|
||||
#define _BITUNPACK_ bitdunpack // delta + 0
|
||||
#include "bitunpack_.h"
|
||||
|
||||
#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = (_parm_ += TEMPLATE2(zigzagdec, USIZE)(_w_))
|
||||
// Zigzag-delta instantiation: the unpacked value is first passed through
// zigzagdec<USIZE> (name built by TEMPLATE2; defined elsewhere), then added
// into the running accumulator _parm_, whose new value is stored.
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += TEMPLATE2(zigzagdec, USIZE)(_w_))
|
||||
#define _BITUNPACK_ bitzunpack // zigzag
|
||||
#include "bitunpack_.h"
|
||||
|
||||
#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = (_parm_ + (_w_))
|
||||
// FOR (frame-of-reference) instantiation: the stored value is _parm_ + _w_.
// Unlike the delta variants, _parm_ is only read here, never updated.
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ + (_w_))
|
||||
#define _BITUNPACK_ bitfunpack // for
|
||||
#include "bitunpack_.h"
|
||||
|
||||
#define OPI(_op_,_parm_) OPX(_op_); _parm_ += 32
|
||||
#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = (_parm_ += (_w_)) + (_x_+1)
|
||||
// Delta+1 instantiation.
//   OUT: accumulates _w_ into _parm_ and stores accumulator + (_x_+1); the
//        per-element "+1" steps are added to the stored value only, not to _parm_.
//   OPI: advances the output cursor through OPX and then adds 32 to _parm_,
//        presumably to fold in the +1 steps of a 32-element group - confirm
//        against the OPX variant in effect.
// NOTE(review): OPI expands to two statements, so it is not safe as the sole
// body of an unbraced if/else/loop.
#define OPI(_op_,_nb_,_parm_) OPX(_op_); _parm_ += 32
|
||||
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += (_w_)) + (_x_+1)
|
||||
#define _BITUNPACK_ bitd1unpack // delta + 1
|
||||
#include "bitunpack_.h"
|
||||
|
||||
#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = _parm_ + (_w_)+(_x_+1)
|
||||
// FOR+1 instantiation: the stored value is _parm_ + _w_ + (_x_+1); OUT itself
// does not modify _parm_.
// NOTE(review): OPI is not redefined between the delta+1 instantiation and this
// one, so unless bitunpack_.h resets it, the `_parm_ += 32` step of that OPI
// still applies here - verify.
#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = _parm_ + (_w_)+(_x_+1)
|
||||
#define _BITUNPACK_ bitf1unpack // for + 1
|
||||
#include "bitunpack_.h"
|
||||
#undef OPI
|
||||
@ -135,8 +135,8 @@ size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restri
|
||||
// Function-body macro: decodes n values from `in` to `out` in chunks of _csize_.
// Each chunk starts with one byte holding its bit width b, followed by the
// packed payload. Full chunks are decoded by <_bitunpackv_><_usize_>, the
// remaining n & (_csize_-1) values by bitunpack<_usize_> (names built by TEMPLATE2).
// The macro ends the enclosing function with `return ip - in`, the number of
// input bytes consumed.
// Requirements visible from the expansion:
//   - the enclosing function must declare `op` (output cursor) before using the macro;
//   - _csize_ must be a power of two, since it is used as a mask (_csize_-1);
//   - `n` is modified (n &= _csize_-1) after the main loop.
// NOTE(review): PREFETCH is given `in+512`, and `in` is never advanced inside the
// loop, so the same address is requested on every iteration; `ip+512` may have
// been intended - confirm.
#define _BITNUNPACKV(in, n, out, _csize_, _usize_, _bitunpackv_) {\
|
||||
unsigned char *ip = in;\
|
||||
for(op = out; op != out+(n&~(_csize_-1)); op += _csize_) { PREFETCH(in+512,0);\
|
||||
unsigned b = *ip++; ip = TEMPLATE2(_bitunpackv_, _usize_)(ip, _csize_, op,b);\
|
||||
} if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(bitunpack, _usize_)(ip, n, op,b); }\
|
||||
unsigned b = *ip++; ip = TEMPLATE2(_bitunpackv_, _usize_)(ip, _csize_, op,b);\
|
||||
} if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(bitunpack, _usize_)(ip, n, op,b); }\
|
||||
return ip - in;\
|
||||
}
|
||||
|
||||
@ -419,47 +419,47 @@ static ALIGNED(unsigned char, permv[256][8], 32) = {
|
||||
};
|
||||
// Builds a vector selector from an 8-bit mask: _m_ is broadcast to every byte,
// then each 32-bit lane is shifted left by the per-lane count held in _tv_.
// Presumably _tv_ is chosen so that lane i ends up with mask bit i in its sign
// positions (as consumed by _mm256_blendv_epi8) - _tv_ is defined outside this view.
#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_)
|
||||
// Permutes the eight 32-bit lanes of _v_ with the 8 byte-sized indices stored in
// permv[_m_] (read as one 64-bit word through ctou64 and zero-extended to 32 bit).
// Presumably permv[m] routes the first popcount(m) source lanes to the lanes whose
// bit is set in m - the table contents are not visible here. Lanes for clear bits
// are NOT zeroed by this macro; the caller has to blend them away.
#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(permv[_m_]))) )
|
||||
#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(xm, _mm256_loadu_si256((__m256i*)pex)), u2vmask(xm,tv))
|
||||
// AVX2 stand-in for AVX-512 _mm256_maskz_loadu_epi32: loads 8 x 32 bit from the
// unaligned address _v_, moves the loaded values to the lanes selected by the
// 8-bit mask _m_ (mm256_maskz_expand_epi32) and zeroes the unselected lanes by
// blending with zv.
//   _m_ : 8-bit lane mask; evaluated more than once, so pass a side-effect-free expression.
//   _v_ : source address (any pointer type; it is cast to __m256i const *).
// The macro now honours its own arguments instead of silently reading the caller's
// `xm` and `pex`; every existing call passes exactly those, so the expansion is unchanged.
// Still captured from the enclosing scope: zv (zero vector) and tv (per-lane shift
// counts for u2vmask).
#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(_m_, _mm256_loadu_si256((__m256i const *)(_v_))), u2vmask(_m_,tv)) // emulate AVX512 _mm256_maskz_loadu_epi32 on AVX2
|
||||
#endif
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VO32( _op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, ov)
|
||||
#define VOZ32(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_)
|
||||
// Plain AVX2 store hooks for the bitunpack_.h template.
//   VO32 : stores the unpacked vector `ov` at _op_ and post-increments _op_.
//   VOZ32: stores _parm_ instead of `ov` (used by BITUNBLK256V32_0).
// _i_ and _nb_ are unused by this variant.
#define VO32( _op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, ov)
|
||||
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_)
|
||||
#include "bitunpack_.h"
|
||||
|
||||
#define BITUNBLK256V32_0(ip, _i_, _op_, _parm_) {__m256i ov;\
|
||||
VOZ32(_op_, 0, ov, _parm_);\
|
||||
VOZ32(_op_, 1, ov, _parm_);\
|
||||
VOZ32(_op_, 2, ov, _parm_);\
|
||||
VOZ32(_op_, 3, ov, _parm_);\
|
||||
VOZ32(_op_, 4, ov, _parm_);\
|
||||
VOZ32(_op_, 5, ov, _parm_);\
|
||||
VOZ32(_op_, 6, ov, _parm_);\
|
||||
VOZ32(_op_, 7, ov, _parm_);\
|
||||
VOZ32(_op_, 8, ov, _parm_);\
|
||||
VOZ32(_op_, 9, ov, _parm_);\
|
||||
VOZ32(_op_, 10, ov, _parm_);\
|
||||
VOZ32(_op_, 11, ov, _parm_);\
|
||||
VOZ32(_op_, 12, ov, _parm_);\
|
||||
VOZ32(_op_, 13, ov, _parm_);\
|
||||
VOZ32(_op_, 14, ov, _parm_);\
|
||||
VOZ32(_op_, 15, ov, _parm_);\
|
||||
VOZ32(_op_, 16, ov, _parm_);\
|
||||
VOZ32(_op_, 17, ov, _parm_);\
|
||||
VOZ32(_op_, 18, ov, _parm_);\
|
||||
VOZ32(_op_, 19, ov, _parm_);\
|
||||
VOZ32(_op_, 20, ov, _parm_);\
|
||||
VOZ32(_op_, 21, ov, _parm_);\
|
||||
VOZ32(_op_, 22, ov, _parm_);\
|
||||
VOZ32(_op_, 23, ov, _parm_);\
|
||||
VOZ32(_op_, 24, ov, _parm_);\
|
||||
VOZ32(_op_, 25, ov, _parm_);\
|
||||
VOZ32(_op_, 26, ov, _parm_);\
|
||||
VOZ32(_op_, 27, ov, _parm_);\
|
||||
VOZ32(_op_, 28, ov, _parm_);\
|
||||
VOZ32(_op_, 29, ov, _parm_);\
|
||||
VOZ32(_op_, 30, ov, _parm_);\
|
||||
VOZ32(_op_, 31, ov, _parm_);\
|
||||
#define BITUNBLK256V32_0(ip, _i_, _op_, _nb_,_parm_) {__m256i ov;\
|
||||
VOZ32(_op_, 0, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 1, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 2, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 3, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 4, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 5, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 6, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 7, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 8, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 9, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 10, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 11, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 12, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 13, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 14, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 15, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 16, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 17, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 18, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 19, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 20, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 21, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 22, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 23, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 24, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 25, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 26, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 27, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 28, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 29, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 30, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 31, ov, _nb_,_parm_);\
|
||||
}
|
||||
// Plain variant: sets _parm_ to the all-zero 256-bit vector.
#define BITUNPACK0(_parm_) _parm_ = _mm256_setzero_si256()
|
||||
|
||||
@ -471,8 +471,8 @@ unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n,
|
||||
}
|
||||
|
||||
//--------------------------------------- zeromask unpack for TurboPFor vp4d.c --------------------------------------
|
||||
#define VO32(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) )); pex += popcnt32(xm)
|
||||
#define VOZ32(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm)
|
||||
// Exception-merging store hooks (AVX2). For every output vector:
//   1. xm = next mask byte from bb (one bit per 32-bit lane);
//   2. popcnt32(xm) values are read from pex and placed in the lanes flagged by xm
//      (other lanes zero) through mm256_maskz_loadu_epi32;
//   3. VO32 shifts them left by _nb_ and adds them to the unpacked vector _ov_;
//      VOZ32 stores them as they are;
//   4. pex advances by popcnt32(xm).
// xm, bb and pex are taken from the enclosing function by name. Both macros expand
// to several statements. BITUNPACK0 is a no-op for this variant.
#define VO32(_op_, _i_, _ov_, _nb_,_parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) )); pex += popcnt32(xm)
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm)
|
||||
#define BITUNPACK0(_parm_)
|
||||
#include "bitunpack_.h"
|
||||
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
@ -481,8 +481,8 @@ unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n,
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VOZ32(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_)
|
||||
#define VO32(_op_, i, _ov_, _sv_) _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
|
||||
// Zigzag-delta store hooks (AVX2).
//   VOZ32: stores _parm_ unchanged.
//   VO32 : zigzag-decodes _ov_ in place (mm256_zzagd_epi32), folds it into the
//          running vector _sv_ with mm256_scan_epi32 and stores the new _sv_.
//          mm256_scan_epi32 is defined elsewhere - presumably a prefix sum
//          carried over from the previous _sv_.
// BITUNPACK0 is a no-op for this variant.
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_)
|
||||
#define VO32(_op_, i, _ov_, _nb_,_sv_) _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
@ -493,7 +493,7 @@ unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n,
|
||||
}
|
||||
|
||||
|
||||
#define VO32(_op_, i, _ov_, _sv_) _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
|
||||
// Delta store hook (AVX2): folds _ov_ into the running vector _sv_ with
// mm256_scan_epi32 (defined elsewhere) and stores the updated _sv_.
// VOZ32 is not redefined here, so the previous definition stays in effect.
#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_)
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
@ -503,7 +503,7 @@ unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n,
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VO32( _op_, _i_, _ov_, _parm_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, sv))
|
||||
// FOR store hook (AVX2): stores _ov_ + sv.
// NOTE(review): the base vector is taken from the caller's variable `sv` by name;
// the _parm_ argument is ignored.
#define VO32( _op_, _i_, _ov_, _nb_,_parm_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, sv))
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
@ -513,11 +513,11 @@ unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n,
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VX32(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm)
|
||||
#define VXZ32(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
|
||||
// Exception patch helpers (AVX2), used by the VO32/VOZ32 hooks that follow.
//   VX32 : reads a mask byte into xm, expands popcnt32(xm) values from pex into the
//          flagged lanes, shifts them left by _nb_ and adds them to _ov_ in place.
//   VXZ32: same, but _ov_ is replaced by the expanded values (no shift, no add).
// Both advance bb by one byte and pex by popcnt32(xm); xm, bb, pex come from the
// enclosing function. _i_ is unused.
#define VX32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) ); pex += popcnt32(xm)
|
||||
#define VXZ32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
|
||||
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
// Delta store hooks with exception patching (AVX2): patch _ov_ through
// VX32/VXZ32 first, then fold it into the running vector _sv_ with
// mm256_scan_epi32 and store the new _sv_.
// Both expansions end in ';', so a call followed by ';' yields an empty statement.
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32(_i_, _nb_,_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
@ -527,8 +527,8 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VX32(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm)
|
||||
#define VXZ32(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
|
||||
// Exception patch helpers for the zigzag variant; the expansions match the
// VX32/VXZ32 pair defined for the delta variant: read mask byte xm from bb,
// expand popcnt32(xm) values from pex into the flagged lanes, then either add
// them (shifted left by _nb_) to _ov_ (VX32) or overwrite _ov_ with them (VXZ32).
#define VX32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) ); pex += popcnt32(xm)
|
||||
#define VXZ32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define SCAN32x8( _v_, _sv_) {\
|
||||
@ -541,11 +541,11 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
// Zigzag encode, per 32-bit lane: (v << 1) ^ (v >> 31), with an arithmetic right
// shift, so the sign ends up in bit 0.
#define ZIGZAG32x8(_v_) _mm256_xor_si256(_mm256_slli_epi32(_v_,1), _mm256_srai_epi32(_v_,31))
|
||||
// Zigzag decode, per 32-bit lane: (v >> 1, logical) ^ (bit 0 of v replicated to
// all 32 bits). The replication is done by shifting bit 0 to the top and
// arithmetic-shifting it back down. _v_ is expanded twice.
#define UNZIGZAG32x8(_v_) _mm256_xor_si256(_mm256_srli_epi32(_v_,1), _mm256_srai_epi32(_mm256_slli_epi32(_v_,31),31) )
|
||||
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _ov_ = UNZIGZAG32x8(_ov_); SCAN32x8(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _ov_ = UNZIGZAG32x8(_ov_); SCAN32x8(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
// MSVC branch of the zigzag hooks with exception patching: patch _ov_
// (VX32/VXZ32), zigzag-decode it with the UNZIGZAG32x8 macro, accumulate into
// _sv_ with the SCAN32x8 statement macro, then store _sv_.
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _ov_ = UNZIGZAG32x8(_ov_); SCAN32x8(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32(_i_, _nb_,_ov_); _ov_ = UNZIGZAG32x8(_ov_); SCAN32x8(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#else
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
// Non-MSVC branch of the zigzag hooks with exception patching: same sequence as
// the MSVC branch, but using the helper functions mm256_zzagd_epi32 and
// mm256_scan_epi32 (defined elsewhere) instead of the local macros.
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32(_i_, _nb_,_ov_); _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#endif
|
||||
|
||||
#include "bitunpack_.h"
|
||||
@ -559,8 +559,8 @@ unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VO32(_op_, i, _ov_, _sv_) _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv)
|
||||
// Delta+1 store hooks (AVX2). `cv` is an increment vector taken from the
// enclosing function by name.
//   VO32      : _sv_ = mm256_scani_epi32(_ov_, _sv_, cv), then store _sv_
//               (scani is defined elsewhere - presumably a scan that also adds cv).
//   VOZ32     : store _parm_, then advance _parm_ by cv.
//   BITUNPACK0: add cv to _parm_ once, then reset cv to 8 in every lane.
// VOZ32 and BITUNPACK0 expand to two statements each.
#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv)
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8)
|
||||
unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
@ -571,8 +571,8 @@ unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _sv_)); _sv_ = _mm256_add_epi32(_sv_, cv)
|
||||
#define VOZ32(_op_, _i_, ov, _sv_) _mm256_storeu_si256(_op_++, _sv_); _sv_ = _mm256_add_epi32(_sv_, cv);
|
||||
// FOR+1 store hooks (AVX2): the base vector _sv_ is advanced by `cv` (from the
// enclosing function) after every stored vector.
//   VO32 : store _ov_ + _sv_, then _sv_ += cv.
//   VOZ32: store _sv_, then _sv_ += cv (expansion already ends in ';').
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _sv_)); _sv_ = _mm256_add_epi32(_sv_, cv)
|
||||
#define VOZ32(_op_, _i_, ov, _nb_,_sv_) _mm256_storeu_si256(_op_++, _sv_); _sv_ = _mm256_add_epi32(_sv_, cv);
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
|
||||
@ -583,8 +583,8 @@ unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n
|
||||
return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
// Delta+1 store hooks with exception patching (AVX2): patch _ov_ through
// VX32/VXZ32, then _sv_ = mm256_scani_epi32(_ov_, _sv_, cv) and store _sv_.
// NOTE(review): BITUNPACK0 ignores its argument and zeroes the caller's `mv`;
// the trailing comment is a leftover of an earlier form.
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32(_i_, _nb_,_ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128()
|
||||
unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
|
||||
@ -602,80 +602,80 @@ size_t bitnd1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__re
|
||||
// Decodes n zigzag-delta coded 32-bit values from `in` into `out`.
// The whole body is the _BITNDUNPACKV macro (defined outside this view), invoked
// with chunk size 256 and element size 32; it uses `op` and `start` by name and
// presumably returns the number of input bytes consumed, like _BITNUNPACKV - verify.
size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitzunpack256v, bitzunpack); }
|
||||
// Decodes n FOR-coded 32-bit values from `in` into `out`.
// The whole body is the _BITNDUNPACKV macro (defined outside this view), invoked
// with chunk size 256 and element size 32; it uses `op` and `start` by name and
// presumably returns the number of input bytes consumed, like _BITNUNPACKV - verify.
size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitfunpack256v, bitfunpack); }
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON) //------------------------------ SSE2/SSSE3 ---------------------------------------------------------
|
||||
#define VO16( _op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, ov)
|
||||
#define VO32( _op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, ov)
|
||||
// Plain 128-bit store hooks (SSE2 / NEON build): store the unpacked vector `ov`
// at _op_ and post-increment _op_. The 16-bit and 32-bit hooks are identical
// because the store is element-size agnostic. _i_, _nb_ and _parm_ are unused.
#define VO16( _op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, ov)
|
||||
#define VO32( _op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, ov)
|
||||
#include "bitunpack_.h"
|
||||
|
||||
#define VOZ16(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define VOZ32(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define BITUNBLK128V16_0(ip, _i_, _op_, _parm_) {__m128i ov;\
|
||||
VOZ16(_op_, 0, ov, _parm_);\
|
||||
VOZ16(_op_, 1, ov, _parm_);\
|
||||
VOZ16(_op_, 2, ov, _parm_);\
|
||||
VOZ16(_op_, 3, ov, _parm_);\
|
||||
VOZ16(_op_, 4, ov, _parm_);\
|
||||
VOZ16(_op_, 5, ov, _parm_);\
|
||||
VOZ16(_op_, 6, ov, _parm_);\
|
||||
VOZ16(_op_, 7, ov, _parm_);\
|
||||
VOZ16(_op_, 8, ov, _parm_);\
|
||||
VOZ16(_op_, 9, ov, _parm_);\
|
||||
VOZ16(_op_, 10, ov, _parm_);\
|
||||
VOZ16(_op_, 11, ov, _parm_);\
|
||||
VOZ16(_op_, 12, ov, _parm_);\
|
||||
VOZ16(_op_, 13, ov, _parm_);\
|
||||
VOZ16(_op_, 14, ov, _parm_);\
|
||||
VOZ16(_op_, 15, ov, _parm_);\
|
||||
/*VOZ16(_op_, 16, ov, _parm_);\
|
||||
VOZ16(_op_, 17, ov, _parm_);\
|
||||
VOZ16(_op_, 18, ov, _parm_);\
|
||||
VOZ16(_op_, 19, ov, _parm_);\
|
||||
VOZ16(_op_, 20, ov, _parm_);\
|
||||
VOZ16(_op_, 21, ov, _parm_);\
|
||||
VOZ16(_op_, 22, ov, _parm_);\
|
||||
VOZ16(_op_, 23, ov, _parm_);\
|
||||
VOZ16(_op_, 24, ov, _parm_);\
|
||||
VOZ16(_op_, 25, ov, _parm_);\
|
||||
VOZ16(_op_, 26, ov, _parm_);\
|
||||
VOZ16(_op_, 27, ov, _parm_);\
|
||||
VOZ16(_op_, 28, ov, _parm_);\
|
||||
VOZ16(_op_, 29, ov, _parm_);\
|
||||
VOZ16(_op_, 30, ov, _parm_);\
|
||||
VOZ16(_op_, 31, ov, _parm_);*/\
|
||||
// Store hooks used by the BITUNBLK128V16_0 / BITUNBLK128V32_0 blocks: write
// _parm_ (not `ov`) to _op_ and post-increment _op_.
#define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define BITUNBLK128V16_0(ip, _i_, _op_, _nb_,_parm_) {__m128i ov;\
|
||||
VOZ16(_op_, 0, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 1, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 2, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 3, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 4, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 5, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 6, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 7, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 8, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 9, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 10, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 11, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 12, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 13, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 14, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 15, ov, _nb_,_parm_);\
|
||||
/*VOZ16(_op_, 16, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 17, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 18, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 19, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 20, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 21, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 22, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 23, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 24, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 25, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 26, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 27, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 28, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 29, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 30, ov, _nb_,_parm_);\
|
||||
VOZ16(_op_, 31, ov, _nb_,_parm_);*/\
|
||||
}
|
||||
|
||||
#define BITUNBLK128V32_0(ip, _i_, _op_, _parm_) {__m128i ov;\
|
||||
VOZ32(_op_, 0, ov, _parm_);\
|
||||
VOZ32(_op_, 1, ov, _parm_);\
|
||||
VOZ32(_op_, 2, ov, _parm_);\
|
||||
VOZ32(_op_, 3, ov, _parm_);\
|
||||
VOZ32(_op_, 4, ov, _parm_);\
|
||||
VOZ32(_op_, 5, ov, _parm_);\
|
||||
VOZ32(_op_, 6, ov, _parm_);\
|
||||
VOZ32(_op_, 7, ov, _parm_);\
|
||||
VOZ32(_op_, 8, ov, _parm_);\
|
||||
VOZ32(_op_, 9, ov, _parm_);\
|
||||
VOZ32(_op_, 10, ov, _parm_);\
|
||||
VOZ32(_op_, 11, ov, _parm_);\
|
||||
VOZ32(_op_, 12, ov, _parm_);\
|
||||
VOZ32(_op_, 13, ov, _parm_);\
|
||||
VOZ32(_op_, 14, ov, _parm_);\
|
||||
VOZ32(_op_, 15, ov, _parm_);\
|
||||
VOZ32(_op_, 16, ov, _parm_);\
|
||||
VOZ32(_op_, 17, ov, _parm_);\
|
||||
VOZ32(_op_, 18, ov, _parm_);\
|
||||
VOZ32(_op_, 19, ov, _parm_);\
|
||||
VOZ32(_op_, 20, ov, _parm_);\
|
||||
VOZ32(_op_, 21, ov, _parm_);\
|
||||
VOZ32(_op_, 22, ov, _parm_);\
|
||||
VOZ32(_op_, 23, ov, _parm_);\
|
||||
VOZ32(_op_, 24, ov, _parm_);\
|
||||
VOZ32(_op_, 25, ov, _parm_);\
|
||||
VOZ32(_op_, 26, ov, _parm_);\
|
||||
VOZ32(_op_, 27, ov, _parm_);\
|
||||
VOZ32(_op_, 28, ov, _parm_);\
|
||||
VOZ32(_op_, 29, ov, _parm_);\
|
||||
VOZ32(_op_, 30, ov, _parm_);\
|
||||
VOZ32(_op_, 31, ov, _parm_);\
|
||||
#define BITUNBLK128V32_0(ip, _i_, _op_, _nb_,_parm_) {__m128i ov;\
|
||||
VOZ32(_op_, 0, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 1, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 2, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 3, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 4, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 5, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 6, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 7, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 8, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 9, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 10, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 11, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 12, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 13, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 14, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 15, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 16, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 17, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 18, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 19, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 20, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 21, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 22, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 23, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 24, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 25, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 26, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 27, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 28, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 29, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 30, ov, _nb_,_parm_);\
|
||||
VOZ32(_op_, 31, ov, _nb_,_parm_);\
|
||||
}
|
||||
// Plain 128-bit variant: sets _parm_ to the all-zero vector.
#define BITUNPACK0(_parm_) _parm_ = _mm_setzero_si128()
|
||||
|
||||
@ -691,8 +691,8 @@ unsigned char *bitunpack256w32( const unsigned char *__restrict in, unsigned n,
|
||||
// STOZ64: stores the same 128-bit vector _ov_ twice, advancing _op_ by two vectors.
#define STOZ64(_op_, _ov_) _mm_storeu_si128(_op_++, _ov_); _mm_storeu_si128(_op_++, _ov_)
|
||||
// STO64: widens the four 32-bit lanes of _ov_ to four 64-bit values by
// interleaving them with _zv_ (a zero vector at the call sites shown):
// lanes 0-1 go to the first store; lanes 2-3 are shifted down by 8 bytes and go
// to the second store. Expands to two statements; _ov_ and _zv_ are expanded twice.
#define STO64( _op_, _ov_, _zv_) _mm_storeu_si128(_op_++, _mm_unpacklo_epi32(_ov_,_zv_));_mm_storeu_si128(_op_++, _mm_unpacklo_epi32(_mm_srli_si128(_ov_,8),_zv_))
|
||||
|
||||
#define VOZ32(_op_, _i_, ov, _parm_) STOZ64(_op_, _parm_)
|
||||
#define VO32( _op_, _i_, ov, _parm_) STO64(_op_, ov, zv)
|
||||
// Store hooks for 32-bit unpacking into 64-bit output:
//   VOZ32: writes _parm_ twice (STOZ64);
//   VO32 : widens `ov` to 64-bit lanes with STO64, using the caller's `zv`.
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) STOZ64(_op_, _parm_)
|
||||
#define VO32( _op_, _i_, ov, _nb_,_parm_) STO64(_op_, ov, zv)
|
||||
#include "bitunpack_.h"
|
||||
unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b) {
|
||||
if(b <= 32) { const unsigned char *ip = in+PAD8(128*b);
|
||||
@ -988,10 +988,10 @@ ALIGNED(char, _shuffle_16[256][16],16) = {
|
||||
#undef _
|
||||
#endif // SSSE3
|
||||
|
||||
#define VO16( _op_, _i_, _ov_, _parm_) m = *bb++; _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) )); pex += popcnt32(m)
|
||||
#define VO32( _op_, _i_, _ov_, _parm_) if((_i_) & 1) m = (*bb++) >> 4; else m = (*bb) & 0xf; _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) )); pex += popcnt32(m)
|
||||
#define VOZ16(_op_, _i_, _ov_, _parm_) m = *bb++; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m)
|
||||
#define VOZ32(_op_, _i_, _ov_, _parm_) if((_i_) & 1) m = (*bb++) >> 4; else m = (*bb) & 0xf; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) ); pex += popcnt32(m)
|
||||
// Exception-merging store hooks, 128-bit (SSSE3 / NEON shuffle path).
// Mask fetch:
//   16-bit: one mask byte per vector (m = *bb++).
//   32-bit: one nibble per vector - low nibble of *bb when _i_ is even, high
//           nibble when _i_ is odd; bb advances only after the odd vector.
// popcnt32(m) values are read from pex and routed to their lanes by a byte
// shuffle taken from _shuffle_16[m] / _shuffle_32[m] (table contents not visible
// here), then pex advances by popcnt32(m).
//   VO16/VO32  : exceptions are shifted left before the shuffle and added to _ov_.
//   VOZ16/VOZ32: the shuffled exceptions are stored as they are.
// Shift count: `_nb_>15?0:_nb_` (resp. 31) - for _nb_ at or above the lane width
// the count used is 0, not _nb_.
// NOTE(review): _nb_ is not parenthesised inside the conditional; pass a simple
// expression. The 32-bit hooks start with an unbraced if/else, so they must be
// used as full statements. m, bb, pex come from the enclosing function.
#define VO16( _op_, _i_, _ov_, _nb_,_parm_) m = *bb++; _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_>15?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) )); pex += popcnt32(m)
|
||||
#define VO32( _op_, _i_, _ov_, _nb_,_parm_) if((_i_) & 1) m = (*bb++) >> 4; else m = (*bb) & 0xf; _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_>31?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) )); pex += popcnt32(m)
|
||||
#define VOZ16(_op_, _i_, _ov_, _nb_,_parm_) m = *bb++; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m)
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) if((_i_) & 1) m = (*bb++) >> 4; else m = (*bb) & 0xf; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) ); pex += popcnt32(m)
|
||||
#define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128()
|
||||
#include "bitunpack_.h"
|
||||
|
||||
@ -1011,8 +1011,8 @@ unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n,
|
||||
//#define STOZ64(_op_, _ov_) _mm_storeu_si128(_op_++, _ov_); _mm_storeu_si128(_op_++, _ov_)
|
||||
// Widens the four 32-bit lanes of _ov_ to four 64-bit values by interleaving
// with _zv_: lanes 0-1 in the first store, lanes 2-3 (shifted down 8 bytes) in
// the second. Expands to two statements.
#define STO64( _op_, _ov_, _zv_) _mm_storeu_si128(_op_++, _mm_unpacklo_epi32(_ov_,_zv_));_mm_storeu_si128(_op_++, _mm_unpacklo_epi32(_mm_srli_si128(_ov_,8),_zv_))
|
||||
|
||||
#define VO32( _op_, _i_, _ov_, _parm_) if((_i_) & 1) m = (*bb++) >> 4; else m = (*bb) & 0xf; { __m128i _wv = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) ); STO64(_op_, _wv, zv);} pex += popcnt32(m)
|
||||
#define VOZ32(_op_, _i_, _ov_, _parm_) if((_i_) & 1) m = (*bb++) >> 4; else m = (*bb) & 0xf; { __m128i _wv = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) ; STO64(_op_, _wv, zv);} pex += popcnt32(m)
|
||||
// Exception-merging store hooks for 32-bit unpacking into 64-bit output.
// The mask is a nibble per vector (low nibble for even _i_, high nibble for odd
// _i_, after which bb advances). The exceptions from pex are routed to their
// lanes via _shuffle_32[m]; VO32 additionally shifts them left by
// `_nb_>31?0:_nb_` and adds _ov_. The 32-bit result _wv is then widened and
// written as two vectors by STO64 with the caller's `zv`. pex advances by popcnt32(m).
// Each hook expands to an unbraced if/else followed by further statements, so it
// must be used as a full statement sequence.
#define VO32( _op_, _i_, _ov_, _nb_,_parm_) if((_i_) & 1) m = (*bb++) >> 4; else m = (*bb) & 0xf; { __m128i _wv = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_>31?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) ); STO64(_op_, _wv, zv);} pex += popcnt32(m)
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) if((_i_) & 1) m = (*bb++) >> 4; else m = (*bb) & 0xf; { __m128i _wv = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) ; STO64(_op_, _wv, zv);} pex += popcnt32(m)
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
#include "bitunpack_.h"
|
||||
@ -1027,10 +1027,10 @@ unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n,
|
||||
#undef BITUNPACK0
|
||||
|
||||
//-------------------------------------------------------------------
|
||||
#define VOZ16(_op_, _i_, _ov_, _parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define VOZ32(_op_, _i_, _ov_, _parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define VO16( _op_, _i_, _ov_, _sv_) _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) _ov_ = mm_zzagd_epi32(_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
|
||||
// Zigzag-delta store hooks, 128-bit.
//   VOZ16/VOZ32: store _parm_ unchanged.
//   VO16/VO32  : zigzag-decode _ov_ in place (mm_zzagd_epi16/32), fold it into the
//                running vector _sv_ with mm_scan_epi16/32 (both defined
//                elsewhere), then store the new _sv_.
// BITUNPACK0 is a no-op for this variant.
#define VOZ16(_op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_)
|
||||
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
|
||||
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) _ov_ = mm_zzagd_epi32(_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) {
|
||||
@ -1040,8 +1040,8 @@ unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VO32(_op_, i, _ov_, _sv_) _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
|
||||
#define VO16(_op_, i, _ov_, _sv_) _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
|
||||
// Delta store hooks, 128-bit: fold _ov_ into the running vector _sv_ with
// mm_scan_epi32 / mm_scan_epi16 (defined elsewhere) and store the new _sv_.
// VOZ16/VOZ32 are not redefined here, so the previous definitions stay in effect.
#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
|
||||
#define VO16(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_)
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
|
||||
@ -1051,8 +1051,8 @@ unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
#define VO32( _op_, _i_, _ov_, _parm_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, sv))
|
||||
#define VO16( _op_, _i_, _ov_, _parm_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, sv))
|
||||
// FOR store hooks, 128-bit: store _ov_ + sv using 32-bit resp. 16-bit lane adds.
// NOTE(review): the base vector is the caller's `sv`, taken by name; the _parm_
// argument is ignored.
#define VO32( _op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, sv))
|
||||
#define VO16( _op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, sv))
|
||||
#include "bitunpack_.h"
|
||||
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
|
||||
@ -1063,15 +1063,15 @@ unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n,
|
||||
}
|
||||
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
// 32-bit lane helpers that merge extra values read from pex into the unpacked vector _ov_.
// All of them use m, bb and pex from the expansion site (they are not macro parameters).
//
// VX32:  picks a 4-bit mask m from *bb — for even _i_ the low nibble (bb is not advanced),
//        for odd _i_ the high nibble (bb is advanced). It then loads 128 bits from pex,
//        shifts each 32-bit lane left, permutes the bytes with _shuffle_32[m], adds the
//        result into _ov_, and advances pex by popcnt32(m).
// VXZ32: same mask selection and pex advance, but _ov_ is overwritten with the permuted
//        load (no shift, no add).
// VO32 / VOZ32: run VX32 / VXZ32, then _sv_ = mm_scan_epi32(_ov_, _sv_) and store _sv_ at _op_++.
//
// Two spellings are listed: one shifts by the identifier b from the expansion site, the
// other takes the shift count as _nb_ and uses 0 when _nb_ is greater than 31.
// NOTE(review): _nb_ is pasted unparenthesized into `_nb_>31?0:_nb_`; pass a plain
// identifier or constant, not a compound expression.
// NOTE(review): these expand to several statements (VX32/VXZ32 start with a bare if/else)
// and are not wrapped in do { } while (0).
#define VX32(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m)
|
||||
#define VXZ32(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m])); pex += popcnt32(m)
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VX32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_>31?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m)
|
||||
#define VXZ32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m])); pex += popcnt32(m)
|
||||
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32(_i_, _nb_,_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
|
||||
// 16-bit lane counterparts of the merge helpers; m, bb and pex come from the expansion site.
//
// VX16:  reads a whole byte mask m = *bb++ on every call (_i_ is unused), loads 128 bits
//        from pex, shifts each 16-bit lane left, permutes with _shuffle_16[m], adds into
//        _ov_, and advances pex by popcnt32(m).
// VXZ16: same mask read and pex advance, but _ov_ is overwritten with the permuted load.
// VO16 / VOZ16: run VX16 / VXZ16, then _sv_ = mm_scan_epi16(_ov_, _sv_) and store _sv_ at _op_++.
//
// Two spellings are listed: one shifts by the identifier b from the expansion site, the
// other takes the count as _nb_ and uses 0 when _nb_ is greater than 15.
// NOTE(review): _nb_ is unparenthesized inside `_nb_>15?0:_nb_`; pass a simple operand.
#define VX16(_i_, _ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m)
|
||||
#define VXZ16(_i_, _ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ); pex += popcnt32(m)
|
||||
#define VO16( _op_, _i_, _ov_, _sv_) VX16( _i_, _ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, _ov_, _sv_) VXZ16( _i_, _ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VX16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_>15?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m)
|
||||
#define VXZ16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ); pex += popcnt32(m)
|
||||
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, _ov_, _nb_,_sv_) VXZ16( _i_, _nb_,_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#include "bitunpack_.h"
|
||||
// BITUNPACK0 expands to nothing here.
#define BITUNPACK0(_parm_)
|
||||
unsigned char *_bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) {
|
||||
@ -1082,21 +1082,21 @@ unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n
|
||||
}
|
||||
|
||||
// Disabled code: everything up to the closing "}*/" is inside one block comment and is not compiled.
// NOTE(review): the 5-parameter VO32/VOZ32 spellings inside still invoke VX32/VXZ32 with
// two arguments (_i_, _ov_); update those calls before re-enabling this against a
// three-parameter VX32/VXZ32.
/*
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); mm_scan_epi32(_ov_,_sv_); STO64( _op_, _sv_) //_mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32( _i_, _ov_); mm_scan_epi32(_ov_,_sv_); STOZ64( _op_, _sv_, zv) //_mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _ov_); mm_scan_epi32(_ov_,_sv_); STO64( _op_, _sv_) //_mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32( _i_, _ov_); mm_scan_epi32(_ov_,_sv_); STOZ64( _op_, _sv_, zv) //_mm_storeu_si128(_op_++, _sv_);
|
||||
unsigned char *_bitdunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, uint64_t *__restrict pex, unsigned char *bb) {
|
||||
const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start),zv = _mm_setzero_si128(); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
|
||||
}*/
|
||||
|
||||
// 16-bit lane helpers for the variant that passes _ov_ through mm_zzagd_epi16 before scanning.
// m, bb and pex come from the expansion site.
//
// VX16:  m = *bb++; load 128 bits from pex, shift each 16-bit lane left, permute with
//        _shuffle_16[m], add into _ov_; pex advances by popcnt32(m).
// VXZ16: same mask read and pex advance; _ov_ is overwritten with the permuted load.
// VO16 / VOZ16: VX16 / VXZ16, then _ov_ = mm_zzagd_epi16(_ov_), then
//        _sv_ = mm_scan_epi16(_ov_, _sv_), then store _sv_ at _op_++.
// NOTE(review): mm_zzagd_epi16 is defined elsewhere; presumably a zigzag decode — confirm.
//
// Two spellings are listed: shift by the outer identifier b, or by _nb_ with 0 used
// when _nb_ is greater than 15 (_nb_ is unparenthesized in that expression).
#define VX16(_i_, _ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m)
|
||||
#define VXZ16(_i_, _ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ); pex += popcnt32(m)
|
||||
#define VO16( _op_, _i_, _ov_, _sv_) VX16( _i_, _ov_); _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, _ov_, _sv_) VXZ16( _i_, _ov_); _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VX16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_>15?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m)
|
||||
#define VXZ16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ); pex += popcnt32(m)
|
||||
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_); _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, _ov_, _nb_,_sv_) VXZ16( _i_, _nb_,_ov_); _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
|
||||
// 32-bit lane helpers for the variant that passes _ov_ through mm_zzagd_epi32 before scanning.
// m, bb and pex come from the expansion site.
//
// VX32:  even _i_ takes the low nibble of *bb as m (bb not advanced), odd _i_ takes the
//        high nibble and advances bb; then load 128 bits from pex, shift each 32-bit lane
//        left, permute with _shuffle_32[m], add into _ov_; pex advances by popcnt32(m).
// VXZ32: same mask selection and pex advance; _ov_ is overwritten with the permuted load.
// VO32 / VOZ32: VX32 / VXZ32, then _ov_ = mm_zzagd_epi32(_ov_), then
//        _sv_ = mm_scan_epi32(_ov_, _sv_), then store _sv_ at _op_++.
// NOTE(review): mm_zzagd_epi32 is defined elsewhere; presumably a zigzag decode — confirm.
//
// Two spellings are listed: shift by the outer identifier b, or by _nb_ with 0 used
// when _nb_ is greater than 31 (_nb_ is unparenthesized in that expression).
#define VX32(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) ); pex += popcnt32(m)
|
||||
#define VXZ32(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ); pex += popcnt32(m)
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _ov_ = mm_zzagd_epi32(_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32(_i_, _ov_); _ov_ = mm_zzagd_epi32(_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VX32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_>31?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ) ); pex += popcnt32(m)
|
||||
#define VXZ32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m]) ); pex += popcnt32(m)
|
||||
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _ov_ = mm_zzagd_epi32(_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32(_i_, _nb_,_ov_); _ov_ = mm_zzagd_epi32(_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack_.h"
|
||||
// BITUNPACK0 expands to nothing here.
#define BITUNPACK0(_parm_)
|
||||
@ -1108,10 +1108,10 @@ unsigned char *_bitzunpack128v32( const unsigned char *__restrict in, unsigned n
|
||||
}
|
||||
#endif
|
||||
|
||||
// Output steps that use a step vector cv taken from the expansion site.
//
// VO16 / VO32:   _sv_ = mm_scani_epi16/32(_ov_, _sv_, cv), then store _sv_ at _op_++.
//                i and _nb_ are unused.
// VOZ16 / VOZ32: store _parm_ at _op_++ first, then add cv to _parm_. The ov and _nb_
//                parameters are unused, so the unpacked vector does not contribute.
// NOTE(review): mm_scani_epi16/32 are defined elsewhere; presumably a running sum that
// also adds the increments in cv — confirm.
// Both the 4-parameter and the 5-parameter (_nb_) spellings are listed; bodies are identical.
#define VO16(_op_, i, _ov_, _sv_) _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32(_op_, i, _ov_, _sv_) _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv)
|
||||
#define VOZ32(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv)
|
||||
#define VO16(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv)
|
||||
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv)
|
||||
#include "bitunpack_.h"
|
||||
// Adds cv to _parm_ (16-bit lanes), then sets every 16-bit lane of cv to 8.
#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi16(_parm_, cv); cv = _mm_set1_epi16(8)
|
||||
unsigned char *bitd1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
|
||||
@ -1122,10 +1122,10 @@ unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n
|
||||
const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
// Output steps built on ADDI16x8 / ADDI32x4, with the step vector cv taken from the expansion site.
//
// VO16 / VO32:   invoke ADDI16x8 / ADDI32x4(_ov_, _sv_, cv), then store _sv_ at _op_++.
//                i and _nb_ are unused.
// VOZ16 / VOZ32: store _parm_ at _op_++ first, then add cv to _parm_; ov and _nb_ are unused.
// NOTE(review): ADDI16x8 / ADDI32x4 are defined elsewhere; presumably they update _sv_
// in place from _ov_ and cv — confirm.
// Both the 4-parameter and the 5-parameter (_nb_) spellings are listed; bodies are identical.
#define VO16(_op_, i, _ov_, _sv_) ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32(_op_, i, _ov_, _sv_) ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv)
|
||||
#define VOZ32(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv)
|
||||
#define VO16(_op_, i, _ov_, _nb_,_sv_) ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32(_op_, i, _ov_, _nb_,_sv_) ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv)
|
||||
#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv)
|
||||
#include "bitunpack_.h"
|
||||
// Adds cv to _parm_ (16-bit lanes), then sets every 16-bit lane of cv to 8.
#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi16(_parm_, cv); cv = _mm_set1_epi16(8)
|
||||
unsigned char *bits1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) {
|
||||
@ -1136,9 +1136,9 @@ unsigned char *bits1unpack128v32( const unsigned char *__restrict in, unsigned n
|
||||
const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
// Output steps that store (_ov_ + _sv_) and then step _sv_ by cv (cv comes from the expansion site).
//
// VO16 / VO32: store _ov_ + _sv_ at _op_++, then _sv_ += cv. The stored value uses _sv_
//              from before the increment. _i_ and _nb_ are unused.
// VOZ32:       store _sv_ at _op_++, then _sv_ += cv; _ov_ is unused.
// NOTE(review): no VOZ16 is defined in this group; if bitunpack_.h expands VOZ16 here it
// would have to come from an earlier definition — confirm.
// Both the 4-parameter and the 5-parameter (_nb_) spellings are listed; bodies are identical.
#define VO16( _op_, _i_, _ov_, _sv_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, _sv_)); _sv_ = _mm_add_epi16(_sv_, cv)
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _sv_)); _sv_ = _mm_add_epi32(_sv_, cv)
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) _mm_storeu_si128(_op_++, _sv_); _sv_ = _mm_add_epi32(_sv_, cv);
|
||||
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, _sv_)); _sv_ = _mm_add_epi16(_sv_, cv)
|
||||
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _sv_)); _sv_ = _mm_add_epi32(_sv_, cv)
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _sv_); _sv_ = _mm_add_epi32(_sv_, cv);
|
||||
#include "bitunpack_.h"
|
||||
// BITUNPACK0 expands to nothing here.
#define BITUNPACK0(_parm_)
|
||||
unsigned char *bitf1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) {
|
||||
@ -1149,15 +1149,15 @@ unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n
|
||||
}
|
||||
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
// Merge helpers that fold values read from pex into the unpacked vector _ov_.
// m, bb and pex are identifiers from the expansion site, not macro parameters.
//
// VX16:  m = *bb++; load 128 bits from pex, shift each 16-bit lane left, permute with
//        _shuffle_16[m], add into _ov_; pex advances by popcnt32(m). _i_ is unused.
// VX32:  even _i_ takes the low nibble of *bb as m (bb not advanced), odd _i_ takes the
//        high nibble and advances bb; then the same load/shift/permute/add with 32-bit
//        lanes and _shuffle_32[m].
// VXZ16 / VXZ32: same mask handling and pex advance, but _ov_ is overwritten with the
//        permuted load (no shift, no add).
//
// Two spellings are listed: shift by the outer identifier b, or by _nb_ with 0 used when
// _nb_ exceeds 15 (16-bit) / 31 (32-bit).
// NOTE(review): _nb_ is unparenthesized in `_nb_>15?0:_nb_` / `_nb_>31?0:_nb_`; pass a
// simple operand.
#define VX16(_i_, _ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_16[m]))); pex += popcnt32(m)
|
||||
#define VX32(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m)
|
||||
#define VXZ16(_i_, _ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m])); pex += popcnt32(m)
|
||||
#define VXZ32(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m])); pex += popcnt32(m)
|
||||
#define VX16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_>15?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]))); pex += popcnt32(m)
|
||||
#define VX32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_>31?0:_nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m)
|
||||
#define VXZ16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m])); pex += popcnt32(m)
|
||||
#define VXZ32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m])); pex += popcnt32(m)
|
||||
|
||||
// Output steps that first merge values from pex (via VX16/VXZ16/VX32/VXZ32) and then scan
// with the step vector cv from the expansion site:
//   VOxx / VOZxx: VXxx / VXZxx on _ov_, then _sv_ = mm_scani_epi16/32(_ov_, _sv_, cv),
//                 then store _sv_ at _op_++.
// The 4-parameter spellings call the two-argument VX*/VXZ* form; the 5-parameter
// spellings forward _nb_ to the three-argument form.
// NOTE(review): mm_scani_epi16/32 are defined elsewhere; presumably a running sum that
// also adds the increments in cv — confirm.
#define VO16( _op_, _i_, _ov_, _sv_) VX16( _i_, _ov_); _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, _ov_, _sv_) VXZ16( _i_, _ov_); _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32( _i_, _ov_); _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_); _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, _ov_, _nb_,_sv_) VXZ16( _i_, _nb_,_ov_); _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32( _i_, _nb_,_ov_); _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack_.h"
|
||||
// Zeroes mv (an identifier from the expansion site); the _parm_ argument is not used.
#define BITUNPACK0(_parm_) mv = _mm_setzero_si128() //_parm_ = _mm_setzero_si128()
|
||||
@ -1169,10 +1169,10 @@ unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned
|
||||
const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32( 4,3,2,1); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip;
|
||||
}
|
||||
|
||||
// Output steps that first merge values from pex (via VX16/VXZ16/VX32/VXZ32) and then
// apply ADDI16x8 / ADDI32x4 with the step vector cv from the expansion site:
//   VOxx / VOZxx: VXxx / VXZxx on _ov_, then ADDI16x8/ADDI32x4(_ov_, _sv_, cv),
//                 then store _sv_ at _op_++.
// The 4-parameter spellings call the two-argument VX*/VXZ* form; the 5-parameter
// spellings forward _nb_ to the three-argument form.
// NOTE(review): ADDI16x8 / ADDI32x4 are defined elsewhere; presumably they update _sv_
// in place from _ov_ and cv — confirm.
#define VO16( _op_, _i_, _ov_, _sv_) VX16( _i_, _ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, _ov_, _sv_) VXZ16( _i_, _ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32( _op_, _i_, _ov_, _sv_) VX32( _i_, _ov_); ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _sv_) VXZ32( _i_, _ov_); ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ16(_op_, _i_, _ov_, _nb_,_sv_) VXZ16( _i_, _nb_,_ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
#define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32( _i_, _nb_,_ov_); ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack_.h"
|
||||
// Zeroes mv (an identifier from the expansion site); the _parm_ argument is not used.
#define BITUNPACK0(_parm_) mv = _mm_setzero_si128() //_parm_ = _mm_setzero_si128()
|
||||
|
Reference in New Issue
Block a user