diff --git a/bitunpack.c b/bitunpack.c index 0b531ac..4d839bd 100644 --- a/bitunpack.c +++ b/bitunpack.c @@ -1,6 +1,6 @@ /** - Copyright (C) powturbo 2013-2016 - GPL v2 License + Copyright (C) powturbo 2013-2017 + GPL v2 License This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,233 +20,706 @@ - github : https://github.com/powturbo - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com -**/ +**/ // "Integer Compression" Bit Packing - - #ifndef BPI -#include "conf.h" -#include "bitutil.h" +#include +#include "conf.h" +#include "bitutil.h" #include "bitpack.h" -#define PAD8(__x) (((__x)+7)/8) +#define PAD8(_x_) (((_x_)+7)/8) + +#pragma GCC push_options +#pragma GCC optimize ("align-functions=16") #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunsequenced" -//----------------------------------------------------------------------------------------------------------------- -#define DSTI(__op) -#define BPI(__w, __x, __parm) __w -#include "bitunpack.c" -unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return (unsigned char *)ip; } -unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return (unsigned char *)ip; } -unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK64(in, n, b, out, 0); return (unsigned char *)ip; } -#undef BPI -#undef DSTI -//----------------------------------------------------------------------------------------------------------------- -#define DSTI(__op) -#define BPI(__w, __x, __parm) (__parm += (__w) + 1) -#include "bitunpack.c" -unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -unsigned char *bitd1unpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -#undef BPI -#undef DSTI +typedef unsigned char *(*BITUNPACK_F8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out); +typedef unsigned char *(*BITUNPACK_D8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start); +typedef unsigned char *(*BITUNPACK_F16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out); +typedef unsigned char *(*BITUNPACK_D16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); +typedef unsigned char *(*BITUNPACK_F32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out); +typedef unsigned char *(*BITUNPACK_D32)(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); +typedef unsigned char *(*BITUNPACK_F64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out); +typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); + +#define PREFETCH(_ip_) __builtin_prefetch(_ip_+512)//#define PREFETCH(ip) + + #if 0 +#define OP(_op_, _x_) *_op_++ +#define OPX(_op_) + #else +#define OP(_op_, _x_) _op_[_x_] +#define OPX(_op_) _op_ += 32 + #endif -//------------------------------------------------------------------------------------------ -#define DSTI(__op) -#define BPI(__w, __x, __parm) (__parm += (__w)) -#include "bitunpack.c" -unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -unsigned char *bitdunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -#undef BPI -#undef DSTI +#define OPI(_op_,_parm_) OPX(_op_) +#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = _w_ +#define _BITUNPACK_ bitunpack +#include "bitunpack_.h" +#define DELTA -//------------------------------------------------------------------------------------------ -#define DSTI(__op) -#define BPI(__w, __x, __parm) (__parm += zigzagdec32(__w)) -#include "bitunpack.c" -unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -//unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } -#undef BPI -#undef DSTI +#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = (_parm_ += (_w_)) +#define _BITUNPACK_ bitdunpack // delta + 0 +#include "bitunpack_.h" + +#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = (_parm_ += zigzagdec32(_w_)) +#define _BITUNPACK_ bitzunpack // zigzag +#include "bitunpack_.h" + +#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = (_parm_ + (_w_)) +#define _BITUNPACK_ bitfunpack // for +#include "bitunpack_.h" -//------------------------------------------------------------------------------------------ -#define DSTI(__op) -#define BPI(__w, __x, __parm) (__parm + (__w)) -#include "bitunpack.c" +#define OPI(_op_,_parm_) OPX(_op_); _parm_ += 32 +#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = (_parm_ += (_w_)) + (_x_+1) +#define _BITUNPACK_ bitd1unpack // delta + 1 +#include "bitunpack_.h" -unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -#undef BPI -#undef DSTI +#define OUT( _op_, _x_, _w_, _parm_) OP(_op_,_x_) = _parm_ + (_w_)+(_x_+1) +#define _BITUNPACK_ bitf1unpack // for + 1 +#include "bitunpack_.h" +#undef OPI -//------------------------------------------------------------------------------------------ -#define DSTI(__op) start += 32 -#define BPI(__w, __x, __parm) (__parm + (__w)+__x+1) -#include "bitunpack.c" -unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } -#undef BPI -#undef DSTI +#define BITNUNPACK(in, n, out, csize, usize) {\ + for(op = out,out+=n; op < out;) { unsigned oplen = out - op; if(oplen > csize) oplen = csize; __builtin_prefetch(in+512);\ + unsigned b = *in++; in = TEMPLATE2(bitunpacka, usize)[b](in, csize, op);\ + op += csize;\ + } return in;\ +} +#define BITNDUNPACK(in, n, out, csize, usize, _start_, _bitunpacka_) {\ + for(op = out,out+=n; op < out;) { unsigned oplen = out - op; if(oplen > csize) oplen = csize; __builtin_prefetch(in+512);\ + unsigned b = *in++; in = TEMPLATE2(_bitunpacka_, usize)[b](in, csize, op, _start_);\ + op += csize;\ + start = op[-1];\ + } return in;\ +} +unsigned char *bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op; BITNUNPACK(in, n, out, 128, 8); } +unsigned char *bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op; BITNUNPACK(in, n, out, 128, 16); } +unsigned char *bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op; BITNUNPACK(in, n, out, 128, 32); } +unsigned char *bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op; BITNUNPACK(in, n, out, 128, 64); } + +unsigned char *bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out, uint8_t start) { uint8_t *op; BITNDUNPACK(in, n, out, 128, 8, start, bitdunpacka); } +unsigned char *bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out, uint16_t start) { uint16_t *op; BITNDUNPACK(in, n, out, 128, 16, start, bitdunpacka); } +unsigned char *bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start) { uint32_t *op; BITNDUNPACK(in, n, out, 128, 32, start, bitdunpacka); } +unsigned char *bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out, uint64_t start) { uint64_t *op; BITNDUNPACK(in, n, out, 128, 64, start, bitdunpacka); } + +unsigned char *bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out, uint8_t start) { uint8_t *op; BITNDUNPACK(in, n, out, 128, 8, start, bitd1unpacka); } +unsigned char *bitnd1unpack16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out, uint16_t start) { uint16_t *op; BITNDUNPACK(in, n, out, 128, 16, start, bitd1unpacka); } +unsigned char *bitnd1unpack32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start) { uint32_t *op; BITNDUNPACK(in, n, out, 128, 32, start, bitd1unpacka); } +unsigned char *bitnd1unpack64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out, uint64_t start) { uint64_t *op; BITNDUNPACK(in, n, out, 128, 64, start, bitd1unpacka); } + +//-------------------------------------------------------------------------------------------------------------------------------------- +#ifdef __SSE2__ +#include + +#define VSTO( _op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, ov) +#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_) +#include "bitunpack_.h" + +#define BITUNBLK128V32_0(ip, _i_, _op_, _parm_) {__m128i ov;\ + VSTO0(_op_, 0, ov, _parm_);\ + VSTO0(_op_, 1, ov, _parm_);\ + VSTO0(_op_, 2, ov, _parm_);\ + VSTO0(_op_, 3, ov, _parm_);\ + VSTO0(_op_, 4, ov, _parm_);\ + VSTO0(_op_, 5, ov, _parm_);\ + VSTO0(_op_, 6, ov, _parm_);\ + VSTO0(_op_, 7, ov, _parm_);\ + VSTO0(_op_, 8, ov, _parm_);\ + VSTO0(_op_, 9, ov, _parm_);\ + VSTO0(_op_, 10, ov, _parm_);\ + VSTO0(_op_, 11, ov, _parm_);\ + VSTO0(_op_, 12, ov, _parm_);\ + VSTO0(_op_, 13, ov, _parm_);\ + VSTO0(_op_, 14, ov, _parm_);\ + VSTO0(_op_, 15, ov, _parm_);\ + VSTO0(_op_, 16, ov, _parm_);\ + VSTO0(_op_, 17, ov, _parm_);\ + VSTO0(_op_, 18, ov, _parm_);\ + VSTO0(_op_, 19, ov, _parm_);\ + VSTO0(_op_, 20, ov, _parm_);\ + VSTO0(_op_, 21, ov, _parm_);\ + VSTO0(_op_, 22, ov, _parm_);\ + VSTO0(_op_, 23, ov, _parm_);\ + VSTO0(_op_, 24, ov, _parm_);\ + VSTO0(_op_, 25, ov, _parm_);\ + VSTO0(_op_, 26, ov, _parm_);\ + VSTO0(_op_, 27, ov, _parm_);\ + VSTO0(_op_, 28, ov, _parm_);\ + VSTO0(_op_, 29, ov, _parm_);\ + VSTO0(_op_, 30, ov, _parm_);\ + VSTO0(_op_, 31, ov, _parm_);\ +} +#define BITUNPACK0(_parm_) _parm_ = _mm_setzero_si128() + +unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv; + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 +//----------------------------------------------------------------------------- + #ifdef __SSSE3__ +#include +static ALIGNED(char, shuffles[16][16], 16) = { + #define _ 0x80 + { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ }, + { 0,1,2,3, _,_,_,_, _,_, _, _, _, _, _,_ }, + { _,_,_,_, 0,1,2,3, _,_, _, _, _, _, _,_ }, + { 0,1,2,3, 4,5,6,7, _,_, _, _, _, _, _,_ }, + { _,_,_,_, _,_,_,_, 0,1, 2, 3, _, _, _,_ }, + { 0,1,2,3, _,_,_,_, 4,5, 6, 7, _, _, _,_ }, + { _,_,_,_, 0,1,2,3, 4,5, 6, 7, _, _, _,_ }, + { 0,1,2,3, 4,5,6,7, 8,9,10,11, _, _, _,_ }, + { _,_,_,_, _,_,_,_, _,_,_,_, 0, 1, 2, 3 }, + { 0,1,2,3, _,_,_,_, _,_,_, _, 4, 5, 6, 7 }, + { _,_,_,_, 0,1,2,3, _,_,_, _, 4, 5, 6, 7 }, + { 0,1,2,3, 4,5,6,7, _,_, _, _, 8, 9,10,11 }, + { _,_,_,_, _,_,_,_, 0,1, 2, 3, 4, 5, 6, 7 }, + { 0,1,2,3, _,_,_,_, 4,5, 6, 7, 8, 9,10,11 }, + { _,_,_,_, 0,1,2,3, 4,5, 6, 7, 8, 9,10,11 }, + { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, + #undef _ +}; + +#define VSTO( _op_, _i_, _ov_, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m) +#define VSTO0(_op_, _i_, ov, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) +#define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128() +#include "bitunpack_.h" + +unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(128*b); unsigned m; + __m128i sv; + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + #endif + +//----------------------------------------------------------------------------- +#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_) +#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG128x32(__ov); SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv) +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) + +unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi32(start); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef BITUNPACK0 + +//----------------------------------------------------------------------------- +#define VSTO(__op, i, __ov, __sv) SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv) +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) + +unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi32(start); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + +//----------------------------------------------------------------------------- + #ifdef __SSSE3__ +#define VEXP(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) +#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); + +#define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m) +#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); + +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) + +unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(128*b); unsigned m; + __m128i sv = _mm_set1_epi32(start); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + #endif +//----------------------------------------------------------------------------- +#define VSTO(__op, i, __ov, __sv) SCANI128x32(__ov,__sv,cv); _mm_storeu_si128(__op++, __sv); +#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv) +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4) + +unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 +//----------------------------------------------------------------------------- + #ifdef __SSSE3__ +#define VEXP(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) +#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI128x32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); + +#define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m) +#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCANI128x32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); + +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) mv = _mm_set1_epi32(0) //_parm_ = _mm_setzero_si128() + +unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(128*b); unsigned m; + __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + #endif +#endif + +//******************************************** AVX2 ***************************************** + #ifdef __AVX2__ +#include + + #ifdef __AVX512F__ +#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_) +#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_) + #else +static unsigned char permv[256][8] __attribute__((aligned(32))) = { +0,0,0,0,0,0,0,0, +0,1,1,1,1,1,1,1, +1,0,1,1,1,1,1,1, +0,1,2,2,2,2,2,2, +1,1,0,1,1,1,1,1, +0,2,1,2,2,2,2,2, +2,0,1,2,2,2,2,2, +0,1,2,3,3,3,3,3, +1,1,1,0,1,1,1,1, +0,2,2,1,2,2,2,2, +2,0,2,1,2,2,2,2, +0,1,3,2,3,3,3,3, +2,2,0,1,2,2,2,2, +0,3,1,2,3,3,3,3, +3,0,1,2,3,3,3,3, +0,1,2,3,4,4,4,4, +1,1,1,1,0,1,1,1, +0,2,2,2,1,2,2,2, +2,0,2,2,1,2,2,2, +0,1,3,3,2,3,3,3, +2,2,0,2,1,2,2,2, +0,3,1,3,2,3,3,3, +3,0,1,3,2,3,3,3, +0,1,2,4,3,4,4,4, +2,2,2,0,1,2,2,2, +0,3,3,1,2,3,3,3, +3,0,3,1,2,3,3,3, +0,1,4,2,3,4,4,4, +3,3,0,1,2,3,3,3, +0,4,1,2,3,4,4,4, +4,0,1,2,3,4,4,4, +0,1,2,3,4,5,5,5, +1,1,1,1,1,0,1,1, +0,2,2,2,2,1,2,2, +2,0,2,2,2,1,2,2, +0,1,3,3,3,2,3,3, +2,2,0,2,2,1,2,2, +0,3,1,3,3,2,3,3, +3,0,1,3,3,2,3,3, +0,1,2,4,4,3,4,4, +2,2,2,0,2,1,2,2, +0,3,3,1,3,2,3,3, +3,0,3,1,3,2,3,3, +0,1,4,2,4,3,4,4, +3,3,0,1,3,2,3,3, +0,4,1,2,4,3,4,4, +4,0,1,2,4,3,4,4, +0,1,2,3,5,4,5,5, +2,2,2,2,0,1,2,2, +0,3,3,3,1,2,3,3, +3,0,3,3,1,2,3,3, +0,1,4,4,2,3,4,4, +3,3,0,3,1,2,3,3, +0,4,1,4,2,3,4,4, +4,0,1,4,2,3,4,4, +0,1,2,5,3,4,5,5, +3,3,3,0,1,2,3,3, +0,4,4,1,2,3,4,4, +4,0,4,1,2,3,4,4, +0,1,5,2,3,4,5,5, +4,4,0,1,2,3,4,4, +0,5,1,2,3,4,5,5, +5,0,1,2,3,4,5,5, +0,1,2,3,4,5,6,6, +1,1,1,1,1,1,0,1, +0,2,2,2,2,2,1,2, +2,0,2,2,2,2,1,2, +0,1,3,3,3,3,2,3, +2,2,0,2,2,2,1,2, +0,3,1,3,3,3,2,3, +3,0,1,3,3,3,2,3, +0,1,2,4,4,4,3,4, +2,2,2,0,2,2,1,2, +0,3,3,1,3,3,2,3, +3,0,3,1,3,3,2,3, +0,1,4,2,4,4,3,4, +3,3,0,1,3,3,2,3, +0,4,1,2,4,4,3,4, +4,0,1,2,4,4,3,4, +0,1,2,3,5,5,4,5, +2,2,2,2,0,2,1,2, +0,3,3,3,1,3,2,3, +3,0,3,3,1,3,2,3, +0,1,4,4,2,4,3,4, +3,3,0,3,1,3,2,3, +0,4,1,4,2,4,3,4, +4,0,1,4,2,4,3,4, +0,1,2,5,3,5,4,5, +3,3,3,0,1,3,2,3, +0,4,4,1,2,4,3,4, +4,0,4,1,2,4,3,4, +0,1,5,2,3,5,4,5, +4,4,0,1,2,4,3,4, +0,5,1,2,3,5,4,5, +5,0,1,2,3,5,4,5, +0,1,2,3,4,6,5,6, +2,2,2,2,2,0,1,2, +0,3,3,3,3,1,2,3, +3,0,3,3,3,1,2,3, +0,1,4,4,4,2,3,4, +3,3,0,3,3,1,2,3, +0,4,1,4,4,2,3,4, +4,0,1,4,4,2,3,4, +0,1,2,5,5,3,4,5, +3,3,3,0,3,1,2,3, +0,4,4,1,4,2,3,4, +4,0,4,1,4,2,3,4, +0,1,5,2,5,3,4,5, +4,4,0,1,4,2,3,4, +0,5,1,2,5,3,4,5, +5,0,1,2,5,3,4,5, +0,1,2,3,6,4,5,6, +3,3,3,3,0,1,2,3, +0,4,4,4,1,2,3,4, +4,0,4,4,1,2,3,4, +0,1,5,5,2,3,4,5, +4,4,0,4,1,2,3,4, +0,5,1,5,2,3,4,5, +5,0,1,5,2,3,4,5, +0,1,2,6,3,4,5,6, +4,4,4,0,1,2,3,4, +0,5,5,1,2,3,4,5, +5,0,5,1,2,3,4,5, +0,1,6,2,3,4,5,6, +5,5,0,1,2,3,4,5, +0,6,1,2,3,4,5,6, +6,0,1,2,3,4,5,6, +0,1,2,3,4,5,6,7, +1,1,1,1,1,1,1,0, +0,2,2,2,2,2,2,1, +2,0,2,2,2,2,2,1, +0,1,3,3,3,3,3,2, +2,2,0,2,2,2,2,1, +0,3,1,3,3,3,3,2, +3,0,1,3,3,3,3,2, +0,1,2,4,4,4,4,3, +2,2,2,0,2,2,2,1, +0,3,3,1,3,3,3,2, +3,0,3,1,3,3,3,2, +0,1,4,2,4,4,4,3, +3,3,0,1,3,3,3,2, +0,4,1,2,4,4,4,3, +4,0,1,2,4,4,4,3, +0,1,2,3,5,5,5,4, +2,2,2,2,0,2,2,1, +0,3,3,3,1,3,3,2, +3,0,3,3,1,3,3,2, +0,1,4,4,2,4,4,3, +3,3,0,3,1,3,3,2, +0,4,1,4,2,4,4,3, +4,0,1,4,2,4,4,3, +0,1,2,5,3,5,5,4, +3,3,3,0,1,3,3,2, +0,4,4,1,2,4,4,3, +4,0,4,1,2,4,4,3, +0,1,5,2,3,5,5,4, +4,4,0,1,2,4,4,3, +0,5,1,2,3,5,5,4, +5,0,1,2,3,5,5,4, +0,1,2,3,4,6,6,5, +2,2,2,2,2,0,2,1, +0,3,3,3,3,1,3,2, +3,0,3,3,3,1,3,2, +0,1,4,4,4,2,4,3, +3,3,0,3,3,1,3,2, +0,4,1,4,4,2,4,3, +4,0,1,4,4,2,4,3, +0,1,2,5,5,3,5,4, +3,3,3,0,3,1,3,2, +0,4,4,1,4,2,4,3, +4,0,4,1,4,2,4,3, +0,1,5,2,5,3,5,4, +4,4,0,1,4,2,4,3, +0,5,1,2,5,3,5,4, +5,0,1,2,5,3,5,4, +0,1,2,3,6,4,6,5, +3,3,3,3,0,1,3,2, +0,4,4,4,1,2,4,3, +4,0,4,4,1,2,4,3, +0,1,5,5,2,3,5,4, +4,4,0,4,1,2,4,3, +0,5,1,5,2,3,5,4, +5,0,1,5,2,3,5,4, +0,1,2,6,3,4,6,5, +4,4,4,0,1,2,4,3, +0,5,5,1,2,3,5,4, +5,0,5,1,2,3,5,4, +0,1,6,2,3,4,6,5, +5,5,0,1,2,3,5,4, +0,6,1,2,3,4,6,5, +6,0,1,2,3,4,6,5, +0,1,2,3,4,5,7,6, +2,2,2,2,2,2,0,1, +0,3,3,3,3,3,1,2, +3,0,3,3,3,3,1,2, +0,1,4,4,4,4,2,3, +3,3,0,3,3,3,1,2, +0,4,1,4,4,4,2,3, +4,0,1,4,4,4,2,3, +0,1,2,5,5,5,3,4, +3,3,3,0,3,3,1,2, +0,4,4,1,4,4,2,3, +4,0,4,1,4,4,2,3, +0,1,5,2,5,5,3,4, +4,4,0,1,4,4,2,3, +0,5,1,2,5,5,3,4, +5,0,1,2,5,5,3,4, +0,1,2,3,6,6,4,5, +3,3,3,3,0,3,1,2, +0,4,4,4,1,4,2,3, +4,0,4,4,1,4,2,3, +0,1,5,5,2,5,3,4, +4,4,0,4,1,4,2,3, +0,5,1,5,2,5,3,4, +5,0,1,5,2,5,3,4, +0,1,2,6,3,6,4,5, +4,4,4,0,1,4,2,3, +0,5,5,1,2,5,3,4, +5,0,5,1,2,5,3,4, +0,1,6,2,3,6,4,5, +5,5,0,1,2,5,3,4, +0,6,1,2,3,6,4,5, +6,0,1,2,3,6,4,5, +0,1,2,3,4,7,5,6, +3,3,3,3,3,0,1,2, +0,4,4,4,4,1,2,3, +4,0,4,4,4,1,2,3, +0,1,5,5,5,2,3,4, +4,4,0,4,4,1,2,3, +0,5,1,5,5,2,3,4, +5,0,1,5,5,2,3,4, +0,1,2,6,6,3,4,5, +4,4,4,0,4,1,2,3, +0,5,5,1,5,2,3,4, +5,0,5,1,5,2,3,4, +0,1,6,2,6,3,4,5, +5,5,0,1,5,2,3,4, +0,6,1,2,6,3,4,5, +6,0,1,2,6,3,4,5, +0,1,2,3,7,4,5,6, +4,4,4,4,0,1,2,3, +0,5,5,5,1,2,3,4, +5,0,5,5,1,2,3,4, +0,1,6,6,2,3,4,5, +5,5,0,5,1,2,3,4, +0,6,1,6,2,3,4,5, +6,0,1,6,2,3,4,5, +0,1,2,7,3,4,5,6, +5,5,5,0,1,2,3,4, +0,6,6,1,2,3,4,5, +6,0,6,1,2,3,4,5, +0,1,7,2,3,4,5,6, +6,6,0,1,2,3,4,5, +0,7,1,2,3,4,5,6, +7,0,1,2,3,4,5,6, +0,1,2,3,4,5,6,7 +}; +#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_) +#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(permv[_m_]))) ) +#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(xm, _mm256_loadu_si256((__m256i*)pex)), u2vmask(xm,tv)) + #endif + +//----------------------------------------------------------------------------- +#define VSTO( _op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, ov) +#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) +#include "bitunpack_.h" + +#define BITUNBLK256V32_0(ip, _i_, _op_, _parm_) {__m256i ov;\ + VSTO0(_op_, 0, ov, _parm_);\ + VSTO0(_op_, 1, ov, _parm_);\ + VSTO0(_op_, 2, ov, _parm_);\ + VSTO0(_op_, 3, ov, _parm_);\ + VSTO0(_op_, 4, ov, _parm_);\ + VSTO0(_op_, 5, ov, _parm_);\ + VSTO0(_op_, 6, ov, _parm_);\ + VSTO0(_op_, 7, ov, _parm_);\ + VSTO0(_op_, 8, ov, _parm_);\ + VSTO0(_op_, 9, ov, _parm_);\ + VSTO0(_op_, 10, ov, _parm_);\ + VSTO0(_op_, 11, ov, _parm_);\ + VSTO0(_op_, 12, ov, _parm_);\ + VSTO0(_op_, 13, ov, _parm_);\ + VSTO0(_op_, 14, ov, _parm_);\ + VSTO0(_op_, 15, ov, _parm_);\ + VSTO0(_op_, 16, ov, _parm_);\ + VSTO0(_op_, 17, ov, _parm_);\ + VSTO0(_op_, 18, ov, _parm_);\ + VSTO0(_op_, 19, ov, _parm_);\ + VSTO0(_op_, 20, ov, _parm_);\ + VSTO0(_op_, 21, ov, _parm_);\ + VSTO0(_op_, 22, ov, _parm_);\ + VSTO0(_op_, 23, ov, _parm_);\ + VSTO0(_op_, 24, ov, _parm_);\ + VSTO0(_op_, 25, ov, _parm_);\ + VSTO0(_op_, 26, ov, _parm_);\ + VSTO0(_op_, 27, ov, _parm_);\ + VSTO0(_op_, 28, ov, _parm_);\ + VSTO0(_op_, 29, ov, _parm_);\ + VSTO0(_op_, 30, ov, _parm_);\ + VSTO0(_op_, 31, ov, _parm_);\ +} +#define BITUNPACK0(_parm_) _parm_ = _mm256_setzero_si256() + +unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv; + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + +//--------------------------------------- zeromask unpack for TurboPFor vp4d.c -------------------------------------- +#define VSTO(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) )); pex += popcnt32(xm) +#define VSTO0(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm) +#define BITUNPACK0(_parm_) +#include "bitunpack_.h" + +unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 +//-------------------------------- +#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) +#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG256x32(__ov); SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv) +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) + +unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256(); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef BITUNPACK0 + +//----------------------------------------------------------------------------- +#define VSTO(__op, i, __ov, __sv) SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv) +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) + +unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256(); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + +//----------------------------------------------------------------------------- +#define VEXP(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm) +#define VEXP0(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm) + +#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_); +#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_); + +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) + +unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(256*b); unsigned xm; + __m256i sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + +//----------------------------------------------------------------------------- +#define VSTO(__op, i, __ov, __sv) SCANI256x32(__ov,__sv,cv); _mm256_storeu_si256(__op++, __sv); +#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv) +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8) + +unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 +//----------------------------------------------------------------------------- +#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); +#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); + +#include "bitunpack_.h" + +#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128() + +unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(256*b); unsigned xm; + __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(),tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 +#endif #pragma clang diagnostic pop - - #else -#include -#define DST( __op, __x, __w, __parm) *__op++ = BPI(__w, __x, __parm) //__op[__x] = BPI(__w,__x,__parm) // - -#define USE_BITUNPACK 64 - - #if USE_BITUNPACK == 64 -#include "bitunpack64_.h" -#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n,*_op=__op;\ - switch(__nbits) {\ - case 0: do BITUNPACK64_0( __ip, __op, __parm) while(__op<__ope); break;\ - case 1: do BITUNPACK64_1( __ip, __op, __parm) while(__op<__ope); break;\ - case 2: do BITUNPACK64_2( __ip, __op, __parm) while(__op<__ope); break;\ - case 3: do BITUNPACK64_3( __ip, __op, __parm) while(__op<__ope); break;\ - case 4: do BITUNPACK64_4( __ip, __op, __parm) while(__op<__ope); break;\ - case 5: do BITUNPACK64_5( __ip, __op, __parm) while(__op<__ope); break;\ - case 6: do BITUNPACK64_6( __ip, __op, __parm) while(__op<__ope); break;\ - case 7: do BITUNPACK64_7( __ip, __op, __parm) while(__op<__ope); break;\ - case 8: do BITUNPACK64_8( __ip, __op, __parm) while(__op<__ope); break;\ - case 9: do BITUNPACK64_9( __ip, __op, __parm) while(__op<__ope); break;\ - case 10: do BITUNPACK64_10(__ip, __op, __parm) while(__op<__ope); break;\ - case 11: do BITUNPACK64_11(__ip, __op, __parm) while(__op<__ope); break;\ - case 12: do BITUNPACK64_12(__ip, __op, __parm) while(__op<__ope); break;\ - case 13: do BITUNPACK64_13(__ip, __op, __parm) while(__op<__ope); break;\ - case 14: do BITUNPACK64_14(__ip, __op, __parm) while(__op<__ope); break;\ - case 15: do BITUNPACK64_15(__ip, __op, __parm) while(__op<__ope); break;\ - case 16: do BITUNPACK64_16(__ip, __op, __parm) while(__op<__ope); break;\ - case 17: do BITUNPACK64_17(__ip, __op, __parm) while(__op<__ope); break;\ - case 18: do BITUNPACK64_18(__ip, __op, __parm) while(__op<__ope); break;\ - case 19: do BITUNPACK64_19(__ip, __op, __parm) while(__op<__ope); break;\ - case 20: do BITUNPACK64_20(__ip, __op, __parm) while(__op<__ope); break;\ - case 21: do BITUNPACK64_21(__ip, __op, __parm) while(__op<__ope); break;\ - case 22: do BITUNPACK64_22(__ip, __op, __parm) while(__op<__ope); break;\ - case 23: do BITUNPACK64_23(__ip, __op, __parm) while(__op<__ope); break;\ - case 24: do BITUNPACK64_24(__ip, __op, __parm) while(__op<__ope); break;\ - case 25: do BITUNPACK64_25(__ip, __op, __parm) while(__op<__ope); break;\ - case 26: do BITUNPACK64_26(__ip, __op, __parm) while(__op<__ope); break;\ - case 27: do BITUNPACK64_27(__ip, __op, __parm) while(__op<__ope); break;\ - case 28: do BITUNPACK64_28(__ip, __op, __parm) while(__op<__ope); break;\ - case 29: do BITUNPACK64_29(__ip, __op, __parm) while(__op<__ope); break;\ - case 30: do BITUNPACK64_30(__ip, __op, __parm) while(__op<__ope); break;\ - case 31: do BITUNPACK64_31(__ip, __op, __parm) while(__op<__ope); break;\ - case 32: do BITUNPACK64_32(__ip, __op, __parm) while(__op<__ope); break;\ - }\ -} - -#define BITUNPACK64(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n,*_op=__op;\ - switch(__nbits) {\ - case 0: do BITUNPACK64_0( __ip, __op, __parm) while(__op<__ope); break;\ - case 1: do BITUNPACK64_1( __ip, __op, __parm) while(__op<__ope); break;\ - case 2: do BITUNPACK64_2( __ip, __op, __parm) while(__op<__ope); break;\ - case 3: do BITUNPACK64_3( __ip, __op, __parm) while(__op<__ope); break;\ - case 4: do BITUNPACK64_4( __ip, __op, __parm) while(__op<__ope); break;\ - case 5: do BITUNPACK64_5( __ip, __op, __parm) while(__op<__ope); break;\ - case 6: do BITUNPACK64_6( __ip, __op, __parm) while(__op<__ope); break;\ - case 7: do BITUNPACK64_7( __ip, __op, __parm) while(__op<__ope); break;\ - case 8: do BITUNPACK64_8( __ip, __op, __parm) while(__op<__ope); break;\ - case 9: do BITUNPACK64_9( __ip, __op, __parm) while(__op<__ope); break;\ - case 10: do BITUNPACK64_10(__ip, __op, __parm) while(__op<__ope); break;\ - case 11: do BITUNPACK64_11(__ip, __op, __parm) while(__op<__ope); break;\ - case 12: do BITUNPACK64_12(__ip, __op, __parm) while(__op<__ope); break;\ - case 13: do BITUNPACK64_13(__ip, __op, __parm) while(__op<__ope); break;\ - case 14: do BITUNPACK64_14(__ip, __op, __parm) while(__op<__ope); break;\ - case 15: do BITUNPACK64_15(__ip, __op, __parm) while(__op<__ope); break;\ - case 16: do BITUNPACK64_16(__ip, __op, __parm) while(__op<__ope); break;\ - case 17: do BITUNPACK64_17(__ip, __op, __parm) while(__op<__ope); break;\ - case 18: do BITUNPACK64_18(__ip, __op, __parm) while(__op<__ope); break;\ - case 19: do BITUNPACK64_19(__ip, __op, __parm) while(__op<__ope); break;\ - case 20: do BITUNPACK64_20(__ip, __op, __parm) while(__op<__ope); break;\ - case 21: do BITUNPACK64_21(__ip, __op, __parm) while(__op<__ope); break;\ - case 22: do BITUNPACK64_22(__ip, __op, __parm) while(__op<__ope); break;\ - case 23: do BITUNPACK64_23(__ip, __op, __parm) while(__op<__ope); break;\ - case 24: do BITUNPACK64_24(__ip, __op, __parm) while(__op<__ope); break;\ - case 25: do BITUNPACK64_25(__ip, __op, __parm) while(__op<__ope); break;\ - case 26: do BITUNPACK64_26(__ip, __op, __parm) while(__op<__ope); break;\ - case 27: do BITUNPACK64_27(__ip, __op, __parm) while(__op<__ope); break;\ - case 28: do BITUNPACK64_28(__ip, __op, __parm) while(__op<__ope); break;\ - case 29: do BITUNPACK64_29(__ip, __op, __parm) while(__op<__ope); break;\ - case 30: do BITUNPACK64_30(__ip, __op, __parm) while(__op<__ope); break;\ - case 31: do BITUNPACK64_31(__ip, __op, __parm) while(__op<__ope); break;\ - case 32: do BITUNPACK64_32(__ip, __op, __parm) while(__op<__ope); break;\ - case 33: do BITUNPACK64_33(__ip, __op, __parm) while(__op<__ope); break;\ - case 34: do BITUNPACK64_34(__ip, __op, __parm) while(__op<__ope); break;\ - case 35: do BITUNPACK64_35(__ip, __op, __parm) while(__op<__ope); break;\ - case 36: do BITUNPACK64_36(__ip, __op, __parm) while(__op<__ope); break;\ - case 37: do BITUNPACK64_37(__ip, __op, __parm) while(__op<__ope); break;\ - case 38: do BITUNPACK64_38(__ip, __op, __parm) while(__op<__ope); break;\ - case 39: do BITUNPACK64_39(__ip, __op, __parm) while(__op<__ope); break;\ - case 40: do BITUNPACK64_40(__ip, __op, __parm) while(__op<__ope); break;\ - case 41: do BITUNPACK64_41(__ip, __op, __parm) while(__op<__ope); break;\ - case 42: do BITUNPACK64_42(__ip, __op, __parm) while(__op<__ope); break;\ - case 43: do BITUNPACK64_43(__ip, __op, __parm) while(__op<__ope); break;\ - case 44: do BITUNPACK64_44(__ip, __op, __parm) while(__op<__ope); break;\ - case 45: do BITUNPACK64_45(__ip, __op, __parm) while(__op<__ope); break;\ - case 46: do BITUNPACK64_46(__ip, __op, __parm) while(__op<__ope); break;\ - case 47: do BITUNPACK64_47(__ip, __op, __parm) while(__op<__ope); break;\ - case 48: do BITUNPACK64_48(__ip, __op, __parm) while(__op<__ope); break;\ - case 49: do BITUNPACK64_49(__ip, __op, __parm) while(__op<__ope); break;\ - case 50: do BITUNPACK64_50(__ip, __op, __parm) while(__op<__ope); break;\ - case 51: do BITUNPACK64_51(__ip, __op, __parm) while(__op<__ope); break;\ - case 52: do BITUNPACK64_52(__ip, __op, __parm) while(__op<__ope); break;\ - case 53: do BITUNPACK64_53(__ip, __op, __parm) while(__op<__ope); break;\ - case 54: do BITUNPACK64_54(__ip, __op, __parm) while(__op<__ope); break;\ - case 55: do BITUNPACK64_55(__ip, __op, __parm) while(__op<__ope); break;\ - case 56: do BITUNPACK64_56(__ip, __op, __parm) while(__op<__ope); break;\ - case 57: do BITUNPACK64_57(__ip, __op, __parm) while(__op<__ope); break;\ - case 58: do BITUNPACK64_58(__ip, __op, __parm) while(__op<__ope); break;\ - case 59: do BITUNPACK64_59(__ip, __op, __parm) while(__op<__ope); break;\ - case 60: do BITUNPACK64_60(__ip, __op, __parm) while(__op<__ope); break;\ - case 61: do BITUNPACK64_61(__ip, __op, __parm) while(__op<__ope); break;\ - case 62: do BITUNPACK64_62(__ip, __op, __parm) while(__op<__ope); break;\ - case 63: do BITUNPACK64_63(__ip, __op, __parm) while(__op<__ope); break;\ - case 64: do BITUNPACK64_64(__ip, __op, __parm) while(__op<__ope); break;\ - }\ -} - - #elif USE_BITUNPACK == 32 -#include "bitunpack32_.h" // Not included in the github package -#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;\ - switch(__nbits) {\ - case 0: do BITUNPACK32_0( __ip, __op, __parm) while(__op<__ope); break;\ - case 1: do BITUNPACK32_1( __ip, __op, __parm) while(__op<__ope); break;\ - case 2: do BITUNPACK32_2( __ip, __op, __parm) while(__op<__ope); break;\ - case 3: do BITUNPACK32_3( __ip, __op, __parm) while(__op<__ope); break;\ - case 4: do BITUNPACK32_4( __ip, __op, __parm) while(__op<__ope); break;\ - case 5: do BITUNPACK32_5( __ip, __op, __parm) while(__op<__ope); break;\ - case 6: do BITUNPACK32_6( __ip, __op, __parm) while(__op<__ope); break;\ - case 7: do BITUNPACK32_7( __ip, __op, __parm) while(__op<__ope); break;\ - case 8: do BITUNPACK32_8( __ip, __op, __parm) while(__op<__ope); break;\ - case 9: do BITUNPACK32_9( __ip, __op, __parm) while(__op<__ope); break;\ - case 10: do BITUNPACK32_10(__ip, __op, __parm) while(__op<__ope); break;\ - case 11: do BITUNPACK32_11(__ip, __op, __parm) while(__op<__ope); break;\ - case 12: do BITUNPACK32_12(__ip, __op, __parm) while(__op<__ope); break;\ - case 13: do BITUNPACK32_13(__ip, __op, __parm) while(__op<__ope); break;\ - case 14: do BITUNPACK32_14(__ip, __op, __parm) while(__op<__ope); break;\ - case 15: do BITUNPACK32_15(__ip, __op, __parm) while(__op<__ope); break;\ - case 16: do BITUNPACK32_16(__ip, __op, __parm) while(__op<__ope); break;\ - case 17: do BITUNPACK32_17(__ip, __op, __parm) while(__op<__ope); break;\ - case 18: do BITUNPACK32_18(__ip, __op, __parm) while(__op<__ope); break;\ - case 19: do BITUNPACK32_19(__ip, __op, __parm) while(__op<__ope); break;\ - case 20: do BITUNPACK32_20(__ip, __op, __parm) while(__op<__ope); break;\ - case 21: do BITUNPACK32_21(__ip, __op, __parm) while(__op<__ope); break;\ - case 22: do BITUNPACK32_22(__ip, __op, __parm) while(__op<__ope); break;\ - case 23: do BITUNPACK32_23(__ip, __op, __parm) while(__op<__ope); break;\ - case 24: do BITUNPACK32_24(__ip, __op, __parm) while(__op<__ope); break;\ - case 25: do BITUNPACK32_25(__ip, __op, __parm) while(__op<__ope); break;\ - case 26: do BITUNPACK32_26(__ip, __op, __parm) while(__op<__ope); break;\ - case 27: do BITUNPACK32_27(__ip, __op, __parm) while(__op<__ope); break;\ - case 28: do BITUNPACK32_28(__ip, __op, __parm) while(__op<__ope); break;\ - case 29: do BITUNPACK32_29(__ip, __op, __parm) while(__op<__ope); break;\ - case 30: do BITUNPACK32_30(__ip, __op, __parm) while(__op<__ope); break;\ - case 31: do BITUNPACK32_31(__ip, __op, __parm) while(__op<__ope); break;\ - case 32: do BITUNPACK32_32(__ip, __op, __parm) while(__op<__ope); break;\ - }\ -} - #endif - #endif - +#pragma GCC pop_options