diff --git a/bitpack256v.c b/bitpack256v.c deleted file mode 100644 index 75077ab..0000000 --- a/bitpack256v.c +++ /dev/null @@ -1,119 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Integer Compression" SIMD bit packing - #ifndef VSTI -#include -#include "bitpack.h" -#include "bitutil.h" - -#define PAD8(__x) (((__x)+8-1)/8) - -#define VSTI(ip, i, iv, parm) -#define IPP(ip, i, iv) _mm256_loadu_si256(ip++) -#include "bitpack256v.c" - -unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; } -#undef VSTI -#undef IPP - -//------------------------------------------------------------------------------------------------------------------------------ -#if 0 -#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); DELTA256x32(v,__sv, __iv) //__sv = v -#define IPP(ip, i, __iv) __iv -#include "bitpack256v.c" - -unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b); - __m256i v; //,sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(); - __m128i sv = _mm_set1_epi32(start); - BITPACK256V32(in, b, out, sv); - return pout; -} -#undef VSTI - -//------------------------------------------------------------------------------------------------------------------------------ -#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); __iv = _mm256_sub_epi32(DELTA256x32(v,__sv),cv); __sv = v - -unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b); - __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1); - //BITPACK256V32(in, b, out, sv); return pout; -} -#undef VSTI -//------------------------------------------------------------------------------------------------------------------------------ -#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); __iv = DELTA256x32(v,__sv); __sv = v; __iv = ZIGZAG256x32(__iv) - -unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b); - __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1); - //BITPACK256V32(in, b, out, sv); - return pout; -} -#endif - -#undef VSTI - #else -#include -#include - -#define OPPE(__op) -#define IPPE(__op) - -#include "bitpack256v_.h" - -#define BITPACK256V32(__pip, __nbits, __pop, __parm) { __m256i *__ip=(__m256i *)__pip,*__op=(__m256i *)__pop;\ - 
switch(__nbits) {\ - case 0: break;\ - case 1:{ BITPACK256V32_1( __ip, __op, __parm); } break;\ - case 2:{ BITPACK256V32_2( __ip, __op, __parm); } break;\ - case 3:{ BITPACK256V32_3( __ip, __op, __parm); } break;\ - case 4:{ BITPACK256V32_4( __ip, __op, __parm); } break;\ - case 5:{ BITPACK256V32_5( __ip, __op, __parm); } break;\ - case 6:{ BITPACK256V32_6( __ip, __op, __parm); } break;\ - case 7:{ BITPACK256V32_7( __ip, __op, __parm); } break;\ - case 8:{ BITPACK256V32_8( __ip, __op, __parm); } break;\ - case 9:{ BITPACK256V32_9( __ip, __op, __parm); } break;\ - case 10:{ BITPACK256V32_10(__ip, __op, __parm); } break;\ - case 11:{ BITPACK256V32_11(__ip, __op, __parm); } break;\ - case 12:{ BITPACK256V32_12(__ip, __op, __parm); } break;\ - case 13:{ BITPACK256V32_13(__ip, __op, __parm); } break;\ - case 14:{ BITPACK256V32_14(__ip, __op, __parm); } break;\ - case 15:{ BITPACK256V32_15(__ip, __op, __parm); } break;\ - case 16:{ BITPACK256V32_16(__ip, __op, __parm); } break;\ - case 17:{ BITPACK256V32_17(__ip, __op, __parm); } break;\ - case 18:{ BITPACK256V32_18(__ip, __op, __parm); } break;\ - case 19:{ BITPACK256V32_19(__ip, __op, __parm); } break;\ - case 20:{ BITPACK256V32_20(__ip, __op, __parm); } break;\ - case 21:{ BITPACK256V32_21(__ip, __op, __parm); } break;\ - case 22:{ BITPACK256V32_22(__ip, __op, __parm); } break;\ - case 23:{ BITPACK256V32_23(__ip, __op, __parm); } break;\ - case 24:{ BITPACK256V32_24(__ip, __op, __parm); } break;\ - case 25:{ BITPACK256V32_25(__ip, __op, __parm); } break;\ - case 26:{ BITPACK256V32_26(__ip, __op, __parm); } break;\ - case 27:{ BITPACK256V32_27(__ip, __op, __parm); } break;\ - case 28:{ BITPACK256V32_28(__ip, __op, __parm); } break;\ - case 29:{ BITPACK256V32_29(__ip, __op, __parm); } break;\ - case 30:{ BITPACK256V32_30(__ip, __op, __parm); } break;\ - case 31:{ BITPACK256V32_31(__ip, __op, __parm); } break;\ - case 32:{ BITPACK256V32_32(__ip, __op, __parm); } break;\ - }\ -} - #endif diff --git a/bitpack128v.c b/bitpackv.c similarity index 100% rename from bitpack128v.c rename to bitpackv.c diff --git a/bitunpack.h b/bitunpack.h deleted file mode 100644 index af03f9d..0000000 --- a/bitunpack.h +++ /dev/null @@ -1,108 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Integer Compression" Bit Packing - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include "conf.h" - -// ---------------- Unpack a b-bits packed integer array -------------------------------------------------------------------------------------- -// unpack a bitpacked integer array. 
Return value = end of packed buffer in -unsigned char *bitunpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b); -unsigned char *bitunpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b); -unsigned char *bitunpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b); - -// ---------------- Direct Access to a single packed integer array entry -------------------------------------------------------------------- -// Get a single 32 bits value with index "idx" (or bit index b*idx) from packed integer array -static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi64( ctou64((unsigned *)in+(bidx>>5)) >> (bidx&0x1f), b ); } -static ALWAYS_INLINE unsigned _bitgetx32(const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi64( ctou64((unsigned *)in+(bidx>>5)) >> (bidx&0x1f), b ); } - -// like bitgetx32 but for 16 bits integer array -static ALWAYS_INLINE unsigned bitgetx16(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou32((unsigned *)in+(bidx>>4)) >> (bidx& 0xf), b ); } -static ALWAYS_INLINE unsigned _bitgetx16(const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou32((unsigned *)in+(bidx>>4)) >> (bidx& 0xf), b ); } - -// Set a single value with index "idx" -static ALWAYS_INLINE void bitsetx16(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) { unsigned bidx = b*idx; unsigned *p = (unsigned *) in+(bidx>>4) ; *p = ( *p & ~(((1u <>5)); *p = ( *p & ~(((1ull< - -#include -#include -#include "conf.h" -#include "bitutil.h" -#include "bitunpack.h" - - #ifdef __AVX512F__ -#include -#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_) -#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_) - #else -#include -static unsigned char shuffles[256][8] __attribute__((aligned(32))) = { -0,0,0,0,0,0,0,0, -0,1,1,1,1,1,1,1, -1,0,1,1,1,1,1,1, -0,1,2,2,2,2,2,2, -1,1,0,1,1,1,1,1, -0,2,1,2,2,2,2,2, -2,0,1,2,2,2,2,2, -0,1,2,3,3,3,3,3, -1,1,1,0,1,1,1,1, -0,2,2,1,2,2,2,2, -2,0,2,1,2,2,2,2, -0,1,3,2,3,3,3,3, -2,2,0,1,2,2,2,2, -0,3,1,2,3,3,3,3, -3,0,1,2,3,3,3,3, -0,1,2,3,4,4,4,4, -1,1,1,1,0,1,1,1, -0,2,2,2,1,2,2,2, -2,0,2,2,1,2,2,2, -0,1,3,3,2,3,3,3, -2,2,0,2,1,2,2,2, -0,3,1,3,2,3,3,3, -3,0,1,3,2,3,3,3, -0,1,2,4,3,4,4,4, -2,2,2,0,1,2,2,2, -0,3,3,1,2,3,3,3, -3,0,3,1,2,3,3,3, -0,1,4,2,3,4,4,4, -3,3,0,1,2,3,3,3, -0,4,1,2,3,4,4,4, -4,0,1,2,3,4,4,4, -0,1,2,3,4,5,5,5, -1,1,1,1,1,0,1,1, -0,2,2,2,2,1,2,2, -2,0,2,2,2,1,2,2, -0,1,3,3,3,2,3,3, -2,2,0,2,2,1,2,2, -0,3,1,3,3,2,3,3, -3,0,1,3,3,2,3,3, -0,1,2,4,4,3,4,4, -2,2,2,0,2,1,2,2, -0,3,3,1,3,2,3,3, -3,0,3,1,3,2,3,3, -0,1,4,2,4,3,4,4, -3,3,0,1,3,2,3,3, -0,4,1,2,4,3,4,4, -4,0,1,2,4,3,4,4, -0,1,2,3,5,4,5,5, -2,2,2,2,0,1,2,2, -0,3,3,3,1,2,3,3, -3,0,3,3,1,2,3,3, -0,1,4,4,2,3,4,4, -3,3,0,3,1,2,3,3, -0,4,1,4,2,3,4,4, -4,0,1,4,2,3,4,4, -0,1,2,5,3,4,5,5, -3,3,3,0,1,2,3,3, -0,4,4,1,2,3,4,4, -4,0,4,1,2,3,4,4, -0,1,5,2,3,4,5,5, -4,4,0,1,2,3,4,4, -0,5,1,2,3,4,5,5, -5,0,1,2,3,4,5,5, -0,1,2,3,4,5,6,6, -1,1,1,1,1,1,0,1, -0,2,2,2,2,2,1,2, -2,0,2,2,2,2,1,2, -0,1,3,3,3,3,2,3, -2,2,0,2,2,2,1,2, -0,3,1,3,3,3,2,3, -3,0,1,3,3,3,2,3, -0,1,2,4,4,4,3,4, -2,2,2,0,2,2,1,2, -0,3,3,1,3,3,2,3, -3,0,3,1,3,3,2,3, -0,1,4,2,4,4,3,4, -3,3,0,1,3,3,2,3, -0,4,1,2,4,4,3,4, -4,0,1,2,4,4,3,4, -0,1,2,3,5,5,4,5, -2,2,2,2,0,2,1,2, -0,3,3,3,1,3,2,3, 
-3,0,3,3,1,3,2,3, -0,1,4,4,2,4,3,4, -3,3,0,3,1,3,2,3, -0,4,1,4,2,4,3,4, -4,0,1,4,2,4,3,4, -0,1,2,5,3,5,4,5, -3,3,3,0,1,3,2,3, -0,4,4,1,2,4,3,4, -4,0,4,1,2,4,3,4, -0,1,5,2,3,5,4,5, -4,4,0,1,2,4,3,4, -0,5,1,2,3,5,4,5, -5,0,1,2,3,5,4,5, -0,1,2,3,4,6,5,6, -2,2,2,2,2,0,1,2, -0,3,3,3,3,1,2,3, -3,0,3,3,3,1,2,3, -0,1,4,4,4,2,3,4, -3,3,0,3,3,1,2,3, -0,4,1,4,4,2,3,4, -4,0,1,4,4,2,3,4, -0,1,2,5,5,3,4,5, -3,3,3,0,3,1,2,3, -0,4,4,1,4,2,3,4, -4,0,4,1,4,2,3,4, -0,1,5,2,5,3,4,5, -4,4,0,1,4,2,3,4, -0,5,1,2,5,3,4,5, -5,0,1,2,5,3,4,5, -0,1,2,3,6,4,5,6, -3,3,3,3,0,1,2,3, -0,4,4,4,1,2,3,4, -4,0,4,4,1,2,3,4, -0,1,5,5,2,3,4,5, -4,4,0,4,1,2,3,4, -0,5,1,5,2,3,4,5, -5,0,1,5,2,3,4,5, -0,1,2,6,3,4,5,6, -4,4,4,0,1,2,3,4, -0,5,5,1,2,3,4,5, -5,0,5,1,2,3,4,5, -0,1,6,2,3,4,5,6, -5,5,0,1,2,3,4,5, -0,6,1,2,3,4,5,6, -6,0,1,2,3,4,5,6, -0,1,2,3,4,5,6,7, -1,1,1,1,1,1,1,0, -0,2,2,2,2,2,2,1, -2,0,2,2,2,2,2,1, -0,1,3,3,3,3,3,2, -2,2,0,2,2,2,2,1, -0,3,1,3,3,3,3,2, -3,0,1,3,3,3,3,2, -0,1,2,4,4,4,4,3, -2,2,2,0,2,2,2,1, -0,3,3,1,3,3,3,2, -3,0,3,1,3,3,3,2, -0,1,4,2,4,4,4,3, -3,3,0,1,3,3,3,2, -0,4,1,2,4,4,4,3, -4,0,1,2,4,4,4,3, -0,1,2,3,5,5,5,4, -2,2,2,2,0,2,2,1, -0,3,3,3,1,3,3,2, -3,0,3,3,1,3,3,2, -0,1,4,4,2,4,4,3, -3,3,0,3,1,3,3,2, -0,4,1,4,2,4,4,3, -4,0,1,4,2,4,4,3, -0,1,2,5,3,5,5,4, -3,3,3,0,1,3,3,2, -0,4,4,1,2,4,4,3, -4,0,4,1,2,4,4,3, -0,1,5,2,3,5,5,4, -4,4,0,1,2,4,4,3, -0,5,1,2,3,5,5,4, -5,0,1,2,3,5,5,4, -0,1,2,3,4,6,6,5, -2,2,2,2,2,0,2,1, -0,3,3,3,3,1,3,2, -3,0,3,3,3,1,3,2, -0,1,4,4,4,2,4,3, -3,3,0,3,3,1,3,2, -0,4,1,4,4,2,4,3, -4,0,1,4,4,2,4,3, -0,1,2,5,5,3,5,4, -3,3,3,0,3,1,3,2, -0,4,4,1,4,2,4,3, -4,0,4,1,4,2,4,3, -0,1,5,2,5,3,5,4, -4,4,0,1,4,2,4,3, -0,5,1,2,5,3,5,4, -5,0,1,2,5,3,5,4, -0,1,2,3,6,4,6,5, -3,3,3,3,0,1,3,2, -0,4,4,4,1,2,4,3, -4,0,4,4,1,2,4,3, -0,1,5,5,2,3,5,4, -4,4,0,4,1,2,4,3, -0,5,1,5,2,3,5,4, -5,0,1,5,2,3,5,4, -0,1,2,6,3,4,6,5, -4,4,4,0,1,2,4,3, -0,5,5,1,2,3,5,4, -5,0,5,1,2,3,5,4, -0,1,6,2,3,4,6,5, -5,5,0,1,2,3,5,4, -0,6,1,2,3,4,6,5, -6,0,1,2,3,4,6,5, -0,1,2,3,4,5,7,6, -2,2,2,2,2,2,0,1, -0,3,3,3,3,3,1,2, -3,0,3,3,3,3,1,2, -0,1,4,4,4,4,2,3, -3,3,0,3,3,3,1,2, -0,4,1,4,4,4,2,3, -4,0,1,4,4,4,2,3, -0,1,2,5,5,5,3,4, -3,3,3,0,3,3,1,2, -0,4,4,1,4,4,2,3, -4,0,4,1,4,4,2,3, -0,1,5,2,5,5,3,4, -4,4,0,1,4,4,2,3, -0,5,1,2,5,5,3,4, -5,0,1,2,5,5,3,4, -0,1,2,3,6,6,4,5, -3,3,3,3,0,3,1,2, -0,4,4,4,1,4,2,3, -4,0,4,4,1,4,2,3, -0,1,5,5,2,5,3,4, -4,4,0,4,1,4,2,3, -0,5,1,5,2,5,3,4, -5,0,1,5,2,5,3,4, -0,1,2,6,3,6,4,5, -4,4,4,0,1,4,2,3, -0,5,5,1,2,5,3,4, -5,0,5,1,2,5,3,4, -0,1,6,2,3,6,4,5, -5,5,0,1,2,5,3,4, -0,6,1,2,3,6,4,5, -6,0,1,2,3,6,4,5, -0,1,2,3,4,7,5,6, -3,3,3,3,3,0,1,2, -0,4,4,4,4,1,2,3, -4,0,4,4,4,1,2,3, -0,1,5,5,5,2,3,4, -4,4,0,4,4,1,2,3, -0,5,1,5,5,2,3,4, -5,0,1,5,5,2,3,4, -0,1,2,6,6,3,4,5, -4,4,4,0,4,1,2,3, -0,5,5,1,5,2,3,4, -5,0,5,1,5,2,3,4, -0,1,6,2,6,3,4,5, -5,5,0,1,5,2,3,4, -0,6,1,2,6,3,4,5, -6,0,1,2,6,3,4,5, -0,1,2,3,7,4,5,6, -4,4,4,4,0,1,2,3, -0,5,5,5,1,2,3,4, -5,0,5,5,1,2,3,4, -0,1,6,6,2,3,4,5, -5,5,0,5,1,2,3,4, -0,6,1,6,2,3,4,5, -6,0,1,6,2,3,4,5, -0,1,2,7,3,4,5,6, -5,5,5,0,1,2,3,4, -0,6,6,1,2,3,4,5, -6,0,6,1,2,3,4,5, -0,1,7,2,3,4,5,6, -6,6,0,1,2,3,4,5, -0,7,1,2,3,4,5,6, -7,0,1,2,3,4,5,6, -0,1,2,3,4,5,6,7 -}; -#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_) -#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(shuffles[_m_]))) ) -#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(xm, _mm256_loadu_si256((__m256i*)pex)), u2vmask(xm,tv)) - #endif - -#define PAD8(__x) (((__x)+7)/8) - 
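Note: the AVX2 path above emulates AVX-512's _mm256_maskz_expand_epi32 with _mm256_permutevar8x32_epi32 driven by the 256x8 shuffles table, then zeroes the unselected lanes through the u2vmask blend. As a sketch only (the generator below is mine, not part of this patch), the table can be rebuilt programmatically: a set mask bit i gets the rank of that bit among the set bits, and cleared lanes get popcount(mask) as a don't-care filler.

#include <stdint.h>

/* Hypothetical generator for the shuffles[256][8] table above.
   Lane i of entry m indexes the packed source lane when bit i of m is set
   (its rank among the set bits); cleared lanes get popcount(m), whose value
   does not matter because mm256_maskz_loadu_epi32 blends them with zero.
   __builtin_popcount is the GCC/Clang builtin, consistent with the
   __attribute__ usage in the surrounding code. */
static uint8_t shuffles_gen[256][8];

static void gen_shuffles(void) {
  for (int m = 0; m < 256; m++) {
    int k = __builtin_popcount(m), r = 0;   /* k = number of selected lanes */
    for (int i = 0; i < 8; i++)
      shuffles_gen[m][i] = (uint8_t)((m >> i & 1) ? r++ : k);
  }
}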
-//----------------------------------------------------------------------------- -#define VSTO( _op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, ov) -#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) -#include "bitunpack256v.c" - -#define BITUNBLK256V32_0(ip, _i_, _op_, _parm_) {__m256i ov;\ - VSTO0(_op_, 0, ov, _parm_);\ - VSTO0(_op_, 1, ov, _parm_);\ - VSTO0(_op_, 2, ov, _parm_);\ - VSTO0(_op_, 3, ov, _parm_);\ - VSTO0(_op_, 4, ov, _parm_);\ - VSTO0(_op_, 5, ov, _parm_);\ - VSTO0(_op_, 6, ov, _parm_);\ - VSTO0(_op_, 7, ov, _parm_);\ - VSTO0(_op_, 8, ov, _parm_);\ - VSTO0(_op_, 9, ov, _parm_);\ - VSTO0(_op_, 10, ov, _parm_);\ - VSTO0(_op_, 11, ov, _parm_);\ - VSTO0(_op_, 12, ov, _parm_);\ - VSTO0(_op_, 13, ov, _parm_);\ - VSTO0(_op_, 14, ov, _parm_);\ - VSTO0(_op_, 15, ov, _parm_);\ - VSTO0(_op_, 16, ov, _parm_);\ - VSTO0(_op_, 17, ov, _parm_);\ - VSTO0(_op_, 18, ov, _parm_);\ - VSTO0(_op_, 19, ov, _parm_);\ - VSTO0(_op_, 20, ov, _parm_);\ - VSTO0(_op_, 21, ov, _parm_);\ - VSTO0(_op_, 22, ov, _parm_);\ - VSTO0(_op_, 23, ov, _parm_);\ - VSTO0(_op_, 24, ov, _parm_);\ - VSTO0(_op_, 25, ov, _parm_);\ - VSTO0(_op_, 26, ov, _parm_);\ - VSTO0(_op_, 27, ov, _parm_);\ - VSTO0(_op_, 28, ov, _parm_);\ - VSTO0(_op_, 29, ov, _parm_);\ - VSTO0(_op_, 30, ov, _parm_);\ - VSTO0(_op_, 31, ov, _parm_);\ -} -#define BITUNPACK0(_parm_) _parm_ = _mm256_setzero_si256() - -unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) { - const unsigned char *ip = in+PAD8(256*b); - __m256i sv; - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - -//--------------------------------------- zeromask unpack for TurboPFor vp4d.c -------------------------------------- -#define VSTO(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) )); pex += popcnt32(xm) -#define VSTO0(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm) -#define BITUNPACK0(_parm_) -#include "bitunpack256v.c" - -unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 -//-------------------------------- -#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) -#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG256x32(__ov); SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv) -#include "bitunpack256v.c" - -#define BITUNPACK0(_parm_) - -unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(256*b); - __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256(); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef BITUNPACK0 - -//----------------------------------------------------------------------------- -#define VSTO(__op, i, __ov, __sv) SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv) -#include "bitunpack256v.c" - -#define BITUNPACK0(_parm_) - -unsigned char *bitdunpack256v32( const unsigned char 
*__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(256*b); - __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256(); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - -//----------------------------------------------------------------------------- -#define VEXP(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm) -#define VEXP0(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm) - -#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_); -#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_); - -#include "bitunpack256v.c" - -#define BITUNPACK0(_parm_) - -unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(256*b); unsigned xm; - __m256i sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - -//----------------------------------------------------------------------------- -#define VSTO(__op, i, __ov, __sv) SCANI256x32(__ov,__sv,cv); _mm256_storeu_si256(__op++, __sv); -#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv) -#include "bitunpack256v.c" - -#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8) - -unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(256*b); - __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 -//----------------------------------------------------------------------------- -#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); -#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); - -#include "bitunpack256v.c" - -#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128() - -unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(256*b); unsigned xm; - __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(),tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - - #else -#include "bitunpack256v_.h" - -#define BITUNPACK256V32(__ip, __nbits, __op, _parm_) { __m256i mv,*_ov=(__m256i *)__op,*_iv=(__m256i *)__ip; \ - switch(__nbits&0x3f) {\ - case 0: BITUNPACK0(_parm_); BITUNPACK256V32_0( _iv, _ov, _parm_); break;\ - case 1: mv = _mm256_set1_epi32((1u<< 1)-1); BITUNPACK256V32_1( _iv, _ov, _parm_); break;\ - case 2: mv = 
_mm256_set1_epi32((1u<< 2)-1); BITUNPACK256V32_2( _iv, _ov, _parm_); break;\ - case 3: mv = _mm256_set1_epi32((1u<< 3)-1); BITUNPACK256V32_3( _iv, _ov, _parm_); break;\ - case 4: mv = _mm256_set1_epi32((1u<< 4)-1); BITUNPACK256V32_4( _iv, _ov, _parm_); break;\ - case 5: mv = _mm256_set1_epi32((1u<< 5)-1); BITUNPACK256V32_5( _iv, _ov, _parm_); break;\ - case 6: mv = _mm256_set1_epi32((1u<< 6)-1); BITUNPACK256V32_6( _iv, _ov, _parm_); break;\ - case 7: mv = _mm256_set1_epi32((1u<< 7)-1); BITUNPACK256V32_7( _iv, _ov, _parm_); break;\ - case 8: mv = _mm256_set1_epi32((1u<< 8)-1); BITUNPACK256V32_8( _iv, _ov, _parm_); break;\ - case 9: mv = _mm256_set1_epi32((1u<< 9)-1); BITUNPACK256V32_9( _iv, _ov, _parm_); break;\ - case 10: mv = _mm256_set1_epi32((1u<<10)-1); BITUNPACK256V32_10(_iv, _ov, _parm_); break;\ - case 11: mv = _mm256_set1_epi32((1u<<11)-1); BITUNPACK256V32_11(_iv, _ov, _parm_); break;\ - case 12: mv = _mm256_set1_epi32((1u<<12)-1); BITUNPACK256V32_12(_iv, _ov, _parm_); break;\ - case 13: mv = _mm256_set1_epi32((1u<<13)-1); BITUNPACK256V32_13(_iv, _ov, _parm_); break;\ - case 14: mv = _mm256_set1_epi32((1u<<14)-1); BITUNPACK256V32_14(_iv, _ov, _parm_); break;\ - case 15: mv = _mm256_set1_epi32((1u<<15)-1); BITUNPACK256V32_15(_iv, _ov, _parm_); break;\ - case 16: mv = _mm256_set1_epi32((1u<<16)-1); BITUNPACK256V32_16(_iv, _ov, _parm_); break;\ - case 17: mv = _mm256_set1_epi32((1u<<17)-1); BITUNPACK256V32_17(_iv, _ov, _parm_); break;\ - case 18: mv = _mm256_set1_epi32((1u<<18)-1); BITUNPACK256V32_18(_iv, _ov, _parm_); break;\ - case 19: mv = _mm256_set1_epi32((1u<<19)-1); BITUNPACK256V32_19(_iv, _ov, _parm_); break;\ - case 20: mv = _mm256_set1_epi32((1u<<20)-1); BITUNPACK256V32_20(_iv, _ov, _parm_); break;\ - case 21: mv = _mm256_set1_epi32((1u<<21)-1); BITUNPACK256V32_21(_iv, _ov, _parm_); break;\ - case 22: mv = _mm256_set1_epi32((1u<<22)-1); BITUNPACK256V32_22(_iv, _ov, _parm_); break;\ - case 23: mv = _mm256_set1_epi32((1u<<23)-1); BITUNPACK256V32_23(_iv, _ov, _parm_); break;\ - case 24: mv = _mm256_set1_epi32((1u<<24)-1); BITUNPACK256V32_24(_iv, _ov, _parm_); break;\ - case 25: mv = _mm256_set1_epi32((1u<<25)-1); BITUNPACK256V32_25(_iv, _ov, _parm_); break;\ - case 26: mv = _mm256_set1_epi32((1u<<26)-1); BITUNPACK256V32_26(_iv, _ov, _parm_); break;\ - case 27: mv = _mm256_set1_epi32((1u<<27)-1); BITUNPACK256V32_27(_iv, _ov, _parm_); break;\ - case 28: mv = _mm256_set1_epi32((1u<<28)-1); BITUNPACK256V32_28(_iv, _ov, _parm_); break;\ - case 29: mv = _mm256_set1_epi32((1u<<29)-1); BITUNPACK256V32_29(_iv, _ov, _parm_); break;\ - case 30: mv = _mm256_set1_epi32((1u<<30)-1); BITUNPACK256V32_30(_iv, _ov, _parm_); break;\ - case 31: mv = _mm256_set1_epi32((1u<<31)-1); BITUNPACK256V32_31(_iv, _ov, _parm_); break;\ - case 32: mv = _mm256_set1_epi32((1ull<<32)-1);BITUNPACK256V32_32(_iv, _ov, _parm_); break;\ - case 33 ... 
63: break;\ - }\ -} - #endif - - diff --git a/bitunpack128v.c b/bitunpackv.c similarity index 100% rename from bitunpack128v.c rename to bitunpackv.c diff --git a/vp4d.h b/vp4.h similarity index 100% rename from vp4d.h rename to vp4.h diff --git a/vp4c.c b/vp4c.c index c8bc65d..c9c1430 100644 --- a/vp4c.c +++ b/vp4c.c @@ -29,7 +29,7 @@ #include "bitpack.h" #include "vint.h" #include "bitutil.h" -#include "vp4c.h" +#include "vp4.h" #undef P4DELTA #define PAD8(_x_) ( (((_x_)+8-1)/8) ) diff --git a/vp4c.h b/vp4c.h deleted file mode 100644 index 7731f15..0000000 --- a/vp4c.h +++ /dev/null @@ -1,98 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Integer Compression" TurboPfor (see vp4d.h for decompression) - -#ifdef __cplusplus -extern "C" { -#endif - -// Low level API: Single block n limited -//compress integer array with n values to the buffer out. Return value = end of compressed buffer out -unsigned char *p4enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4enc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); // SIMD (Vertical bitpacking) -unsigned char *p4enc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); // SIMD (Vertical bitpacking) -unsigned char *p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); - -unsigned char *p4encx16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out);// Direct access -unsigned char *p4encx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); - -unsigned char *p4denc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *p4denc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *p4denc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); // SIMD (Vertical bitpacking) -unsigned char *p4denc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *p4denc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *p4dencx16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);// Direct access -unsigned char *p4dencx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); - -unsigned char *p4d1enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned 
short start); -unsigned char *p4d1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *p4d1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); // SIMD (Vertical bitpacking) -unsigned char *p4d1enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *p4d1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *p4d1encx16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);// Direct access -unsigned char *p4d1encx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); - -// same as p4enc, but with b and bx as parameters. Call after _p4bitsXX -ALWAYS_INLINE unsigned char *_p4enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -ALWAYS_INLINE unsigned char *_p4enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -ALWAYS_INLINE unsigned char *_p4enc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking) -ALWAYS_INLINE unsigned char *_p4enc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -ALWAYS_INLINE unsigned char *_p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -// calculate the best bit sizes b and bx, return b. -ALWAYS_INLINE unsigned _p4bits16( unsigned short *__restrict in, unsigned n, unsigned *pbx); -ALWAYS_INLINE unsigned _p4bits32( unsigned *__restrict in, unsigned n, unsigned *pbx); -ALWAYS_INLINE unsigned _p4bits64( uint64_t *__restrict in, unsigned n, unsigned *pbx); - -//************************** n unlimited ************************************************************************************ -// compress integer array with n values to the buffer out. 
Return value = end of compressed buffer out -unsigned char *p4nenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4nenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4nenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); // SIMD (Vertical bitpacking) -unsigned char *p4nenc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4nenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); - -unsigned char *p4ndenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *p4ndenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *p4ndenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); // SIMD (Vertical bitpacking) -unsigned char *p4ndenc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *p4ndenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *p4nd1enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *p4nd1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *p4nd1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); // SIMD (Vertical bitpacking) -unsigned char *p4nd1enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *p4nd1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -#ifdef __cplusplus -} -#endif - -#define P4D_MAX 256 - -#define P4EB(_b_) (_b_ << 1) -#define P4EBX(_b_, _bx_) (_bx_ << 8 | _b_ << 1 | 1) -#define P4SAVE(_out_, _b_, _bx_) do { if(!_bx_) *_out_++ = P4EB(_b_);else *(unsigned short *)_out_ = P4EBX(_b_, _bx_), _out_ += 2; } while(0) -
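Note: the P4EB/P4EBX/P4SAVE macros at the end of the removed vp4c.h encode the per-block descriptor: bit 0 of the first byte flags whether a second (exception) bit width bx follows, bits 1..7 carry the main bit width b, and the optional second byte carries bx. A minimal sketch of both directions under that reading — the names p4hdr_write/p4hdr_read are mine, and the decode side is inferred from the encode macros rather than taken from the diff:

/* Sketch of the 1- or 2-byte block descriptor produced by P4SAVE:
   byte 0 = b << 1 | (bx present?), optional byte 1 = bx. */
static inline unsigned char *p4hdr_write(unsigned char *out, unsigned b, unsigned bx) {
  if (!bx) *out++ = (unsigned char)(b << 1);                                  /* P4EB(b)      */
  else   { out[0] = (unsigned char)(b << 1 | 1); out[1] = (unsigned char)bx;  /* P4EBX(b, bx) */
           out += 2; }
  return out;
}

static inline const unsigned char *p4hdr_read(const unsigned char *in, unsigned *b, unsigned *bx) {
  unsigned c = *in++;
  *b  = c >> 1;               /* main bit width                         */
  *bx = (c & 1) ? *in++ : 0;  /* exception bit width, only when flagged */
  return in;
}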
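Note: the removed bitunpack.h earlier in this patch also declares direct access into a packed block: bitgetx32 locates the field at bit offset b*idx, does one unaligned 64-bit read starting at the containing 32-bit word, shifts, and masks with bzhi64. A portable scalar sketch of the same scheme, assuming 0 < b <= 32 and little-endian packing; memcpy stands in for ctou64 and an explicit mask for the BMI2 bzhi, and like the original it reads a full 8-byte window, so the packed buffer needs a few bytes of slack at the end:

#include <stdint.h>
#include <string.h>

/* Portable sketch of bitgetx32(): fetch the idx-th b-bit value from a
   bit-packed buffer without unpacking the whole block. */
static inline uint32_t bitgetx32_ref(const unsigned char *in, unsigned idx, unsigned b) {
  uint64_t w;
  unsigned bidx = b * idx;                          /* absolute bit offset of the field      */
  memcpy(&w, in + (size_t)(bidx >> 5) * 4, sizeof w); /* 64-bit window at the containing word */
  w >>= (bidx & 31);                                /* drop the bits below the field         */
  return (uint32_t)(w & ((1ull << b) - 1));         /* keep the low b bits (b <= 32)         */
}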
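Note: the bitd/bitd1/bitz pack and unpack variants in the removed 256-bit files wrap the same packers around transforms from bitutil.h: plain deltas (DELTA256x32/SCAN256x32), deltas minus one for strictly increasing input (the cv = _mm256_set1_epi32(1) subtraction paired with SCANI256x32), and zigzag-mapped deltas (ZIGZAG256x32/UNZIGZAG256x32). Scalar sketches of those transforms, for orientation only — the function names below are mine, not the 256-lane SIMD macros:

#include <stdint.h>

/* zigzag: fold signed deltas to small unsigned codes (0,-1,1,-2,2 -> 0,1,2,3,4) */
static inline uint32_t zigzag32(int32_t x)    { return ((uint32_t)x << 1) ^ (uint32_t)(x >> 31); }
static inline int32_t  unzigzag32(uint32_t x) { return (int32_t)((x >> 1) ^ (0u - (x & 1))); }

/* delta transform applied before packing: d[i] = in[i] - previous - dec
   (dec = 0 for the bitdpack variants, dec = 1 for bitd1pack, which assumes
   a strictly increasing sequence whose first value exceeds `start`) */
static inline void delta32(const uint32_t *in, uint32_t *d, unsigned n, uint32_t start, uint32_t dec) {
  for (unsigned i = 0; i < n; i++) { d[i] = in[i] - start - dec; start = in[i]; }
}

/* inverse prefix sum applied after unpacking (what SCAN256x32 / SCANI256x32 reconstruct) */
static inline void scan32(const uint32_t *d, uint32_t *out, unsigned n, uint32_t start, uint32_t inc) {
  for (unsigned i = 0; i < n; i++) { start += d[i] + inc; out[i] = start; }
}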