TurboPFor: TurboPFor encode
This commit is contained in:
119
bitpack256v.c
119
bitpack256v.c
@ -1,119 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2017
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" SIMD bit packing
|
||||
#ifndef VSTI
|
||||
#include <immintrin.h>
|
||||
#include "bitpack.h"
|
||||
#include "bitutil.h"
|
||||
|
||||
// Number of whole bytes needed to hold nbits_ bits (nbits_ / 8, rounded up).
#define PAD8(nbits_) (((nbits_) + 7) / 8)
|
||||
|
||||
#define VSTI(ip, i, iv, parm)
|
||||
#define IPP(ip, i, iv) _mm256_loadu_si256(ip++)
|
||||
#include "bitpack256v.c"
|
||||
|
||||
// Plain (no delta / zigzag) AVX2 bit packing of one block from `in` into `out`,
// using `b` bits per value. In this variant VSTI expands to nothing and IPP loads
// the next 256-bit vector from the input; the per-width work is done by the
// BITPACK256V32 switch macro.
// `n` is not referenced by this function body.
// Returns out + PAD8(256*b), i.e. the address 256*b bits past `out`.
unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) {
  unsigned char *const out_end = out + PAD8(256*b);   // end of the packed block, fixed before packing starts

  BITPACK256V32(in, b, out, 0);
  return out_end;
}
|
||||
#undef VSTI
|
||||
#undef IPP
|
||||
|
||||
//------------------------------------------------------------------------------------------------------------------------------
|
||||
#if 0
|
||||
#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); DELTA256x32(v,__sv, __iv) //__sv = v
|
||||
#define IPP(ip, i, __iv) __iv
|
||||
#include "bitpack256v.c"
|
||||
|
||||
unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
||||
__m256i v; //,sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256();
|
||||
__m128i sv = _mm_set1_epi32(start);
|
||||
BITPACK256V32(in, b, out, sv);
|
||||
return pout;
|
||||
}
|
||||
#undef VSTI
|
||||
|
||||
//------------------------------------------------------------------------------------------------------------------------------
|
||||
#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); __iv = _mm256_sub_epi32(DELTA256x32(v,__sv),cv); __sv = v
|
||||
|
||||
unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
||||
__m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
|
||||
//BITPACK256V32(in, b, out, sv); return pout;
|
||||
}
|
||||
#undef VSTI
|
||||
//------------------------------------------------------------------------------------------------------------------------------
|
||||
#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); __iv = DELTA256x32(v,__sv); __sv = v; __iv = ZIGZAG256x32(__iv)
|
||||
|
||||
unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
||||
__m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
|
||||
//BITPACK256V32(in, b, out, sv);
|
||||
return pout;
|
||||
}
|
||||
#endif
|
||||
|
||||
#undef VSTI
|
||||
#else
|
||||
#include <strings.h>
|
||||
#include <tmmintrin.h>
|
||||
|
||||
#define OPPE(__op)
|
||||
#define IPPE(__op)
|
||||
|
||||
#include "bitpack256v_.h"
|
||||
|
||||
// Bit-width dispatcher for the 256-bit packer.
// Casts the input (__pip) and output (__pop) buffers to __m256i pointers and expands
// the BITPACK256V32_<n> body (expected from bitpack256v_.h) matching __nbits.
// Width 0 emits nothing; widths outside 0..32 match no case and also do nothing.
// __parm is forwarded untouched to the per-width macro.
#define BITPACK256V32(__pip, __nbits, __pop, __parm) {\
  __m256i *__ip = (__m256i *)__pip;\
  __m256i *__op = (__m256i *)__pop;\
  switch(__nbits) {\
    case  0: break;\
    case  1: { BITPACK256V32_1( __ip, __op, __parm); } break;\
    case  2: { BITPACK256V32_2( __ip, __op, __parm); } break;\
    case  3: { BITPACK256V32_3( __ip, __op, __parm); } break;\
    case  4: { BITPACK256V32_4( __ip, __op, __parm); } break;\
    case  5: { BITPACK256V32_5( __ip, __op, __parm); } break;\
    case  6: { BITPACK256V32_6( __ip, __op, __parm); } break;\
    case  7: { BITPACK256V32_7( __ip, __op, __parm); } break;\
    case  8: { BITPACK256V32_8( __ip, __op, __parm); } break;\
    case  9: { BITPACK256V32_9( __ip, __op, __parm); } break;\
    case 10: { BITPACK256V32_10(__ip, __op, __parm); } break;\
    case 11: { BITPACK256V32_11(__ip, __op, __parm); } break;\
    case 12: { BITPACK256V32_12(__ip, __op, __parm); } break;\
    case 13: { BITPACK256V32_13(__ip, __op, __parm); } break;\
    case 14: { BITPACK256V32_14(__ip, __op, __parm); } break;\
    case 15: { BITPACK256V32_15(__ip, __op, __parm); } break;\
    case 16: { BITPACK256V32_16(__ip, __op, __parm); } break;\
    case 17: { BITPACK256V32_17(__ip, __op, __parm); } break;\
    case 18: { BITPACK256V32_18(__ip, __op, __parm); } break;\
    case 19: { BITPACK256V32_19(__ip, __op, __parm); } break;\
    case 20: { BITPACK256V32_20(__ip, __op, __parm); } break;\
    case 21: { BITPACK256V32_21(__ip, __op, __parm); } break;\
    case 22: { BITPACK256V32_22(__ip, __op, __parm); } break;\
    case 23: { BITPACK256V32_23(__ip, __op, __parm); } break;\
    case 24: { BITPACK256V32_24(__ip, __op, __parm); } break;\
    case 25: { BITPACK256V32_25(__ip, __op, __parm); } break;\
    case 26: { BITPACK256V32_26(__ip, __op, __parm); } break;\
    case 27: { BITPACK256V32_27(__ip, __op, __parm); } break;\
    case 28: { BITPACK256V32_28(__ip, __op, __parm); } break;\
    case 29: { BITPACK256V32_29(__ip, __op, __parm); } break;\
    case 30: { BITPACK256V32_30(__ip, __op, __parm); } break;\
    case 31: { BITPACK256V32_31(__ip, __op, __parm); } break;\
    case 32: { BITPACK256V32_32(__ip, __op, __parm); } break;\
  }\
}
|
||||
#endif
|
||||
108
bitunpack.h
108
bitunpack.h
@ -1,108 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2017
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" Bit Packing
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include "conf.h"
|
||||
|
||||
// ---------------- Unpack a b-bits packed integer array --------------------------------------------------------------------------------------
|
||||
// unpack a bitpacked integer array. Return value = end of packed buffer in
|
||||
unsigned char *bitunpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b);
|
||||
|
||||
// ---------------- Direct Access to a single packed integer array entry --------------------------------------------------------------------
|
||||
// Get a single 32 bits value with index "idx" (or bit index b*idx) from packed integer array
|
||||
// Random-access read of entry `idx` from a b-bit packed 32-bit integer array.
// The entry starts at bit offset b*idx: bitpos>>5 picks the 32-bit word that holds
// its first bit and bitpos&0x1f is the offset inside that word. ctou64/bzhi64 come
// from conf.h (presumably a 64-bit load and "keep the low b bits" — confirm there);
// loading 64 bits lets a value that straddles two 32-bit words be read in one go.
static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) {
  unsigned  bitpos = b*idx;
  unsigned *word   = (unsigned *)in + (bitpos >> 5);
  return bzhi64(ctou64(word) >> (bitpos & 0x1f), b);
}
|
||||
// Same as bitgetx32, but the caller supplies the absolute bit offset `bidx`
// directly instead of an entry index (no b*idx multiplication is done here).
static ALWAYS_INLINE unsigned _bitgetx32(const unsigned char *__restrict in, unsigned bidx, unsigned b) {
  unsigned *word  = (unsigned *)in + (bidx >> 5);   // 32-bit word containing the first bit
  unsigned  shift = bidx & 0x1f;                    // bit offset inside that word
  return bzhi64(ctou64(word) >> shift, b);
}
|
||||
|
||||
// like bitgetx32 but for 16 bits integer array
|
||||
// Random-access read of entry `idx` from a b-bit packed 16-bit integer array.
// The entry starts at bit offset b*idx. bidx>>4 is an index in 16-bit units and
// bidx&0xf the bit offset inside that 16-bit word, so the base pointer must be a
// 16-bit pointer: pointer arithmetic on `unsigned *` would advance 4 bytes per
// word index and address the wrong location as soon as bidx >= 16.
// ctou32/bzhi32 come from conf.h (presumably a 32-bit load and "keep the low b
// bits" — confirm there); 32 bits cover a value of up to 16 bits at offset <= 15.
static ALWAYS_INLINE unsigned bitgetx16(const unsigned char *__restrict in, unsigned idx, unsigned b) {
  unsigned bidx = b*idx;
  return bzhi32( ctou32((unsigned short *)in+(bidx>>4)) >> (bidx& 0xf), b );
}
|
||||
// Same as bitgetx16, but the caller supplies the absolute bit offset `bidx`.
// bidx>>4 counts 16-bit words, so the base pointer is cast to a 16-bit type;
// using `unsigned *` here would step 4 bytes per word index and read the wrong
// location for any bidx >= 16.
static ALWAYS_INLINE unsigned _bitgetx16(const unsigned char *__restrict in, unsigned bidx, unsigned b) {
  return bzhi32( ctou32((unsigned short *)in+(bidx>>4)) >> (bidx& 0xf), b );
}
|
||||
|
||||
// Set a single value with index "idx"
|
||||
// Overwrite entry `idx` of a b-bit packed 16-bit integer array with `v`.
// The entry starts at bit offset b*idx; bidx>>4 counts 16-bit words, so the word
// address is computed on a 16-bit pointer (an `unsigned *` would step 4 bytes per
// word index and modify the wrong location for bidx >= 16). The 32-bit
// read-modify-write at that address clears the b-bit field and ORs in `v`.
// `v` is not masked: bits of v above bit b-1 would spill into neighbouring entries.
// Note: the buffer is written even though `in` is declared const (interface kept).
static ALWAYS_INLINE void bitsetx16(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) {
  unsigned bidx = b*idx;
  unsigned *p = (unsigned *)((unsigned short *)in+(bidx>>4));
  *p = ( *p & ~(((1u <<b)-1) << (bidx& 0xf)) ) | v<<(bidx& 0xf);
}
|
||||
// Overwrite entry `idx` of a b-bit packed 32-bit integer array with `v`.
// The entry starts at bit offset b*idx: bitpos>>5 selects the 32-bit word holding
// its first bit, and a 64-bit read-modify-write from there covers a field that
// straddles two words. `v` is not masked, so it is expected to fit in b bits.
// Note: the buffer is written even though `in` is declared const.
static ALWAYS_INLINE void bitsetx32(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) {
  unsigned            bitpos = b*idx;
  unsigned            shift  = bitpos & 0x1f;
  unsigned long long *slot   = (unsigned long long *)((unsigned *)in + (bitpos >> 5));
  unsigned long long  field  = ((1ull << b) - 1) << shift;   // b one-bits positioned over the entry

  *slot = (*slot & ~field) | (unsigned long long)v << shift;
}
|
||||
|
||||
// ---------------- DFOR : integrated bitpacking, for delta packed SORTED array (Ex. DocId in inverted index) -------------------------------
|
||||
// start < out[0] < out[1] < ... < out[n-2] < out[n-1] < (1<<N)-1, N=32,16
|
||||
// out[0] = start + in[0] + 1; out[1] = out[0] + in[1] + 1; ... ; out[i] = out[i-1] + in[i] + 1
|
||||
unsigned char *bitd1unpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
unsigned char *bitd1unpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
// start <= out[0] <= out[1] <= ... <= out[n-2] <= out[n-1] <= (1<<N)-1 N=32,16
|
||||
// out[0] = start + in[0]; out[1] = out[0] + in[1]; ... ; out[i] = out[i-1] + in[i]
|
||||
unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
// ---------------- For : Direct Access for packed SORTED array (Ex. DocId in inverted index) --------------------------------------------
|
||||
// out[i] = start + in[i] + i + 1
|
||||
unsigned char *bitf1unpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1unpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
// out[i] = start + in[i] + i
|
||||
unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
// ---------------- SIMD : unpack a bit packed integer array -------------------------------------------------------------------------------
|
||||
// SIMD unpack a 128/256 bitpacked integer array. Return value = end of packed buffer in
|
||||
unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
unsigned char *bitunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack128h32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
// internal TurboPFor functions: masked unpack
|
||||
unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
|
||||
unsigned char *_bitunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack128h32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
|
||||
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
500
bitunpack256v.c
500
bitunpack256v.c
@ -1,500 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2017
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" AVX2 Bit Packing
|
||||
#ifndef VSTO
|
||||
#include <stdio.h>
|
||||
|
||||
#include <strings.h>
|
||||
#include <immintrin.h>
|
||||
#include "conf.h"
|
||||
#include "bitutil.h"
|
||||
#include "bitunpack.h"
|
||||
|
||||
#ifdef __AVX512F__
|
||||
#include <immintrin.h>
|
||||
#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
|
||||
#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
|
||||
#else
|
||||
#include <immintrin.h>
|
||||
// Lane-index table used to emulate a zero-masking 32-bit "expand" with AVX2.
// Row m (an 8-bit lane mask) holds one source index per destination lane i:
//   - bit i of m set   -> number of set bits of m below bit i (rank of that lane),
//   - bit i of m clear -> popcount(m) (filler; such lanes are blended to zero later).
// mm256_maskz_expand_epi32 widens a row to 8 x 32 bit and feeds it to
// _mm256_permutevar8x32_epi32 as the index vector.
static unsigned char shuffles[256][8] __attribute__((aligned(32))) = {
|
||||
0,0,0,0,0,0,0,0,
|
||||
0,1,1,1,1,1,1,1,
|
||||
1,0,1,1,1,1,1,1,
|
||||
0,1,2,2,2,2,2,2,
|
||||
1,1,0,1,1,1,1,1,
|
||||
0,2,1,2,2,2,2,2,
|
||||
2,0,1,2,2,2,2,2,
|
||||
0,1,2,3,3,3,3,3,
|
||||
1,1,1,0,1,1,1,1,
|
||||
0,2,2,1,2,2,2,2,
|
||||
2,0,2,1,2,2,2,2,
|
||||
0,1,3,2,3,3,3,3,
|
||||
2,2,0,1,2,2,2,2,
|
||||
0,3,1,2,3,3,3,3,
|
||||
3,0,1,2,3,3,3,3,
|
||||
0,1,2,3,4,4,4,4,
|
||||
1,1,1,1,0,1,1,1,
|
||||
0,2,2,2,1,2,2,2,
|
||||
2,0,2,2,1,2,2,2,
|
||||
0,1,3,3,2,3,3,3,
|
||||
2,2,0,2,1,2,2,2,
|
||||
0,3,1,3,2,3,3,3,
|
||||
3,0,1,3,2,3,3,3,
|
||||
0,1,2,4,3,4,4,4,
|
||||
2,2,2,0,1,2,2,2,
|
||||
0,3,3,1,2,3,3,3,
|
||||
3,0,3,1,2,3,3,3,
|
||||
0,1,4,2,3,4,4,4,
|
||||
3,3,0,1,2,3,3,3,
|
||||
0,4,1,2,3,4,4,4,
|
||||
4,0,1,2,3,4,4,4,
|
||||
0,1,2,3,4,5,5,5,
|
||||
1,1,1,1,1,0,1,1,
|
||||
0,2,2,2,2,1,2,2,
|
||||
2,0,2,2,2,1,2,2,
|
||||
0,1,3,3,3,2,3,3,
|
||||
2,2,0,2,2,1,2,2,
|
||||
0,3,1,3,3,2,3,3,
|
||||
3,0,1,3,3,2,3,3,
|
||||
0,1,2,4,4,3,4,4,
|
||||
2,2,2,0,2,1,2,2,
|
||||
0,3,3,1,3,2,3,3,
|
||||
3,0,3,1,3,2,3,3,
|
||||
0,1,4,2,4,3,4,4,
|
||||
3,3,0,1,3,2,3,3,
|
||||
0,4,1,2,4,3,4,4,
|
||||
4,0,1,2,4,3,4,4,
|
||||
0,1,2,3,5,4,5,5,
|
||||
2,2,2,2,0,1,2,2,
|
||||
0,3,3,3,1,2,3,3,
|
||||
3,0,3,3,1,2,3,3,
|
||||
0,1,4,4,2,3,4,4,
|
||||
3,3,0,3,1,2,3,3,
|
||||
0,4,1,4,2,3,4,4,
|
||||
4,0,1,4,2,3,4,4,
|
||||
0,1,2,5,3,4,5,5,
|
||||
3,3,3,0,1,2,3,3,
|
||||
0,4,4,1,2,3,4,4,
|
||||
4,0,4,1,2,3,4,4,
|
||||
0,1,5,2,3,4,5,5,
|
||||
4,4,0,1,2,3,4,4,
|
||||
0,5,1,2,3,4,5,5,
|
||||
5,0,1,2,3,4,5,5,
|
||||
0,1,2,3,4,5,6,6,
|
||||
1,1,1,1,1,1,0,1,
|
||||
0,2,2,2,2,2,1,2,
|
||||
2,0,2,2,2,2,1,2,
|
||||
0,1,3,3,3,3,2,3,
|
||||
2,2,0,2,2,2,1,2,
|
||||
0,3,1,3,3,3,2,3,
|
||||
3,0,1,3,3,3,2,3,
|
||||
0,1,2,4,4,4,3,4,
|
||||
2,2,2,0,2,2,1,2,
|
||||
0,3,3,1,3,3,2,3,
|
||||
3,0,3,1,3,3,2,3,
|
||||
0,1,4,2,4,4,3,4,
|
||||
3,3,0,1,3,3,2,3,
|
||||
0,4,1,2,4,4,3,4,
|
||||
4,0,1,2,4,4,3,4,
|
||||
0,1,2,3,5,5,4,5,
|
||||
2,2,2,2,0,2,1,2,
|
||||
0,3,3,3,1,3,2,3,
|
||||
3,0,3,3,1,3,2,3,
|
||||
0,1,4,4,2,4,3,4,
|
||||
3,3,0,3,1,3,2,3,
|
||||
0,4,1,4,2,4,3,4,
|
||||
4,0,1,4,2,4,3,4,
|
||||
0,1,2,5,3,5,4,5,
|
||||
3,3,3,0,1,3,2,3,
|
||||
0,4,4,1,2,4,3,4,
|
||||
4,0,4,1,2,4,3,4,
|
||||
0,1,5,2,3,5,4,5,
|
||||
4,4,0,1,2,4,3,4,
|
||||
0,5,1,2,3,5,4,5,
|
||||
5,0,1,2,3,5,4,5,
|
||||
0,1,2,3,4,6,5,6,
|
||||
2,2,2,2,2,0,1,2,
|
||||
0,3,3,3,3,1,2,3,
|
||||
3,0,3,3,3,1,2,3,
|
||||
0,1,4,4,4,2,3,4,
|
||||
3,3,0,3,3,1,2,3,
|
||||
0,4,1,4,4,2,3,4,
|
||||
4,0,1,4,4,2,3,4,
|
||||
0,1,2,5,5,3,4,5,
|
||||
3,3,3,0,3,1,2,3,
|
||||
0,4,4,1,4,2,3,4,
|
||||
4,0,4,1,4,2,3,4,
|
||||
0,1,5,2,5,3,4,5,
|
||||
4,4,0,1,4,2,3,4,
|
||||
0,5,1,2,5,3,4,5,
|
||||
5,0,1,2,5,3,4,5,
|
||||
0,1,2,3,6,4,5,6,
|
||||
3,3,3,3,0,1,2,3,
|
||||
0,4,4,4,1,2,3,4,
|
||||
4,0,4,4,1,2,3,4,
|
||||
0,1,5,5,2,3,4,5,
|
||||
4,4,0,4,1,2,3,4,
|
||||
0,5,1,5,2,3,4,5,
|
||||
5,0,1,5,2,3,4,5,
|
||||
0,1,2,6,3,4,5,6,
|
||||
4,4,4,0,1,2,3,4,
|
||||
0,5,5,1,2,3,4,5,
|
||||
5,0,5,1,2,3,4,5,
|
||||
0,1,6,2,3,4,5,6,
|
||||
5,5,0,1,2,3,4,5,
|
||||
0,6,1,2,3,4,5,6,
|
||||
6,0,1,2,3,4,5,6,
|
||||
0,1,2,3,4,5,6,7,
|
||||
1,1,1,1,1,1,1,0,
|
||||
0,2,2,2,2,2,2,1,
|
||||
2,0,2,2,2,2,2,1,
|
||||
0,1,3,3,3,3,3,2,
|
||||
2,2,0,2,2,2,2,1,
|
||||
0,3,1,3,3,3,3,2,
|
||||
3,0,1,3,3,3,3,2,
|
||||
0,1,2,4,4,4,4,3,
|
||||
2,2,2,0,2,2,2,1,
|
||||
0,3,3,1,3,3,3,2,
|
||||
3,0,3,1,3,3,3,2,
|
||||
0,1,4,2,4,4,4,3,
|
||||
3,3,0,1,3,3,3,2,
|
||||
0,4,1,2,4,4,4,3,
|
||||
4,0,1,2,4,4,4,3,
|
||||
0,1,2,3,5,5,5,4,
|
||||
2,2,2,2,0,2,2,1,
|
||||
0,3,3,3,1,3,3,2,
|
||||
3,0,3,3,1,3,3,2,
|
||||
0,1,4,4,2,4,4,3,
|
||||
3,3,0,3,1,3,3,2,
|
||||
0,4,1,4,2,4,4,3,
|
||||
4,0,1,4,2,4,4,3,
|
||||
0,1,2,5,3,5,5,4,
|
||||
3,3,3,0,1,3,3,2,
|
||||
0,4,4,1,2,4,4,3,
|
||||
4,0,4,1,2,4,4,3,
|
||||
0,1,5,2,3,5,5,4,
|
||||
4,4,0,1,2,4,4,3,
|
||||
0,5,1,2,3,5,5,4,
|
||||
5,0,1,2,3,5,5,4,
|
||||
0,1,2,3,4,6,6,5,
|
||||
2,2,2,2,2,0,2,1,
|
||||
0,3,3,3,3,1,3,2,
|
||||
3,0,3,3,3,1,3,2,
|
||||
0,1,4,4,4,2,4,3,
|
||||
3,3,0,3,3,1,3,2,
|
||||
0,4,1,4,4,2,4,3,
|
||||
4,0,1,4,4,2,4,3,
|
||||
0,1,2,5,5,3,5,4,
|
||||
3,3,3,0,3,1,3,2,
|
||||
0,4,4,1,4,2,4,3,
|
||||
4,0,4,1,4,2,4,3,
|
||||
0,1,5,2,5,3,5,4,
|
||||
4,4,0,1,4,2,4,3,
|
||||
0,5,1,2,5,3,5,4,
|
||||
5,0,1,2,5,3,5,4,
|
||||
0,1,2,3,6,4,6,5,
|
||||
3,3,3,3,0,1,3,2,
|
||||
0,4,4,4,1,2,4,3,
|
||||
4,0,4,4,1,2,4,3,
|
||||
0,1,5,5,2,3,5,4,
|
||||
4,4,0,4,1,2,4,3,
|
||||
0,5,1,5,2,3,5,4,
|
||||
5,0,1,5,2,3,5,4,
|
||||
0,1,2,6,3,4,6,5,
|
||||
4,4,4,0,1,2,4,3,
|
||||
0,5,5,1,2,3,5,4,
|
||||
5,0,5,1,2,3,5,4,
|
||||
0,1,6,2,3,4,6,5,
|
||||
5,5,0,1,2,3,5,4,
|
||||
0,6,1,2,3,4,6,5,
|
||||
6,0,1,2,3,4,6,5,
|
||||
0,1,2,3,4,5,7,6,
|
||||
2,2,2,2,2,2,0,1,
|
||||
0,3,3,3,3,3,1,2,
|
||||
3,0,3,3,3,3,1,2,
|
||||
0,1,4,4,4,4,2,3,
|
||||
3,3,0,3,3,3,1,2,
|
||||
0,4,1,4,4,4,2,3,
|
||||
4,0,1,4,4,4,2,3,
|
||||
0,1,2,5,5,5,3,4,
|
||||
3,3,3,0,3,3,1,2,
|
||||
0,4,4,1,4,4,2,3,
|
||||
4,0,4,1,4,4,2,3,
|
||||
0,1,5,2,5,5,3,4,
|
||||
4,4,0,1,4,4,2,3,
|
||||
0,5,1,2,5,5,3,4,
|
||||
5,0,1,2,5,5,3,4,
|
||||
0,1,2,3,6,6,4,5,
|
||||
3,3,3,3,0,3,1,2,
|
||||
0,4,4,4,1,4,2,3,
|
||||
4,0,4,4,1,4,2,3,
|
||||
0,1,5,5,2,5,3,4,
|
||||
4,4,0,4,1,4,2,3,
|
||||
0,5,1,5,2,5,3,4,
|
||||
5,0,1,5,2,5,3,4,
|
||||
0,1,2,6,3,6,4,5,
|
||||
4,4,4,0,1,4,2,3,
|
||||
0,5,5,1,2,5,3,4,
|
||||
5,0,5,1,2,5,3,4,
|
||||
0,1,6,2,3,6,4,5,
|
||||
5,5,0,1,2,5,3,4,
|
||||
0,6,1,2,3,6,4,5,
|
||||
6,0,1,2,3,6,4,5,
|
||||
0,1,2,3,4,7,5,6,
|
||||
3,3,3,3,3,0,1,2,
|
||||
0,4,4,4,4,1,2,3,
|
||||
4,0,4,4,4,1,2,3,
|
||||
0,1,5,5,5,2,3,4,
|
||||
4,4,0,4,4,1,2,3,
|
||||
0,5,1,5,5,2,3,4,
|
||||
5,0,1,5,5,2,3,4,
|
||||
0,1,2,6,6,3,4,5,
|
||||
4,4,4,0,4,1,2,3,
|
||||
0,5,5,1,5,2,3,4,
|
||||
5,0,5,1,5,2,3,4,
|
||||
0,1,6,2,6,3,4,5,
|
||||
5,5,0,1,5,2,3,4,
|
||||
0,6,1,2,6,3,4,5,
|
||||
6,0,1,2,6,3,4,5,
|
||||
0,1,2,3,7,4,5,6,
|
||||
4,4,4,4,0,1,2,3,
|
||||
0,5,5,5,1,2,3,4,
|
||||
5,0,5,5,1,2,3,4,
|
||||
0,1,6,6,2,3,4,5,
|
||||
5,5,0,5,1,2,3,4,
|
||||
0,6,1,6,2,3,4,5,
|
||||
6,0,1,6,2,3,4,5,
|
||||
0,1,2,7,3,4,5,6,
|
||||
5,5,5,0,1,2,3,4,
|
||||
0,6,6,1,2,3,4,5,
|
||||
6,0,6,1,2,3,4,5,
|
||||
0,1,7,2,3,4,5,6,
|
||||
6,6,0,1,2,3,4,5,
|
||||
0,7,1,2,3,4,5,6,
|
||||
7,0,1,2,3,4,5,6,
|
||||
0,1,2,3,4,5,6,7
|
||||
};
|
||||
// u2vmask: turn the 8-bit lane mask _m_ into a per-lane blend mask. _m_ is
// replicated into every byte, then each 32-bit lane is shifted left by the count in
// the matching lane of _tv_. With _tv_ = _mm256_set_epi32(0,1,2,3,4,5,6,7) lane i is
// shifted by 7-i, which moves bit i of _m_ into the top bit of every byte of lane i
// (the bit _mm256_blendv_epi8 tests).
#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_)
// AVX2 stand-in for _mm256_maskz_expand_epi32: permute _v_ with the index row
// shuffles[_m_] (8 bytes widened to 8 x 32 bit). Lanes whose mask bit is clear are
// NOT zeroed here; that is done by the blend in mm256_maskz_loadu_epi32.
#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(shuffles[_m_]))) )
// AVX2 stand-in for _mm256_maskz_loadu_epi32: unaligned 256-bit load from _v_,
// expand under mask _m_, then zero the lanes whose mask bit is clear.
// The macro now uses its own parameters instead of the caller's `xm`/`pex`
// (all visible call sites pass exactly those). It still relies on `zv` (all-zero
// vector) and `tv` (lane shift counts) being in scope at the point of use, and it
// evaluates _m_ twice, so _m_ must be free of side effects.
#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(_m_, _mm256_loadu_si256(_v_)), u2vmask(_m_,tv))
|
||||
#endif
|
||||
|
||||
// Bytes required for nbits_ bits: nbits_ / 8 rounded up to the next whole byte.
#define PAD8(nbits_) (((nbits_) + 7) / 8)
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO( _op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, ov)
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_)
|
||||
#include "bitunpack256v.c"
|
||||
|
||||
// Zero-bit-width block: nothing is read from the packed input, the block just
// expands VSTO0 32 times (indices 0..31) on the output pointer _op_.
// `ov` is a scratch vector handed to VSTO0; the `ip` and `_i_` parameters are not
// used by this macro. What each step stores is decided by the VSTO0 definition
// that is active where the macro is expanded.
#define BITUNBLK256V32_0(ip, _i_, _op_, _parm_) {__m256i ov;\
  VSTO0(_op_,  0, ov, _parm_); VSTO0(_op_,  1, ov, _parm_); VSTO0(_op_,  2, ov, _parm_); VSTO0(_op_,  3, ov, _parm_);\
  VSTO0(_op_,  4, ov, _parm_); VSTO0(_op_,  5, ov, _parm_); VSTO0(_op_,  6, ov, _parm_); VSTO0(_op_,  7, ov, _parm_);\
  VSTO0(_op_,  8, ov, _parm_); VSTO0(_op_,  9, ov, _parm_); VSTO0(_op_, 10, ov, _parm_); VSTO0(_op_, 11, ov, _parm_);\
  VSTO0(_op_, 12, ov, _parm_); VSTO0(_op_, 13, ov, _parm_); VSTO0(_op_, 14, ov, _parm_); VSTO0(_op_, 15, ov, _parm_);\
  VSTO0(_op_, 16, ov, _parm_); VSTO0(_op_, 17, ov, _parm_); VSTO0(_op_, 18, ov, _parm_); VSTO0(_op_, 19, ov, _parm_);\
  VSTO0(_op_, 20, ov, _parm_); VSTO0(_op_, 21, ov, _parm_); VSTO0(_op_, 22, ov, _parm_); VSTO0(_op_, 23, ov, _parm_);\
  VSTO0(_op_, 24, ov, _parm_); VSTO0(_op_, 25, ov, _parm_); VSTO0(_op_, 26, ov, _parm_); VSTO0(_op_, 27, ov, _parm_);\
  VSTO0(_op_, 28, ov, _parm_); VSTO0(_op_, 29, ov, _parm_); VSTO0(_op_, 30, ov, _parm_); VSTO0(_op_, 31, ov, _parm_);\
}
|
||||
#define BITUNPACK0(_parm_) _parm_ = _mm256_setzero_si256()
|
||||
|
||||
// Plain AVX2 unpack of one b-bit packed block from `in` into `out`.
// With the VSTO/VSTO0 definitions active for this variant, every unpacked vector is
// stored unchanged; for b == 0 BITUNPACK0 zeroes `sv` and VSTO0 stores it.
// `n` is not referenced by this function body.
// Returns in + PAD8(256*b), the first byte after the packed block.
unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) {
  __m256i sv;   // only written/read on the b == 0 path by the macros visible in this file

  BITUNPACK256V32(in, b, out, sv);
  return (unsigned char *)in + PAD8(256*b);
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
|
||||
//--------------------------------------- zeromask unpack for TurboPFor vp4d.c --------------------------------------
|
||||
#define VSTO(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) )); pex += popcnt32(xm)
|
||||
#define VSTO0(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm)
|
||||
#define BITUNPACK0(_parm_)
|
||||
#include "bitunpack256v.c"
|
||||
|
||||
// Masked unpack (see the "zeromask unpack for TurboPFor" section above).
// For every stored vector, VSTO/VSTO0 consume one mask byte from `bb`, load the
// values selected by that mask from `pex` and advance `pex` by popcnt32(mask).
// For b > 0 the loaded values are shifted left by b and added to the unpacked
// vector; for b == 0 they are stored as-is.
// `n` is not referenced by this function body.
// Returns in + PAD8(256*b), the first byte after the packed block.
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
  unsigned xm;                                       // current mask byte, assigned inside VSTO/VSTO0
  __m256i  sv;                                       // macro parameter slot; not initialised here
  __m256i  zv = _mm256_setzero_si256();              // zero source for the blend in mm256_maskz_loadu_epi32 (AVX2 path)
  __m256i  tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);   // per-lane shift counts for u2vmask (AVX2 path)

  BITUNPACK256V32(in, b, out, sv);
  return (unsigned char *)in + PAD8(256*b);
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
//--------------------------------
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_)
|
||||
#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG256x32(__ov); SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv)
|
||||
#include "bitunpack256v.c"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
// Zigzag + delta unpack of one b-bit packed block from `in` into `out`.
// For b > 0 each unpacked vector goes through UNZIGZAG256x32 and SCAN256x32 (both
// from outside this file) and the running vector `sv` is stored; for b == 0 VSTO0
// stores `sv` (all lanes = start) for the whole block.
// `n` is not referenced by this function body.
// Returns in + PAD8(256*b), the first byte after the packed block.
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
  __m256i sv = _mm256_set1_epi32(start);   // running vector, seeded with `start` in every lane
  __m256i zv = _mm256_setzero_si256();     // not used directly here; presumably read by the scan helpers (TODO confirm)

  BITUNPACK256V32(in, b, out, sv);
  return (unsigned char *)in + PAD8(256*b);
}
|
||||
#undef VSTO
|
||||
#undef BITUNPACK0
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO(__op, i, __ov, __sv) SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv)
|
||||
#include "bitunpack256v.c"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
// Delta unpack of one b-bit packed block from `in` into `out`.
// For b > 0 each unpacked vector is run through SCAN256x32 (from outside this
// file) together with the running vector `sv`, and `sv` is stored; for b == 0 the
// VSTO0 still in effect stores `sv` (all lanes = start) for the whole block.
// `n` is not referenced by this function body.
// Returns in + PAD8(256*b), the first byte after the packed block.
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
  __m256i sv = _mm256_set1_epi32(start);   // running vector, seeded with `start` in every lane
  __m256i zv = _mm256_setzero_si256();     // not used directly here; presumably read by SCAN256x32 (TODO confirm)

  BITUNPACK256V32(in, b, out, sv);
  return (unsigned char *)in + PAD8(256*b);
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VEXP(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm)
|
||||
#define VEXP0(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm)
|
||||
|
||||
#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack256v.c"
|
||||
|
||||
#define BITUNPACK0(_parm_)
|
||||
|
||||
// Masked delta unpack of one b-bit packed block from `in` into `out`.
// Each step first applies VEXP/VEXP0: one mask byte is read from `bb`, the values
// it selects are loaded from `pex` (shifted left by b and added to the unpacked
// vector when b > 0, taken as-is when b == 0) and `pex` advances by popcnt32(mask).
// The result then goes through SCAN256x32 with the running vector `sv`, which is
// what gets stored.
// `n` is not referenced by this function body.
// Returns in + PAD8(256*b), the first byte after the packed block.
unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
  unsigned xm;                                       // current mask byte, assigned inside VEXP/VEXP0
  __m256i  sv = _mm256_set1_epi32(start);            // running vector, seeded with `start` in every lane
  __m256i  zv = _mm256_setzero_si256();              // zero source for the blend in mm256_maskz_loadu_epi32 (AVX2 path)
  __m256i  tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);   // per-lane shift counts for u2vmask (AVX2 path)

  BITUNPACK256V32(in, b, out, sv);
  return (unsigned char *)in + PAD8(256*b);
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO(__op, i, __ov, __sv) SCANI256x32(__ov,__sv,cv); _mm256_storeu_si256(__op++, __sv);
|
||||
#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv)
|
||||
#include "bitunpack256v.c"
|
||||
|
||||
#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8)
|
||||
|
||||
// "Delta + 1" unpack of one b-bit packed block from `in` into `out`.
// For b > 0 each unpacked vector is combined with the running vector `sv` by
// SCANI256x32 (from outside this file, also given `cv`) and `sv` is stored.
// For b == 0 BITUNPACK0 adds cv (1..8 across lanes 0..7) to `sv` once and then
// switches cv to 8 in every lane; each VSTO0 stores `sv` and adds cv, producing
// start+1, start+2, ... across the block.
// `n` is not referenced by this function body.
// Returns in + PAD8(256*b), the first byte after the packed block.
unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) {
  __m256i sv = _mm256_set1_epi32(start);             // running vector, seeded with `start` in every lane
  __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);    // lane i holds i+1
  __m256i zv = _mm256_setzero_si256();               // not used directly here; presumably read by SCANI256x32 (TODO confirm)

  BITUNPACK256V32(in, b, out, sv);
  return (unsigned char *)in + PAD8(256*b);
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
//-----------------------------------------------------------------------------
|
||||
#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_);
|
||||
|
||||
#include "bitunpack256v.c"
|
||||
|
||||
#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128()
|
||||
|
||||
// Masked "delta + 1" unpack of one b-bit packed block from `in` into `out`.
// Each step first applies VEXP/VEXP0: one mask byte is read from `bb`, the values
// it selects are loaded from `pex` (shifted left by b and added to the unpacked
// vector when b > 0, taken as-is when b == 0) and `pex` advances by popcnt32(mask).
// The result is then combined with the running vector `sv` by SCANI256x32 (from
// outside this file, also given `cv`) and `sv` is stored.
// For b == 0 BITUNPACK0 only sets the dispatcher's `mv` to zero.
// `n` is not referenced by this function body.
// Returns in + PAD8(256*b), the first byte after the packed block.
unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) {
  unsigned xm;                                       // current mask byte, assigned inside VEXP/VEXP0
  __m256i  sv = _mm256_set1_epi32(start);            // running vector, seeded with `start` in every lane
  __m256i  cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);   // lane i holds i+1
  __m256i  zv = _mm256_setzero_si256();              // zero source for the blend in mm256_maskz_loadu_epi32 (AVX2 path)
  __m256i  tv = _mm256_set_epi32(0,1,2,3,4,5,6,7);   // per-lane shift counts for u2vmask (AVX2 path)

  BITUNPACK256V32(in, b, out, sv);
  return (unsigned char *)in + PAD8(256*b);
}
|
||||
#undef VSTO
|
||||
#undef VSTO0
|
||||
#undef BITUNPACK0
|
||||
|
||||
#else
|
||||
#include "bitunpack256v_.h"
|
||||
|
||||
/* Bit-width dispatcher shared by every unpack instantiation in this file.
 * Opens a block that declares mv (mask vector), _ov (= __op viewed as __m256i*) and _iv (= __ip viewed as __m256i*),
 * then switches on (__nbits & 0x3f):
 *   case 0      - runs the instantiation's BITUNPACK0(_parm_) hook, then BITUNPACK256V32_0
 *   case 1..32  - loads mv with the low-__nbits-bits mask in every 32-bit lane, then runs BITUNPACK256V32_<bits>
 *   case 33..63 - does nothing (`case 33 ... 63` is the GNU C case-range extension)
 * For 32 bits the mask is written as (1ull<<32)-1, i.e. 0xFFFFFFFF converted to the int argument of
 * _mm256_set1_epi32 (all bits set on two's-complement targets).
 * NOTE(review): the BITUNPACK256V32_<bits> macros are presumably provided by bitunpack256v_.h, included just
 * above, and are expected to use mv plus the VSTO/VSTO0 hooks; confirm there.
 */
#define BITUNPACK256V32(__ip, __nbits, __op, _parm_) { __m256i mv,*_ov=(__m256i *)__op,*_iv=(__m256i *)__ip; \
|
||||
switch(__nbits&0x3f) {\
|
||||
case 0: BITUNPACK0(_parm_); BITUNPACK256V32_0( _iv, _ov, _parm_); break;\
|
||||
case 1: mv = _mm256_set1_epi32((1u<< 1)-1); BITUNPACK256V32_1( _iv, _ov, _parm_); break;\
|
||||
case 2: mv = _mm256_set1_epi32((1u<< 2)-1); BITUNPACK256V32_2( _iv, _ov, _parm_); break;\
|
||||
case 3: mv = _mm256_set1_epi32((1u<< 3)-1); BITUNPACK256V32_3( _iv, _ov, _parm_); break;\
|
||||
case 4: mv = _mm256_set1_epi32((1u<< 4)-1); BITUNPACK256V32_4( _iv, _ov, _parm_); break;\
|
||||
case 5: mv = _mm256_set1_epi32((1u<< 5)-1); BITUNPACK256V32_5( _iv, _ov, _parm_); break;\
|
||||
case 6: mv = _mm256_set1_epi32((1u<< 6)-1); BITUNPACK256V32_6( _iv, _ov, _parm_); break;\
|
||||
case 7: mv = _mm256_set1_epi32((1u<< 7)-1); BITUNPACK256V32_7( _iv, _ov, _parm_); break;\
|
||||
case 8: mv = _mm256_set1_epi32((1u<< 8)-1); BITUNPACK256V32_8( _iv, _ov, _parm_); break;\
|
||||
case 9: mv = _mm256_set1_epi32((1u<< 9)-1); BITUNPACK256V32_9( _iv, _ov, _parm_); break;\
|
||||
case 10: mv = _mm256_set1_epi32((1u<<10)-1); BITUNPACK256V32_10(_iv, _ov, _parm_); break;\
|
||||
case 11: mv = _mm256_set1_epi32((1u<<11)-1); BITUNPACK256V32_11(_iv, _ov, _parm_); break;\
|
||||
case 12: mv = _mm256_set1_epi32((1u<<12)-1); BITUNPACK256V32_12(_iv, _ov, _parm_); break;\
|
||||
case 13: mv = _mm256_set1_epi32((1u<<13)-1); BITUNPACK256V32_13(_iv, _ov, _parm_); break;\
|
||||
case 14: mv = _mm256_set1_epi32((1u<<14)-1); BITUNPACK256V32_14(_iv, _ov, _parm_); break;\
|
||||
case 15: mv = _mm256_set1_epi32((1u<<15)-1); BITUNPACK256V32_15(_iv, _ov, _parm_); break;\
|
||||
case 16: mv = _mm256_set1_epi32((1u<<16)-1); BITUNPACK256V32_16(_iv, _ov, _parm_); break;\
|
||||
case 17: mv = _mm256_set1_epi32((1u<<17)-1); BITUNPACK256V32_17(_iv, _ov, _parm_); break;\
|
||||
case 18: mv = _mm256_set1_epi32((1u<<18)-1); BITUNPACK256V32_18(_iv, _ov, _parm_); break;\
|
||||
case 19: mv = _mm256_set1_epi32((1u<<19)-1); BITUNPACK256V32_19(_iv, _ov, _parm_); break;\
|
||||
case 20: mv = _mm256_set1_epi32((1u<<20)-1); BITUNPACK256V32_20(_iv, _ov, _parm_); break;\
|
||||
case 21: mv = _mm256_set1_epi32((1u<<21)-1); BITUNPACK256V32_21(_iv, _ov, _parm_); break;\
|
||||
case 22: mv = _mm256_set1_epi32((1u<<22)-1); BITUNPACK256V32_22(_iv, _ov, _parm_); break;\
|
||||
case 23: mv = _mm256_set1_epi32((1u<<23)-1); BITUNPACK256V32_23(_iv, _ov, _parm_); break;\
|
||||
case 24: mv = _mm256_set1_epi32((1u<<24)-1); BITUNPACK256V32_24(_iv, _ov, _parm_); break;\
|
||||
case 25: mv = _mm256_set1_epi32((1u<<25)-1); BITUNPACK256V32_25(_iv, _ov, _parm_); break;\
|
||||
case 26: mv = _mm256_set1_epi32((1u<<26)-1); BITUNPACK256V32_26(_iv, _ov, _parm_); break;\
|
||||
case 27: mv = _mm256_set1_epi32((1u<<27)-1); BITUNPACK256V32_27(_iv, _ov, _parm_); break;\
|
||||
case 28: mv = _mm256_set1_epi32((1u<<28)-1); BITUNPACK256V32_28(_iv, _ov, _parm_); break;\
|
||||
case 29: mv = _mm256_set1_epi32((1u<<29)-1); BITUNPACK256V32_29(_iv, _ov, _parm_); break;\
|
||||
case 30: mv = _mm256_set1_epi32((1u<<30)-1); BITUNPACK256V32_30(_iv, _ov, _parm_); break;\
|
||||
case 31: mv = _mm256_set1_epi32((1u<<31)-1); BITUNPACK256V32_31(_iv, _ov, _parm_); break;\
|
||||
case 32: mv = _mm256_set1_epi32((1ull<<32)-1);BITUNPACK256V32_32(_iv, _ov, _parm_); break;\
|
||||
case 33 ... 63: break;\
|
||||
}\
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
2
vp4c.c
2
vp4c.c
@ -29,7 +29,7 @@
|
||||
#include "bitpack.h"
|
||||
#include "vint.h"
|
||||
#include "bitutil.h"
|
||||
#include "vp4c.h"
|
||||
#include "vp4.h"
|
||||
#undef P4DELTA
|
||||
|
||||
// Ceiling division by 8: smallest count of 8-unit groups that holds _x_ (e.g. a bit count rounded up to bytes).
#define PAD8(_x_) (((_x_) + 7) / 8)
|
||||
|
||||
98
vp4c.h
98
vp4c.h
@ -1,98 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2017
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" TurboPFor (see vp4d.h for decompression)
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Low level API: Single block n limited
|
||||
//compress integer array with n values to the buffer out. Return value = end of compressed buffer out
|
||||
unsigned char *p4enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4enc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4encx16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out);// Direct access
|
||||
unsigned char *p4encx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4denc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *p4denc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *p4denc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4denc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *p4denc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4dencx16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);// Direct access
|
||||
unsigned char *p4dencx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *p4d1enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *p4d1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *p4d1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4d1enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *p4d1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4d1encx16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);// Direct access
|
||||
unsigned char *p4d1encx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
// same as p4enc, but with b and bx as parameters. Call after _p4bitsXX
|
||||
ALWAYS_INLINE unsigned char *_p4enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
ALWAYS_INLINE unsigned char *_p4enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
ALWAYS_INLINE unsigned char *_p4enc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
|
||||
ALWAYS_INLINE unsigned char *_p4enc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
ALWAYS_INLINE unsigned char *_p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
// calculate the best bit sizes b and bx, return b.
|
||||
ALWAYS_INLINE unsigned _p4bits16( unsigned short *__restrict in, unsigned n, unsigned *pbx);
|
||||
ALWAYS_INLINE unsigned _p4bits32( unsigned *__restrict in, unsigned n, unsigned *pbx);
|
||||
ALWAYS_INLINE unsigned _p4bits64( uint64_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
|
||||
//************************** n unlimited ************************************************************************************
|
||||
// compress integer array with n values to the buffer out. Return value = end of compressed buffer out
|
||||
unsigned char *p4nenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4nenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4nenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4nenc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4nenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4ndenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *p4ndenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *p4ndenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4ndenc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *p4ndenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4nd1enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *p4nd1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *p4nd1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4nd1enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *p4nd1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
// NOTE(review): presumably the upper bound on n for the single-block ("n limited") p4enc*/p4denc* entry points
// declared in this header - confirm against the encoder implementation.
#define P4D_MAX 256
|
||||
|
||||
// One-byte block header value used by P4SAVE when _bx_ is zero: _b_ shifted left by one, leaving bit 0 clear.
// The argument is parenthesised so that expressions with operators of lower precedence than << (|, &, ?:, ...)
// are shifted as a whole, e.g. P4EB(1|2) == 6.
#define P4EB(_b_) ((_b_) << 1)
|
||||
// Two-byte block header value used by P4SAVE when _bx_ is non-zero: _bx_ from bit 8 upward, _b_ shifted left by
// one, and bit 0 set (bit 0 is what distinguishes this form from P4EB).
// Both arguments are parenthesised so that compound expressions are shifted as a whole,
// e.g. P4EBX(1|2, 1) == 263.
#define P4EBX(_b_, _bx_) ((_bx_) << 8 | (_b_) << 1 | 1)
|
||||
// Write the block header at _out_ and advance _out_ past it:
//   _bx_ == 0 : one byte,  P4EB(_b_)
//   _bx_ != 0 : two bytes, P4EBX(_b_, _bx_), stored through an unsigned short* (native byte order)
// Every argument is parenthesised before it is tested or forwarded, so compound expressions such as
// P4SAVE(p, b, x|y) select the right branch and encode the right value. _out_ must be a modifiable
// unsigned char* lvalue; _b_ and _bx_ may be evaluated more than once.
// NOTE(review): the 16-bit store may be unaligned; that is fine on x86 - verify before porting to other targets.
#define P4SAVE(_out_, _b_, _bx_) do { if(!(_bx_)) *(_out_)++ = P4EB((_b_)); else *(unsigned short *)(_out_) = P4EBX((_b_), (_bx_)), (_out_) += 2; } while(0)
|
||||
|
||||
Reference in New Issue
Block a user