/** Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - homepage : https://sites.google.com/site/powturbo/ - github : https://github.com/powturbo - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ // "Integer Compression" TurboPFor - Pfor/PforDelta #ifndef USIZE #include #include "conf.h" #include "bitutil.h" #include "bitpack.h" #include "vint.h" //#include "vsimple.h" #include "vp4.h" #define PAD8(__x) ( (((__x)+8-1)/8) ) #if 0 //defined(__AVX_2__) #include "avx2.h" #elif defined(__SSSE3__) #include static ALIGNED(char, shuffles[16][16], 16) = { #define _ 0x80 { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ }, { 0,1,2,3, _,_,_,_, _,_, _, _, _, _, _,_ }, { _,_,_,_, 0,1,2,3, _,_, _, _, _, _, _,_ }, { 0,1,2,3, 4,5,6,7, _,_, _, _, _, _, _,_ }, { _,_,_,_, _,_,_,_, 0,1, 2, 3, _, _, _,_ }, { 0,1,2,3, _,_,_,_, 4,5, 6, 7, _, _, _,_ }, { _,_,_,_, 0,1,2,3, 4,5, 6, 7, _, _, _,_ }, { 0,1,2,3, 4,5,6,7, 8,9,10,11, _, _, _,_ }, { _,_,_,_, _,_,_,_, _,_,_,_, 0, 1, 2, 3 }, { 0,1,2,3, _,_,_,_, _,_,_, _, 4, 5, 6, 7 }, { _,_,_,_, 0,1,2,3, _,_,_, _, 4, 5, 6, 7 }, { 0,1,2,3, 4,5,6,7, _,_, _, _, 8, 9,10,11 }, { _,_,_,_, _,_,_,_, 0,1, 2, 3, 4, 5, 6, 7 }, { 0,1,2,3, _,_,_,_, 4,5, 6, 7, 8, 9,10,11 }, { _,_,_,_, 0,1,2,3, 4,5, 6, 7, 8, 9,10,11 }, { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, #undef _ }; #endif #define P4DELTA(a) #define P4DELTA_(a) #undef DELTA #define _P4DEC _p4dec #define P4DEC p4dec #define P4NDEC p4ndec #define P4NDECS p4dec #define BITUNPACK bitunpack // unpack only #define BITUNPACKD bitunpack // integrated unpack #define _BITUNPACKD bitunpack // integrated pfor #define P4DECX // direct access no 64 bits #define USIZE 8 #include "vp4d.c" #define USIZE 16 #include "vp4d.c" #define USIZE 32 #include "vp4d.c" #undef P4DECX #define USIZE 64 #include "vp4d.c" #define P4DELTA(a) ,a #define P4DELTA_(a) a #define DELTA #define _P4DEC _p4ddec //delta0 #define P4DEC p4ddec #define P4NDEC p4nddec #define P4NDECS p4ddec #define BITUNPACKD bitdunpack #define _BITUNPACKD _bitdunpack #define BITUNDD bitddec #define USIZE 8 #include "vp4d.c" #define USIZE 16 #include "vp4d.c" #define USIZE 32 #include "vp4d.c" #define USIZE 64 #include "vp4d.c" #define _P4DEC _p4d1dec //delta1 #define P4DEC p4d1dec #define P4NDEC p4nd1dec #define P4NDECS p4d1dec #define BITUNPACKD bitd1unpack #define _BITUNPACKD bitd1unpack #define BITUNDD bitd1dec #define USIZE 8 #include "vp4d.c" #define USIZE 16 #include "vp4d.c" #define USIZE 32 #include "vp4d.c" #define USIZE 64 #include "vp4d.c" #define _P4DEC _p4zdec //zigzag0 #define P4DEC p4zdec #define P4NDEC p4nzdec #define P4NDECS p4zdec #define BITUNPACKD bitzunpack #define _BITUNPACKD _bitzunpack #define BITUNDD bitzdec #define USIZE 8 #include "vp4d.c" #define USIZE 16 #include "vp4d.c" #define USIZE 32 #include "vp4d.c" #define USIZE 64 #include "vp4d.c" #undef _P4DEC #undef P4DEC #undef BITUNPACK #undef BITUNDD #undef P4DELTA #define USIZE 32 // SIMD ------------- #ifndef NSIMD #ifdef __SSSE3__ #define VSIZE 128 #define P4DELTA(a) #define P4DELTA_(a) #undef DELTA #define _P4DEC _p4dec128v #define P4DEC p4dec128v #define P4NDEC p4ndec128v #define P4NDECS p4dec #define BITUNPACK bitunpack128v #define BITUNPACKD bitunpack128v #define _BITUNPACKD _bitunpack128v #include "vp4d.c" #define P4DELTA(a) ,a #define P4DELTA_(a) a #define DELTA #define _P4DEC _p4ddec128v #define P4DEC p4ddec128v #define P4NDEC p4nddec128v #define P4NDECS p4ddec #define BITUNPACKD bitdunpack128v #define _BITUNPACKD _bitdunpack128v #define BITUNDD bitddec #include "vp4d.c" #define _P4DEC _p4d1dec128v #define P4DEC p4d1dec128v #define P4NDEC p4nd1dec128v #define P4NDECS p4d1dec #define BITUNPACKD bitd1unpack128v #define _BITUNPACKD _bitd1unpack128v #define BITUNDD bitd1dec #include "vp4d.c" #define _P4DEC _p4zdec128v #define P4DEC p4zdec128v #define P4NDEC p4nzdec128v #define P4NDECS p4zdec #define BITUNPACKD bitzunpack128v #define _BITUNPACKD _bitzunpack128v #define BITUNDD bitzdec #include "vp4d.c" #undef BITUNDD #undef P4DELTA #undef DELTA #endif #ifdef __AVX2__ #define P4DELTA(a) #define P4DELTA_(a) #undef DELTA #define VSIZE 256 #define _P4DEC _p4dec256v #define P4DEC p4dec256v #define P4NDEC p4ndec256v #define P4NDECS p4dec #define BITUNPACK bitunpack256v #define BITUNPACKD bitunpack256v #define _BITUNPACKD _bitunpack256v #include "vp4d.c" #define P4DELTA(a) ,a #define P4DELTA_(a) a #define DELTA #define _P4DEC _p4ddec256v #define P4DEC p4ddec256v #define P4NDEC p4nddec256v #define P4NDECS p4ddec #define BITUNPACKD bitdunpack256v #define _BITUNPACKD _bitdunpack256v #define BITUNDD bitddec #include "vp4d.c" #define _P4DEC _p4d1dec256v #define P4DEC p4d1dec256v #define P4NDEC p4nd1dec256v #define P4NDECS p4d1dec #define BITUNPACKD bitd1unpack256v #define _BITUNPACKD _bitd1unpack256v #define BITUNDD bitd1dec #include "vp4d.c" #undef BITUNDD #endif #endif #undef USIZE #else #define uint_t TEMPLATE3(uint, USIZE, _t) #pragma GCC push_options #pragma GCC optimize ("align-functions=16") ALWAYS_INLINE unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start), unsigned b, unsigned bx ) { uint_t ex[P4D_MAX+32]; #if USIZE == 64 b = ((b>>1)==63)?(64<<1):b; #endif if(!(b & 1)) return TEMPLATE2(BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b>>1); b >>= 1; #if defined(VSIZE) && USIZE < 64 unsigned char *pb = in; #if VSIZE == 128 in = TEMPLATE2(bitunpack, USIZE)(in+16, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)), ex, bx); #else in = TEMPLATE2(bitunpack, USIZE)(in+32, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)) + popcnt64(ctou64(in+16)) + popcnt64(ctou64(in+24)), ex, bx); #endif return TEMPLATE2(_BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b, ex, pb); #else unsigned long long bb[P4D_MAX/64]; unsigned num=0,i,p4dn = (n+63)/64; for(i = 0; i < n/64; i++) { bb[i] = ctou64(in+i*8); num += popcnt64(bb[i]); } if(n & 0x3f) { bb[i] = ctou64(in+i*8) & ((1ull<<(n&0x3f))-1); num += popcnt64(bb[i]); } in = TEMPLATE2(bitunpack, USIZE)(in+PAD8(n), num, ex, bx); in = TEMPLATE2(bitunpack, USIZE)(in, n, out, b); #if 0 //defined(AVX_2__) uint_t *op,*pex = ex; for(i = 0; i < p4dn; i++) { for(op = out; bb[i]; bb[i] >>= 8,op += 8) { unsigned m = (unsigned char)bb[i], mc=popcnt32(m), s = pex[mc]; pex[mc]=0; _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), mm256_maskz_expand_epi32(m,_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b)))); pex += mc; *pex=s; } //out += 64; } #elif defined(__SSSE3__) && USIZE == 32 uint_t *_op=out,*op,*pex = ex; for(i = 0; i < p4dn; i++) { for(op=_op; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf; _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); } _op+=64; } #else unsigned k = 0; uint_t *op; for(op=out,i = 0; i < p4dn; i++,op += 64) while(bb[i]) { unsigned x = ctz64(bb[i]); op[x] += ex[k++]< 8 else { uint_t ex[P4D_MAX+32]; b = (b & 0x7f)>>1; bx = *in++; in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b); in = TEMPLATE2(vbdec, USIZE)(in, bx, ex); for(i = 0; i != (bx & ~7); i += 8) { out[in[i ]] |= ex[i ] << b; out[in[i+1]] |= ex[i+1] << b; out[in[i+2]] |= ex[i+2] << b; out[in[i+3]] |= ex[i+3] << b; out[in[i+4]] |= ex[i+4] << b; out[in[i+5]] |= ex[i+5] << b; out[in[i+6]] |= ex[i+6] << b; out[in[i+7]] |= ex[i+7] << b; } for(;i < bx; i++) out[in[i]] |= ex[i] << b; in += bx; #ifdef BITUNDD TEMPLATE2(BITUNDD, USIZE)(out, n, start); #endif return in; } #endif } #ifdef VSIZE #define CSIZE VSIZE #else #define CSIZE 128 #endif size_t TEMPLATE2(P4NDEC, USIZE)(unsigned char *__restrict in, size_t n, uint_t *__restrict out) { if(!n) return 0; unsigned char *ip = in; uint_t *op; #ifdef DELTA uint_t start; TEMPLATE2(vbxget, USIZE)(ip, start); *out++ = start; --n; #endif for(op = out; op != out+(n&~(CSIZE-1)); op += CSIZE) { __builtin_prefetch(ip+512); unsigned b = *ip++,bx,i; if(likely(!(b & 0x80))) { if(b & 1) bx = *ip++; ip = TEMPLATE2(_P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start), b, bx ); } #if USIZE > 8 else { uint_t ex[P4D_MAX+8]; b = (b & 0x7f)>>1; bx = *ip++; ip = TEMPLATE2(BITUNPACK, USIZE)(ip, CSIZE, op, b); ip = TEMPLATE2(vbdec, USIZE)(ip, bx, ex); for(i = 0; i != (bx & ~3); i += 4) { op[ip[i ]] |= ex[i ] << b; op[ip[i+1]] |= ex[i+1] << b; op[ip[i+2]] |= ex[i+2] << b; op[ip[i+3]] |= ex[i+3] << b; } for(;i < bx; i++) op[ip[i]] |= ex[i] << b; ip += bx; #ifdef BITUNDD TEMPLATE2(BITUNDD, USIZE)(op, CSIZE, start); #endif } // ip = TEMPLATE2(P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start)); #endif P4DELTA_(start = op[CSIZE-1]); } return TEMPLATE2(P4NDECS, USIZE)(ip, n&(CSIZE-1), op P4DELTA(start)) - in; } #ifdef P4DECX unsigned char *TEMPLATE2(p4decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out) { unsigned b,i; struct p4 p4; p4ini(&p4, &in, n, &b); if(unlikely(p4.isx)) { #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+k, b); for(i = 0; i != n&~3; i += 4) { ITX(0); ITX(1); ITX(2); ITX(3); } for( ; i != n; i++ ) ITX(0); } else { #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, i+k, b); for(i = 0; i != n&~3; i += 4) { ITY(0); ITY(1); ITY(2); ITY(3); } for( ; i != n; i++) ITY(0); } return in + PAD8(n*b); } unsigned char *TEMPLATE2(p4f1decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, uint_t start) { unsigned b,i; struct p4 p4; p4ini(&p4, &in, n, &b); if(unlikely(p4.isx)) { #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, (i+k), b)+start+i+k+1; for(i = 0; i != n&~3; i+=4) { ITX(0); ITX(1); ITX(2); ITX(3); } for( ; i != n; i++) ITX(0); } else { #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, (i+k), b)+start+i+k+1; for(i = 0; i != n&~3; i += 4) { ITY(0); ITY(1); ITY(2); ITY(3); } for( ; i != n; i++) ITY(0); } return in + PAD8(n*b); } unsigned char *TEMPLATE2(p4fdecx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, uint_t start) { unsigned b,i; struct p4 p4; p4ini(&p4, &in, n, &b); if(unlikely(p4.isx)) { #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+k, b)+start; for(i = 0; i != n&~3; i+=4) { ITX(0); ITX(1); ITX(2); ITX(3); } for( ; i != n; i++) ITX(0); } else { #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, (i+k), b)+start; for(i = 0; i != n&~3; i+=4) { ITY(0); ITY(1); ITY(2); ITY(3); } for( ; i != n; i++) ITY(0); } return in + PAD8(n*b); } #endif #endif