diff --git a/TurboPFor-Integer-Compression/lib/vp4c.c b/TurboPFor-Integer-Compression/lib/vp4c.c new file mode 100644 index 0000000..8e0049a --- /dev/null +++ b/TurboPFor-Integer-Compression/lib/vp4c.c @@ -0,0 +1,441 @@ +/** + Copyright (C) powturbo 2013-2023 + SPDX-License-Identifier: GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// "TurboPFor: Integer Compression" Turbo PFor/PforDelta + +#ifndef USIZE //--------------------------------- Functions ---------------------------------------------------------------------- +#include "include_/conf.h" +#include "include_/bitpack.h" +#include "include_/bitutil.h" +#include "include_/vint.h" +#include "include_/vlcbyte.h" +#include "include_/vp4.h" + +#include "include_/bitutil_.h" + +#pragma warning( disable : 4005) +#pragma warning( disable : 4090) +#pragma warning( disable : 4068) + +#undef P4DELTA +#define PAD8(_x_) ( (((_x_)+8-1)/8) ) + +#define HYBRID 1 // Hybrid TurboPFor : 0=fixed bit packing, 1=fixed BP+Variable byte + + #ifndef __AVX2__ +#define VP4BOUND(_n_, _esize_, _csize_) ((_esize_*_n_) + ((_n_+_csize_-1)/_csize_)) +size_t p4nbound8( size_t n) { return VP4BOUND(n, 1, 128); } +size_t p4nbound16( size_t n) { return VP4BOUND(n, 2, 128); } +size_t p4nbound32( size_t n) { return VP4BOUND(n, 4, 128); } +size_t p4nbound64( size_t n) { return VP4BOUND(n, 8, 128); } + +size_t p4nbound128v8( size_t n) { return VP4BOUND(n, 1, 128); } +size_t p4nbound128v16(size_t n) { return VP4BOUND(n, 2, 128); } +size_t p4nbound128v32(size_t n) { return VP4BOUND(n, 4, 128); } +size_t p4nbound128v64(size_t n) { return VP4BOUND(n, 8, 128); } + +size_t p4nbound256v8( size_t n) { return VP4BOUND(n, 1, 256); } +size_t p4nbound256v16(size_t n) { return VP4BOUND(n, 2, 256); } +size_t p4nbound256v32(size_t n) { return VP4BOUND(n, 4, 256); } +size_t p4nbound256v64(size_t n) { return VP4BOUND(n, 8, 128); } + +#define _P4BITS _p4bits +#define P4BITS _p4bits +#define _P4ENC _p4enc +#define P4ENC p4enc +#define P4NENC p4nenc +#define BITPACK bitpack +#define BITDELTA bitdienc +#define USIZE 8 +#include "vp4c.c" +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" +#define USIZE 64 +#include "vp4c.c" + +#define P4DELTA 0 // p4d functions +#define P4DENC p4denc +#define P4NENC p4ndenc +#define P4NENCS p4denc +#define USIZE 8 +#include "vp4c.c" +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" +#define USIZE 64 +#include "vp4c.c" + +#define P4DELTA 1 // p4d1 functions +#define P4DENC p4d1enc +#define P4NENC p4nd1enc +#define P4NENCS p4d1enc +#define USIZE 8 +#include "vp4c.c" +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" +#define USIZE 64 +#include 
"vp4c.c" + +#define BITDELTA bitzenc // // p4z functions +#define P4DENC p4zenc +#define P4NENC p4nzenc +#define P4NENCS p4zenc +#define USIZE 8 +#include "vp4c.c" +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" +#define USIZE 64 +#include "vp4c.c" + +#undef P4DELTA +#define BITDELTA bitdienc + +#define HYBRID 0 // Direct access +#define P4BITS _p4bitsx +#define _P4BITS _p4bitsx +#define _P4ENC _p4encx +#define P4ENC p4encx +#define P4NENC p4nencx +#define USIZE 8 +#include "vp4c.c" +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" +#define USIZE 64 +#include "vp4c.c" + + +#undef _P4ENC +#undef P4ENC +#undef BITPACK + +#define P4NDENC(in, n, out, _csize_, _usize_, _p4c_) { if(!n) return 0;\ + unsigned char *op = out; \ + start = *in++;\ + T2(vbxput, _usize_)(op, start);\ + for(n--,ip = in; ip != in + (n&~(_csize_-1)); ) { PREFETCH(ip+512,0);\ + op = T2(_p4c_, _usize_)(ip, _csize_, op, start); ip += _csize_; start = ip[-1];\ + } if(n&=(_csize_-1)) { op = T2(_p4c_, _usize_)(ip, n, op, start); }\ + return op - out;\ +} + +#define P4NDDEC(in, n, out, _csize_, _usize_, _p4d_) { if(!n) return 0;\ + unsigned char *ip = in;\ + T2(vbxget, _usize_)(ip, start);\ + for(*out++ = start,--n,op = out; op != out+(n&~(_csize_-1)); ) { PREFETCH(ip+512,0);\ + ip = T2(_p4d_, _usize_)(ip, _csize_, op, start); op += _csize_; start = op[-1];\ + } if(n&=(_csize_-1)) { ip = T2(_p4d_, _usize_)(ip, n, op, start); }\ + return ip - in;\ +} + +/*unsigned char *p4senc16(uint16_t *in, unsigned n, unsigned char *out, uint16_t x) { uint16_t pa[128+32],eq, mdelta = bitdi16(in, n, 0, x); vbput16(out, mdelta); bitdienc16(in, n, pa, x, mdelta); return p4enc16(pa, n, out);} +unsigned char *p4senc32(uint32_t *in, unsigned n, unsigned char *out, uint32_t x) { uint32_t pa[128+32],eq, mdelta = bitdi32(in, n, 0, x); vbput32(out, mdelta); bitdienc32(in, n, pa, x, mdelta); return p4enc32(pa, n, out);} +unsigned char *p4senc64(uint64_t *in, unsigned n, unsigned char *out, uint64_t x) { uint64_t pa[128+64],eq, mdelta = bitdi64(in, n, 0, x); vbput64(out, mdelta); bitdienc64(in, n, pa, x, mdelta); return p4enc64(pa, n, out);} + +unsigned char *p4sdec16(unsigned char *in, unsigned n, uint16_t *out, uint16_t x) { uint16_t mdelta; vbget16(in, mdelta); in = p4dec16(in, n, out); bitdidec16(out, n, x, mdelta); return in; } +unsigned char *p4sdec32(unsigned char *in, unsigned n, uint32_t *out, uint32_t x) { uint32_t mdelta; vbget32(in, mdelta); in = p4dec32(in, n, out); bitdidec32(out, n, x, mdelta); return in; } +unsigned char *p4sdec64(unsigned char *in, unsigned n, uint64_t *out, uint64_t x) { uint64_t mdelta; vbget64(in, mdelta); in = p4dec64(in, n, out); bitdidec64(out, n, x, mdelta); return in; } + +size_t p4nsenc16(uint16_t *in, size_t n, unsigned char *out) { uint16_t *ip,start; P4NDENC(in, n, out, 128, 16, p4senc); } +size_t p4nsenc32(uint32_t *in, size_t n, unsigned char *out) { uint32_t *ip,start; P4NDENC(in, n, out, 128, 32, p4senc); } +size_t p4nsenc64(uint64_t *in, size_t n, unsigned char *out) { uint64_t *ip,start; P4NDENC(in, n, out, 128, 64, p4senc); } + +size_t p4nsdec16(unsigned char *in, size_t n, uint16_t *out) { uint16_t *op,start; P4NDDEC(in, n, out, 128, 16, p4sdec); } +size_t p4nsdec32(unsigned char *in, size_t n, uint32_t *out) { uint32_t *op,start; P4NDDEC(in, n, out, 128, 32, p4sdec); } +size_t p4nsdec64(unsigned char *in, size_t n, uint64_t *out) { uint64_t *op,start; P4NDDEC(in, n, out, 128, 64, p4sdec); }*/ +#undef _P4BITS + #endif + #ifdef __AVX2__ +#define BITDELTA 
bitdienc +#define HYBRID 1 +#define P4BITS _p4bits +#define VSIZE 256 + +#define _P4ENC _p4enc256v +#define P4ENC p4enc256v +#define P4NENC p4nenc256v +#define P4NENCS p4enc +#define BITPACK bitpack256v +#define USIZE 32 +#include "vp4c.c" + +#define P4DELTA 0 +#define P4DENC p4denc256v +#define P4NENC p4ndenc256v +#define P4NENCS p4denc +#include "vp4c.c" + +#define P4DELTA 1 +#define P4DENC p4d1enc256v +#define P4NENC p4nd1enc256v +#define P4NENCS p4d1enc +#include "vp4c.c" +#undef P4DELTA + +#define P4DELTA 0 +#define BITDELTA bitzenc +#define P4DENC p4zenc256v +#define P4NENC p4nzenc256v +#define P4NENCS p4zenc +#include "vp4c.c" + +#undef _P4ENC +#undef P4ENC +#undef BITPACK + #elif defined(__SSE3__) || defined(__ARM_NEON) //-------------------------------------------------- +#define BITDELTA bitdienc +#define HYBRID 1 +#define P4BITS _p4bits +#define USIZE 32 + +#define VSIZE 128 +#define _P4ENC _p4enc128v +#define P4ENC p4enc128v +#define P4NENCS p4enc +#define P4NENC p4nenc128v +#define BITPACK bitpack128v +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" +#define USIZE 64 +#include "vp4c.c" + +#define P4DELTA 0 +#define P4DENC p4denc128v +#define P4NENC p4ndenc128v +#define P4NENCS p4denc +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" + +#define P4DELTA 1 +#define P4DENC p4d1enc128v +#define P4NENC p4nd1enc128v +#define P4NENCS p4d1enc +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" + +#define P4DELTA 0 +#define BITDELTA bitzenc +#define P4DENC p4zenc128v +#define P4NENC p4nzenc128v +#define P4NENCS p4zenc +#define USIZE 16 +#include "vp4c.c" +#define USIZE 32 +#include "vp4c.c" + +/*#define BITDELTA bitdienc +#define VSIZE 256 +#define _P4ENC _p4enc256w +#define P4ENC p4enc256w +#define P4NENCS p4encw +#define P4NENC p4nenc256w +#define BITPACK bitpack256w +#include "vp4c.c"*/ + #endif + +#undef P4DELTA +#undef _P4ENC +#undef P4ENC +#undef BITPACK + +#else //------------------------------------------ Templates --------------------------------------------------------------- +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wparentheses" + +#pragma GCC push_options +#pragma GCC optimize ("align-functions=16") + +#define uint_t T3(uint, USIZE, _t) + + #ifdef VSIZE +#define CSIZE VSIZE + #else +#define CSIZE 128 + #endif + + #ifndef P4DELTA + + #ifdef _P4BITS +unsigned T2(_P4BITS, USIZE)(uint_t *__restrict in, unsigned n, unsigned *pbx) { + #if HYBRID > 0 && USIZE >= 16 + unsigned _vb[USIZE*2+64+16] = {0}, *vb=&_vb[USIZE+16]; + #endif + unsigned cnt[USIZE+8] = {0}, x, bx, bmp8=(n+7)/8; + uint_t *ip, u=0, a = in[0]; + int b,i,ml,l,fx=0,vv,eq=0; + + #define CNTE(i) { ++cnt[T2(bsr, USIZE)(ip[i])], u |= ip[i]; eq += (ip[i] == a); } + for(ip = in; ip != in+(n&~3); ip+=4) { CNTE(0); CNTE(1); CNTE(2); CNTE(3); } + for(;ip != in+n;ip++) CNTE(0); + + b = T2(bsr, USIZE)(u); + #if HYBRID > 0 + if(eq == n && a) { *pbx = USIZE+2; + #if USIZE == 64 + if(b == USIZE-1) b = USIZE; + #endif + return b; + } + #endif + bx = b; + ml = PAD8(n*b)+1; x = cnt[b]; + + #if HYBRID > 0 && USIZE >= 16 + #define VBB(_x_,_b_) vb[_b_-7]+=_x_; vb[_b_-15]+=_x_*2; vb[_b_-19]+=_x_*3; vb[_b_-25]+=_x_*4; + vv = x; VBB(x,b); + #else + ml -= 2+bmp8; + #endif + for(i = b-1; i >= 0; --i) { + int fi,v; + #if HYBRID > 0 && USIZE >= 16 + v = PAD8(n*i) + 2 + x + vv; + l = PAD8(n*i) + 2+bmp8 + PAD8(x*(bx-i)); + x += cnt[i]; + vv += cnt[i]+vb[i]; + VBB(cnt[i],i); + fi = l < ml; ml = fi?l:ml; b = fi?i:b; fx=fi?0:fx; + fi = v < ml; ml = 
fi?v:ml; b = fi?i:b; fx=fi?1:fx; + #else + l = PAD8(n*i) + PAD8(x*(bx-i)); + x += cnt[i]; + fi = l < ml; + ml = fi?l:ml; b = fi?i:b; + #endif + } //fx = 0; + #if HYBRID > 0 && USIZE >= 16 + *pbx = fx?(USIZE+1):(bx - b); + #if USIZE == 64 + if(b == USIZE-1) { b = USIZE; *pbx = 0; } + #endif + #else + *pbx = bx - b; + #endif + return b; +} + #endif + + +unsigned char *T2(_P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx) { + uint_t msk = (1ull << b)-1, _in[P4D_MAX+32], inx[P4D_MAX+32]={0},a,ax; + unsigned long long xmap[P4D_MAX/64] = {0}; + unsigned miss[P4D_MAX],i, xn, c; //, eq=0,eqx=0; + unsigned char *_out = out; + + if(!bx) + return T2(BITPACK, USIZE)(in, n, out, b); + #if HYBRID > 0 + if(bx == USIZE+2) { + T2(ctou, USIZE)(out) = in[0]; + return out+((b+7)/8); + } + #endif + #define MISS { miss[xn] = i; xn += in[i] > msk; _in[i] = in[i] & msk; i++; } //eq+= (_in[i] == a); } a = in[0] & msk; + for(xn = i = 0; i != n&~3; ) { MISS; MISS; MISS; MISS; } + while(i != n) MISS; + //ax = inx[miss[0]] >> b; + for(i = 0; i != xn; ++i) { + c = miss[i]; + xmap[c>>6] |= (1ull << (c&0x3f)); + inx[i] = in[c] >> b; // eqx += inx[i] == a; + } + + #if HYBRID > 0 && USIZE >= 16 + if(bx <= USIZE) { + #endif + for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i]; out += PAD8(n); //if(eqx == xn && bx) { out[-1] |=0x80; T2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else + out = T2(bitpack, USIZE)(inx, xn, out, bx); //if(eq == n && b) { out[-1]|= 0x80; T2(ctou, USIZE)(out)=a; out += (b+7)/8; } else + out = T2(BITPACK, USIZE)(_in, n, out, b); + #if HYBRID > 0 && USIZE >= 16 + } + else { + *out++ = xn; //if(b && eq == n) { *out++ = 0x80; T2(ctou, USIZE)(out) = _in[0]; out += (b+7)/8; } else { *out++ = 0; + out = T2(BITPACK, USIZE)(_in, n, out, b); + + out = T2(vbenc, USIZE)(inx, xn, out); + for(i = 0; i != xn; ++i) *out++ = miss[i]; + } + #endif + return out; +} + +unsigned char *T2(P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out) { unsigned bx, b; + if(!n) return out; + b = T2(P4BITS, USIZE)(in, n, &bx); + T2(P4HVE, USIZE)(out,b,bx); + out = T2(_P4ENC, USIZE)(in, n, out, b, bx); + return out; +} + +size_t T2(P4NENC, USIZE)(uint_t *__restrict in, size_t n, unsigned char *__restrict out) { if(!n) return 0; + unsigned char *op = out; + uint_t *ip; + + for(ip = in; ip != in+(n&~(CSIZE-1)); ip += CSIZE) { unsigned bx, b; PREFETCH(ip+512,0); + b = T2(P4BITS, USIZE)(ip, CSIZE, &bx); + T2(P4HVE, USIZE)(op,b,bx); + op = T2(_P4ENC, USIZE)(ip, CSIZE, op, b, bx); + } + return T2(p4enc, USIZE)(ip, n&(CSIZE-1), op) - out; +} + #else +unsigned char *T2(P4DENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, uint_t start) { + uint_t _in[P4D_MAX+8]; + if(!n) return out; + T2(BITDELTA, USIZE)(in, n, _in, start, P4DELTA); + return T2(P4ENC, USIZE)(_in, n, out); +} + +size_t T2(P4NENC, USIZE)(uint_t *__restrict in, size_t n, unsigned char *__restrict out) { + unsigned char *op = out; + uint_t *ip, start = *in++; + if(!n) + return 0; + + T2(vbxput, USIZE)(op, start); + for(ip = in, --n; ip != in+(n&~(CSIZE-1)); ip += CSIZE) { uint_t _in[P4D_MAX+8];unsigned bx, b; PREFETCH(ip+512,0); + T2(BITDELTA, USIZE)(ip, CSIZE, _in, start, P4DELTA); + b = T2(_p4bits, USIZE)(_in, CSIZE, &bx); + T2(P4HVE, USIZE)(op,b,bx); + op = T2(_P4ENC, USIZE)(_in, CSIZE, op, b, bx); // op = T2(P4ENC, USIZE)(_in, CSIZE, op); + start = ip[CSIZE-1]; + } + return T2(P4NENCS, USIZE)(ip, n&(CSIZE-1), op, start) - out; +} + #endif +#pragma clang diagnostic 
pop
+#pragma GCC pop_options
+ #endif
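
The block encoders added by this patch implement the patched scheme: _p4bits builds a histogram of bit lengths for a block, then picks the width b that minimizes the bit-packed size of the low b bits plus the cost of the exceptions (values needing more than b bits), which are stored either as an exception bitmap with bit-packed high parts or, in hybrid mode, as variable-byte values with an explicit position list; _p4enc then emits the chosen layout. A simplified, byte-aligned sketch of the low-bits/exception split (illustration only, not the actual TurboPFor bitstream; toy_pfor_split is a hypothetical helper, not part of the library):

/* Toy, byte-aligned illustration of the low-bits/exception split done by _p4enc32;
   NOT the real TurboPFor bitstream layout. */
#include <stdint.h>
#include <stdio.h>

static void toy_pfor_split(const uint32_t *in, unsigned n, unsigned b,
                           uint32_t *low, uint32_t *high, unsigned *pos,
                           unsigned *nexc) {
  uint32_t msk = (b >= 32) ? 0xffffffffu : ((1u << b) - 1u);
  unsigned x = 0;
  for (unsigned i = 0; i < n; i++) {
    low[i] = in[i] & msk;          /* every value keeps its low b bits (bit-packed in the real code) */
    if (in[i] > msk) {             /* exception: the remaining high bits are stored separately */
      pos[x]  = i;
      high[x] = in[i] >> b;
      x++;
    }
  }
  *nexc = x;
}

int main(void) {
  uint32_t in[8] = { 3, 7, 1, 200000, 5, 2, 65000, 4 };
  uint32_t low[8], high[8];
  unsigned pos[8], nexc, i;
  toy_pfor_split(in, 8, 3, low, high, pos, &nexc);   /* b=3 is enough for the small values */
  printf("%u exceptions at b=3\n", nexc);
  for (i = 0; i < nexc; i++)
    printf("  position %u: low=%u high=%u (original %u)\n",
           pos[i], low[pos[i]], high[i], in[pos[i]]);
  return 0;
}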
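
The bulk entry points generated here (p4nenc8/16/32/64 and the 128v/256v SIMD variants) process the input in CSIZE-element blocks and are paired with the decoders in lib/vp4d.c. A minimal round-trip sketch, assuming the library's vp4.h header and the p4nenc32/p4ndec32/p4nbound32 prototypes it declares (the include path may differ by install):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vp4.h"            /* p4nenc32 / p4ndec32 / p4nbound32; in-tree the header is include_/vp4.h */

int main(void) {
  size_t n = 1000, i;
  uint32_t *in  = malloc(n * sizeof(uint32_t));
  uint32_t *out = malloc(n * sizeof(uint32_t));
  unsigned char *buf = malloc(p4nbound32(n));   /* worst-case compressed size for n values */

  for (i = 0; i < n; i++) in[i] = (uint32_t)(1000 + i * 7 + (i & 31));  /* small, clustered values */

  size_t csize = p4nenc32(in, n, buf);   /* encode in 128-integer blocks, returns compressed size */
  size_t rsize = p4ndec32(buf, n, out);  /* decode, returns compressed bytes consumed */

  printf("%zu -> %zu bytes, round-trip %s\n", n * sizeof(uint32_t), csize,
         memcmp(in, out, n * sizeof(uint32_t)) == 0 ? "ok" : "FAILED");
  free(in); free(out); free(buf);
  return rsize == csize ? 0 : 1;
}

Note that the decoder must be called with the same element count n that was passed to the encoder: the per-block headers carry the bit widths, not the number of values.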