TurboPFor: Floating point

This commit is contained in:
x
2023-03-10 20:28:32 +01:00
parent 396869edf6
commit 267a013890

533
fp.c
View File

@ -1,5 +1,5 @@
/**
Copyright (C) powturbo 2013-2019
Copyright (C) powturbo 2013-2023
GPL v2 License
This program is free software; you can redistribute it and/or modify
@ -21,17 +21,39 @@
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Floating Point + Integer Compression (All integer compression functions can be used for float/double and vice versa)"
// "Floating Point + Integer Compression (All integer compression functions can be used for float/double and vice versa)"
#ifndef USIZE
#include <string.h>
#include "include_/conf.h"
#include "include_/bitutil.h"
#include "include_/fp.h"
#include "include_/vp4.h"
#include "include_/bitutil_.h"
#pragma warning( disable : 4005)
#pragma warning( disable : 4090)
#pragma warning( disable : 4068)
#define BITUTIL_IN
#include "conf.h"
#include "vp4.h"
#include "bitutil.h"
#include "fp.h"
#define CCODEL 3
#define lz2l(_x_) cquant[_x_] // quantize leading zeros
#define l2c( _x_) ccode[_x_] // encode
#define c2l( _x_) dcode[_x_] // decode
static unsigned char ccode[] = { // optimized for 32/64 bits
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3,
3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
};
static unsigned char cquant[] = {
0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14,
14, 14, 18, 18, 20, 20, 22, 22, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24
};
static unsigned char dcode[] = {0, 6,10, 14, 18, 20, 22, 24};
//---------------------- template generation --------------------------------------------
#define VSIZE 128
@ -50,6 +72,12 @@
#define N3 5
#define USIZE 8
#include "fp.c"
#undef P4ENCV
#undef P4DECV
#undef N_0
#undef N_1
#undef N2
#undef N3
#define P4ENCV p4enc128v
#define P4DECV p4dec128v
@ -61,6 +89,10 @@
#define N3 12
#define USIZE 16
#include "fp.c"
#undef N_0
#undef N_1
#undef N2
#undef N3
#define N_0 4
#define N_1 6
@ -69,6 +101,10 @@
#define N3 10
#define USIZE 32
#include "fp.c"
#undef N4
#undef N1
#undef N3
#undef N_1
#define N_1 7
#define N2 6 // for seconds/milliseconds,... time series
@ -81,95 +117,98 @@
#define XORENC( _u_, _pu_, _usize_) ((_u_)^(_pu_)) // xor predictor
#define XORDEC( _u_, _pu_, _usize_) ((_u_)^(_pu_))
#define ZZAGENC(_u_, _pu_, _usize_) TEMPLATE2(zigzagenc,_usize_)((_u_)-(_pu_)) //zigzag predictor
#define ZZAGDEC(_u_, _pu_, _usize_) (TEMPLATE2(zigzagdec,_usize_)(_u_)+(_pu_))
#define ZZAGENC(_u_, _pu_, _usize_) T2(zigzagenc,_usize_)((_u_)-(_pu_)) //zigzag predictor
#define ZZAGDEC(_u_, _pu_, _usize_) (T2(zigzagdec,_usize_)(_u_)+(_pu_))
#define uint_t TEMPLATE3(uint, USIZE, _t)
#define int_t TEMPLATE3(int, USIZE, _t)
#define uint_t T3(uint, USIZE, _t)
#define int_t T3(int, USIZE, _t)
//-------- TurboPFor Zigzag of zigzag for unsorted/sorted integer/floating point array ---------------------------------------
size_t TEMPLATE2(p4nzzenc128v,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
size_t T2(p4nzzenc128v,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t _p[VSIZE+32], *ip, *p, pd = 0;
unsigned char *op = out;
#define FE(i,_usize_) { TEMPLATE3(uint, USIZE, _t) u = ip[i]; start = u-start; p[i] = ZZAGENC(start,pd,_usize_); pd = start; start = u; }
#define FE(_i_,_usize_) { T3(uint, USIZE, _t) u = ip[_i_]; start = u-start; p[_i_] = ZZAGENC(start,pd,_usize_); pd = start; start = u; }
for(ip = in; ip != in + (n&~(VSIZE-1)); ) {
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
op = TEMPLATE2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
}
if(n = (in+n)-ip) {
for(p = _p; p != &_p[n]; p++,ip++) FE(0,USIZE);
op = TEMPLATE2(P4ENC,USIZE)(_p, n, op);
op = T2(P4ENC,USIZE)(_p, n, op);
}
return op - out;
#undef FE
}
size_t TEMPLATE2(p4nzzdec128v,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
size_t T2(p4nzzdec128v,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t _p[VSIZE+32],*p, *op, pd=0;
unsigned char *ip = in;
#define FD(i,_usize_) { TEMPLATE3(uint, USIZE, _t) u = ZZAGDEC(p[i],start+pd,_usize_); op[i] = u; pd = u - start; start = u; }
#define FD(_i_,_usize_) { T3(uint, USIZE, _t) u = ZZAGDEC(p[_i_],start+pd,_usize_); op[_i_] = u; pd = u - start; start = u; }
for(op = out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
for(ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p), p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
for(ip = T2(P4DECV,USIZE)(ip, VSIZE, _p), p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
}
if(n = (out+n) - op)
for(ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p), p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
for(ip = T2(P4DEC,USIZE)(ip, n, _p), p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
return ip - in;
#undef FD
}
/*---------------- TurboFloat XOR: last value Predictor with TurboPFor ---------------------------------------------------------
Compress significantly (115% - 160%) better than Facebook's Gorilla algorithm for values
Compress significantly (115% - 160%) better and faster than facebook's Gorilla algorithm for values
BEST results are obtained with LOSSY COMPRESSION (using fppad32/fppad64 in bitutil.c)
Algorithm:
1: XOR value with previous value. We have now leading (for common sign/exponent bits) + mantissa trailing zero bits
2: Eliminate the common block leading zeros of sign/exponent by shifting all values in the block to left
3: reverse values to bring the mantissa trailing zero bits to left for better compression with TurboPFor
*/
size_t TEMPLATE2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t _p[VSIZE+32], *ip, *p;
unsigned char *op = out;
#if defined(__AVX2__) && USIZE >= 32
#define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
__m256i sv = TEMPLATE2(_mm256_set1_epi, USIZE)(start);
__m256i sv = T2(_mm256_set1_epi, USIZE)(start);
#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
#define _mm_set1_epi64(a) _mm_set1_epi64x(a)
__m128i sv = TEMPLATE2(_mm_set1_epi, USIZE)(start);
__m128i sv = T2(_mm_set1_epi, USIZE)(start);
#endif
#define FE(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = ip[i]; p[i] = XORENC(u, start,_usize_); b |= p[i]; start = u; }
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, start,_usize_); b |= p[_i_]; start = u; }
for(ip = in; ip != in + (n&~(VSIZE-1)); ) { uint_t b = 0;
#if defined(__AVX2__) && USIZE >= 32
__m256i bv = _mm256_setzero_si256();
for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8),ip+=64/(USIZE/8)) {
__m256i v0 = _mm256_loadu_si256((__m256i *) ip);
__m256i v1 = _mm256_loadu_si256((__m256i *)(ip+32/(USIZE/8)));
sv = TEMPLATE2(mm256_xore_epi, USIZE)(v0,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *) p, sv); sv = v0;
sv = TEMPLATE2(mm256_xore_epi, USIZE)(v1,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), sv); sv = v1;
sv = T2(mm256_xore_epi, USIZE)(v0,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *) p, sv); sv = v0;
sv = T2(mm256_xore_epi, USIZE)(v1,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), sv); sv = v1;
}
start = (uint_t)TEMPLATE2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
b = TEMPLATE2(mm256_hor_epi, USIZE)(bv);
start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
b = T2(mm256_hor_epi, USIZE)(bv);
#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
__m128i bv = _mm_setzero_si128();
for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),ip+=32/(USIZE/8)) {
__m128i v0 = _mm_loadu_si128((__m128i *) ip);
__m128i v1 = _mm_loadu_si128((__m128i *)(ip+16/(USIZE/8)));
sv = TEMPLATE2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p, sv); sv = v0;
sv = TEMPLATE2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1;
sv = T2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p, sv); sv = v0;
sv = T2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1;
}
start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
b = TEMPLATE2(mm_hor_epi, USIZE)(bv);
start = (uint_t)T2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
b = T2(mm_hor_epi, USIZE)(bv);
#else
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
#endif
*op++ = b = TEMPLATE2(clz,USIZE)(b);
#define TR(i,_usize_) p[i] = TEMPLATE2(rbit,_usize_)(p[i]<<b)
*op++ = b = T2(clz,USIZE)(b);
#define TR(_i_,_usize_) p[_i_] = T2(rbit,_usize_)(p[_i_]<<b)
#if defined(__AVX2__) && USIZE >= 32
for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8)) {
__m256i v0 = _mm256_loadu_si256((__m256i *)p);
__m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8)));
v0 = TEMPLATE2(_mm256_slli_epi, USIZE)(v0,b);
v1 = TEMPLATE2(_mm256_slli_epi, USIZE)(v1,b);
v0 = TEMPLATE2( mm256_rbit_epi, USIZE)(v0);
v1 = TEMPLATE2( mm256_rbit_epi, USIZE)(v1);
v0 = T2(_mm256_slli_epi, USIZE)(v0,b);
v1 = T2(_mm256_slli_epi, USIZE)(v1,b);
v0 = T2( mm256_rbit_epi, USIZE)(v0);
v1 = T2( mm256_rbit_epi, USIZE)(v1);
_mm256_storeu_si256((__m256i *) p, v0);
_mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1);
}
@ -177,79 +216,81 @@ size_t TEMPLATE2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) {
__m128i v0 = _mm_loadu_si128((__m128i *) p);
__m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
v0 = TEMPLATE2(_mm_slli_epi, USIZE)(v0,b);
v0 = TEMPLATE2( mm_rbit_epi, USIZE)(v0);
v1 = TEMPLATE2(_mm_slli_epi, USIZE)(v1,b);
v1 = TEMPLATE2( mm_rbit_epi, USIZE)(v1);
v0 = T2(_mm_slli_epi, USIZE)(v0,b);
v0 = T2( mm_rbit_epi, USIZE)(v0);
v1 = T2(_mm_slli_epi, USIZE)(v1,b);
v1 = T2( mm_rbit_epi, USIZE)(v1);
_mm_storeu_si128((__m128i *) p, v0);
_mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), v1);
}
#else
for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); }
#endif
op = TEMPLATE2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
}
if(n = (in+n)-ip) { uint_t b = 0;
for(p = _p; p != &_p[n]; p++,ip++) FE(0,USIZE);
b = TEMPLATE2(clz,USIZE)(b);
b = T2(clz,USIZE)(b);
*op++ = b;
for(p = _p; p != &_p[n]; p++) TR(0,USIZE);
op = TEMPLATE2(P4ENC,USIZE)(_p, n, op);
op = T2(P4ENC,USIZE)(_p, n, op);
}
return op - out;
#undef FE
}
size_t TEMPLATE2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t *op, _p[VSIZE+32],*p;
unsigned char *ip = in;
#if defined(__AVX2__) && USIZE >= 32
#define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
__m256i sv = TEMPLATE2(_mm256_set1_epi, USIZE)(start);
__m256i sv = T2(_mm256_set1_epi, USIZE)(start);
#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
#define _mm_set1_epi64(a) _mm_set1_epi64x(a)
__m128i sv = TEMPLATE2(_mm_set1_epi, USIZE)(start);
__m128i sv = T2(_mm_set1_epi, USIZE)(start);
#endif
#define FD(i,_usize_) { TEMPLATE3(uint, USIZE, _t) u = p[i]; u = TEMPLATE2(rbit,_usize_)(u)>>b; u = XORDEC(u, start,_usize_); op[i] = start = u; }
for(op = out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
unsigned b = *ip++; ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p);
#define FD(_i_,_usize_) { T3(uint, USIZE, _t) u = p[_i_]; u = T2(rbit,_usize_)(u)>>b; u = XORDEC(u, start,_usize_); op[_i_] = start = u; }
for(op = out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+384,0);
unsigned b = *ip++; ip = T2(P4DECV,USIZE)(ip, VSIZE, _p);
#if defined(__AVX2__) && USIZE >= 32
for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8),op+=64/(USIZE/8)) {
__m256i v0 = _mm256_loadu_si256((__m256i *)p);
__m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8)));
v0 = TEMPLATE2( mm256_rbit_epi, USIZE)(v0);
v1 = TEMPLATE2( mm256_rbit_epi, USIZE)(v1);
v0 = TEMPLATE2(_mm256_srli_epi, USIZE)(v0,b);
v1 = TEMPLATE2(_mm256_srli_epi, USIZE)(v1,b);
v0 = TEMPLATE2( mm256_xord_epi, USIZE)(v0,sv);
sv = TEMPLATE2( mm256_xord_epi, USIZE)(v1,v0);
v0 = T2( mm256_rbit_epi, USIZE)(v0);
v1 = T2( mm256_rbit_epi, USIZE)(v1);
v0 = T2(_mm256_srli_epi, USIZE)(v0,b);
v1 = T2(_mm256_srli_epi, USIZE)(v1,b);
v0 = T2( mm256_xord_epi, USIZE)(v0,sv);
sv = T2( mm256_xord_epi, USIZE)(v1,v0);
_mm256_storeu_si256((__m256i *)op, v0);
_mm256_storeu_si256((__m256i *)(op+32/(USIZE/8)), sv);
}
start = (uint_t)TEMPLATE2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),op+=32/(USIZE/8)) {
__m128i v0 = _mm_loadu_si128((__m128i *)p);
__m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
v0 = TEMPLATE2( mm_rbit_epi, USIZE)(v0);
v0 = TEMPLATE2(_mm_srli_epi, USIZE)(v0,b);
v0 = TEMPLATE2( mm_xord_epi, USIZE)(v0,sv);
v1 = TEMPLATE2( mm_rbit_epi, USIZE)(v1);
v1 = TEMPLATE2(_mm_srli_epi, USIZE)(v1,b);
sv = TEMPLATE2( mm_xord_epi, USIZE)(v1,v0);
v0 = T2( mm_rbit_epi, USIZE)(v0);
v0 = T2(_mm_srli_epi, USIZE)(v0,b);
v0 = T2( mm_xord_epi, USIZE)(v0,sv);
v1 = T2( mm_rbit_epi, USIZE)(v1);
v1 = T2(_mm_srli_epi, USIZE)(v1,b);
sv = T2( mm_xord_epi, USIZE)(v1,v0);
_mm_storeu_si128((__m128i *) op, v0);
_mm_storeu_si128((__m128i *)(op+16/(USIZE/8)), sv);
}
start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
start = (uint_t)T2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
#else
for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
#endif
}
if(n = (out+n) - op) {
uint_t b = *ip++;
for(ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p), p = _p; p < &_p[n]; p++,op++) FD(0,USIZE);
for(ip = T2(P4DEC,USIZE)(ip, n, _p), p = _p; p < &_p[n]; p++,op++) FD(0,USIZE);
}
return ip - in;
#undef FD
}
//-------- TurboFloat FCM: Finite Context Method Predictor ---------------------------------------------------------------
@ -259,30 +300,30 @@ size_t TEMPLATE2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
#define HASH16(_h_,_u_) (((_h_)<<3 ^ (_u_)>>12) & ((1u<<HBITS)-1))
#define HASH8( _h_,_u_) (((_h_)<<2 ^ (_u_)>> 5) & ((1u<<HBITS)-1))
size_t TEMPLATE2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
size_t T2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t htab[1<<HBITS] = {0}, _p[VSIZE+32], *ip, h = 0, *p;
unsigned char *op = out;
#if defined(__AVX2__) && USIZE >= 32
#define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
__m256i sv = TEMPLATE2(_mm256_set1_epi, USIZE)(start);
__m256i sv = T2(_mm256_set1_epi, USIZE)(start);
#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
#define _mm_set1_epi64(a) _mm_set1_epi64x(a)
__m128i sv = TEMPLATE2(_mm_set1_epi, USIZE)(start);
__m128i sv = T2(_mm_set1_epi, USIZE)(start);
#endif
for(ip = in; ip != in + (n&~(VSIZE-1)); ) { uint_t b = 0;
#define FE(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = ip[i]; p[i] = XORENC(u, htab[h],_usize_); b |= p[i]; htab[h] = u; h = TEMPLATE2(HASH,_usize_)(h,u); }
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, htab[h],_usize_); b |= p[_i_]; htab[h] = u; h = T2(HASH,_usize_)(h,u); }
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
*op++ = b = TEMPLATE2(clz,USIZE)(b);
*op++ = b = T2(clz,USIZE)(b);
#if defined(__AVX2__) && USIZE >= 32
for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8)) {
__m256i v0 = _mm256_loadu_si256((__m256i *)p);
__m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8)));
v0 = TEMPLATE2(_mm256_slli_epi, USIZE)(v0,b);
v1 = TEMPLATE2(_mm256_slli_epi, USIZE)(v1,b);
v0 = TEMPLATE2( mm256_rbit_epi, USIZE)(v0);
v1 = TEMPLATE2( mm256_rbit_epi, USIZE)(v1);
v0 = T2(_mm256_slli_epi, USIZE)(v0,b);
v1 = T2(_mm256_slli_epi, USIZE)(v1,b);
v0 = T2( mm256_rbit_epi, USIZE)(v0);
v1 = T2( mm256_rbit_epi, USIZE)(v1);
_mm256_storeu_si256((__m256i *) p, v0);
_mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1);
}
@ -290,141 +331,148 @@ size_t TEMPLATE2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_
for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) {
__m128i v0 = _mm_loadu_si128((__m128i *) p);
__m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
v0 = TEMPLATE2(_mm_slli_epi, USIZE)(v0,b);
v0 = TEMPLATE2( mm_rbit_epi, USIZE)(v0);
v1 = TEMPLATE2(_mm_slli_epi, USIZE)(v1,b);
v1 = TEMPLATE2( mm_rbit_epi, USIZE)(v1);
v0 = T2(_mm_slli_epi, USIZE)(v0,b);
v0 = T2( mm_rbit_epi, USIZE)(v0);
v1 = T2(_mm_slli_epi, USIZE)(v1,b);
v1 = T2( mm_rbit_epi, USIZE)(v1);
_mm_storeu_si128((__m128i *) p, v0);
_mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), v1);
}
#else
#define TR(i,_usize_) p[i] = TEMPLATE2(rbit,_usize_)(p[i]<<b)
#define TR(_i_,_usize_) p[_i_] = T2(rbit,_usize_)(p[_i_]<<b)
for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); }
#endif
op = TEMPLATE2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
}
if(n = (in+n)-ip) { uint_t b = 0;
for(p = _p; p != &_p[n]; p++,ip++) FE(0,USIZE);
b = TEMPLATE2(clz,USIZE)(b);
b = T2(clz,USIZE)(b);
*op++ = b;
for(p = _p; p != &_p[n]; p++) TR(0,USIZE);
op = TEMPLATE2(P4ENC,USIZE)(_p, n, op);
op = T2(P4ENC,USIZE)(_p, n, op);
}
return op - out;
#undef FE
}
size_t TEMPLATE2(fpfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
size_t T2(fpfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t *op, htab[1<<HBITS] = {0}, h = 0, _p[VSIZE+32],*p;
unsigned char *ip = in;
#define FD(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = p[i]; u = TEMPLATE2(rbit,_usize_)(u)>>b;\
u = XORDEC(u, htab[h], _usize_); op[i] = u; htab[h] = u; h = TEMPLATE2(HASH,_usize_)(h,u);\
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = p[_i_]; u = T2(rbit,_usize_)(u)>>b;\
u = XORDEC(u, htab[h], _usize_); op[_i_] = u; htab[h] = u; h = T2(HASH,_usize_)(h,u);\
}
for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
unsigned b = *ip++; ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p);
unsigned b = *ip++; ip = T2(P4DECV,USIZE)(ip, VSIZE, _p);
for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
}
if(n = ((uint_t *)out+n) - op) {
unsigned b = *ip++; ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p);
unsigned b = *ip++; ip = T2(P4DEC,USIZE)(ip, n, _p);
for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
}
return ip - in;
#undef FD
}
//-------- TurboFloat DFCM: Differential Finite Context Method Predictor ----------------------------------------------------------
size_t TEMPLATE2(fpdfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
size_t T2(fpdfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip, _p[VSIZE+32], h = 0, *p, htab[1<<HBITS] = {0};
unsigned char *op = out;
#define FE(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = ip[i]; p[i] = XORENC(u, (htab[h]+start),_usize_); b |= p[i]; \
htab[h] = start = u - start; h = TEMPLATE2(HASH,_usize_)(h,start); start = u;\
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, (htab[h]+start),_usize_); b |= p[_i_]; \
htab[h] = start = u - start; h = T2(HASH,_usize_)(h,start); start = u;\
}
for(ip = in; ip != in + (n&~(VSIZE-1)); ) { uint_t b;
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
#define TR(i,_usize_) p[i] = TEMPLATE2(rbit,_usize_)(p[i]<<b)
b = TEMPLATE2(clz,USIZE)(b);
#define TR(_i_,_usize_) p[_i_] = T2(rbit,_usize_)(p[_i_]<<b)
b = T2(clz,USIZE)(b);
for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); }
*op++ = b; op = TEMPLATE2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
*op++ = b; op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
}
if(n = (in+n)-ip) { uint_t b;
for(p = _p; p != &_p[n]; p++,ip++) FE(0,USIZE);
b = TEMPLATE2(clz,USIZE)(b);
b = T2(clz,USIZE)(b);
for(p = _p; p != &_p[n]; p++) TR(0,USIZE);
*op++ = b; op = TEMPLATE2(P4ENC,USIZE)(_p, n, op);
*op++ = b; op = T2(P4ENC,USIZE)(_p, n, op);
}
return op - out;
#undef FE
}
size_t TEMPLATE2(fpdfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
size_t T2(fpdfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t _p[VSIZE+32], *op, h = 0, *p, htab[1<<HBITS] = {0};
unsigned char *ip = in;
#define FD(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = TEMPLATE2(rbit,_usize_)(p[i])>>b; u = XORDEC(u, (htab[h]+start),_usize_); \
op[i] = u; htab[h] = start = u-start; h = TEMPLATE2(HASH,_usize_)(h,start); start = u;\
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = T2(rbit,_usize_)(p[_i_])>>b; u = XORDEC(u, (htab[h]+start),_usize_); \
op[_i_] = u; htab[h] = start = u-start; h = T2(HASH,_usize_)(h,start); start = u;\
}
for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
uint_t b = *ip++;
ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p);
ip = T2(P4DECV,USIZE)(ip, VSIZE, _p);
for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
}
if(n = ((uint_t *)out+n) - op) {
uint_t b = *ip++;
ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p);
ip = T2(P4DEC,USIZE)(ip, n, _p);
for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
}
return ip - in;
#undef FD
}
//-------- TurboFloat 2D DFCM: Differential Finite Context Method Predictor ----------------------------------------------------------
size_t TEMPLATE2(fp2dfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
//-------- TurboFloat Double delta DFCM: Differential Finite Context Method Predictor ----------------------------------------------------------
size_t T2(fp2dfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip, _p[VSIZE+32], h = 0, *p, htab[1<<HBITS] = {0},start0=start; start=0;
unsigned char *op = out;
#define FE(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = ip[i]; p[i] = XORENC(u, (htab[h]+start),_usize_); b |= p[i]; \
htab[h] = start = u - start; h = TEMPLATE2(HASH,_usize_)(h,start); start = start0; start0 = u;\
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, (htab[h]+start),_usize_); b |= p[_i_]; \
htab[h] = start = u - start; h = T2(HASH,_usize_)(h,start); start = start0; start0 = u;\
}
#define TR(i,_usize_) p[i] = TEMPLATE2(rbit,_usize_)(p[i]<<b)
#define TR(_i_,_usize_) p[_i_] = T2(rbit,_usize_)(p[_i_]<<b)
for(ip = in; ip != in + (n&~(VSIZE-1)); ) {
uint_t b;
uint_t b = 0;
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
b = TEMPLATE2(clz,USIZE)(b);
b = T2(clz,USIZE)(b);
for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); }
*op++ = b; op = TEMPLATE2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
*op++ = b; op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0);
}
if(n = (in+n)-ip) {
uint_t b;
uint_t b = 0;
for(p = _p; p != &_p[n]; p++,ip++) FE(0,USIZE);
b = TEMPLATE2(clz,USIZE)(b);
b = T2(clz,USIZE)(b);
for(p = _p; p != &_p[n]; p++) TR(0,USIZE);
*op++ = b; op = TEMPLATE2(P4ENC,USIZE)(_p, n, op);
*op++ = b; op = T2(P4ENC,USIZE)(_p, n, op);
}
return op - out;
#undef FE
}
size_t TEMPLATE2(fp2dfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
size_t T2(fp2dfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t _p[VSIZE+32], *op, h = 0, *p, htab[1<<HBITS] = {0},start0=start; start=0; ;
unsigned char *ip = in;
#define FD(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = TEMPLATE2(rbit,_usize_)(p[i])>>b; u = XORDEC(u, (htab[h]+start),_usize_);\
op[i] = u; htab[h] = start = u-start; h = TEMPLATE2(HASH,_usize_)(h,start); start = start0; start0 = u;\
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = T2(rbit,_usize_)(p[_i_])>>b; u = XORDEC(u, (htab[h]+start),_usize_);\
op[_i_] = u; htab[h] = start = u-start; h = T2(HASH,_usize_)(h,start); start = start0; start0 = u;\
}
for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
uint_t b = *ip++;
ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p);
ip = T2(P4DECV,USIZE)(ip, VSIZE, _p);
for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
}
if(n = ((uint_t *)out+n) - op) {
uint_t b = *ip++;
ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p);
ip = T2(P4DEC,USIZE)(ip, n, _p);
for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
}
return ip - in;
#undef FD
}
#define BSIZE(_usize_) (_usize_==64?6:(_usize_==32?5:(_usize_==16?4:3)))
//-------- TurboGorilla : Improved Gorilla style (see Facebook paper) Floating point compression with bitio ------------------------------------
#define bitput2(_bw_,_br_, _n1_, _n2_, _x_) {\
if(!_x_) bitput(_bw_,_br_, 1, 1);/*1*/\
@ -438,54 +486,229 @@ size_t TEMPLATE2(fp2dfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uin
else bitrmv(_bw_,_br_,_n2_+2), _x_ = BZHI32(_x_>>2, _n2_);\
}
#define BSIZE(_usize_) (_usize_==64?6:(_usize_==32?5:(_usize_==16?4:3)))
size_t TEMPLATE2(fpgenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip;
unsigned ol = 0,ot = 0;
size_t T2(fpgenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip = in;
unsigned ol = 0, ot = 0;
unsigned char *op = out;
bitdef(bw,br);
if(start) { ol = TEMPLATE2(clz,USIZE)(start); ot = TEMPLATE2(ctz,USIZE)(start); }
if(start) { ol = T2(clz,USIZE)(start); ot = T2(ctz,USIZE)(start); }
#define FE(i,_usize_) { TEMPLATE3(uint, _usize_, _t) z = XORENC(ip[i], start,_usize_); start = ip[i];\
if(likely(!z)) bitput( bw,br, 1, 1);\
else { unsigned t = TEMPLATE2(ctz,_usize_)(z), l = TEMPLATE2(clz,_usize_)(z);\
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) z = XORENC(ip[_i_], start,_usize_); start = ip[_i_];\
if(unlikely(!z)) bitput( bw,br, 2, 2); /*01*/\
else {\
unsigned t = T2(ctz,_usize_)(z), l = /*lz2l[*/T2(clz,_usize_)(z);\
unsigned s = _usize_ - l - t, os = _usize_ - ol - ot;\
if(l >= ol && t >= ot && os < 6+5+s) { bitput( bw,br, 2, 2); TEMPLATE2(bitput,_usize_)(bw,br, os, z>>ot,op); }\
else { bitput( bw,br, 2+BSIZE(_usize_), l<<2); bitput2(bw,br, N_0, N_1, t); bitenorm(bw,br,op);TEMPLATE2(bitput,_usize_)(bw,br, s, z>>t,op); ol = l; ot = t; }\
if(l >= ol && t >= ot && os < 6+5+s) { bitput( bw,br, 1, 1); /*1*/ T2(bitput,_usize_)(bw,br, os, z>>ot,op); }\
else { /*00*/ /*bitput( bw,br, 2, 0); bitput(bw,br, 3, lz2c(l));*/\
bitput( bw,br, 2+BSIZE(_usize_), l<<2); bitput2(bw,br, N_0, N_1, t); bitenorm(bw,br,op);T2(bitput,_usize_)(bw,br, s, z>>t,op); ol = l; ot = t; }\
} bitenorm(bw,br,op);\
}
for(ip = in; ip != in + (n&~(4-1)); ip+=4) { PREFETCH(ip+512,0); FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
for( ; ip != in + n ; ip++) FE(0,USIZE);
for(; ip != in + (n&~(4-1)); ip+=4) { PREFETCH(ip+512,0); FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
for(; ip != in + n ; ip++) FE(0,USIZE);
bitflush(bw,br,op);
return op - out;
#undef FE
}
size_t TEMPLATE2(fpgdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
size_t T2(fpgdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op;
unsigned ol = 0,ot = 0,x;
int ol = 0,ot = 0,x;
unsigned char *ip = in;
bitdef(bw,br);
if(start) { ol = TEMPLATE2(clz,USIZE)(start); ot = TEMPLATE2(ctz,USIZE)(start); }
if(start) { ol = T2(clz,USIZE)(start); ot = T2(ctz,USIZE)(start); }
#define FD(i,_usize_) { TEMPLATE3(uint, _usize_, _t) z=0; unsigned _x; BITGET32(bw,br,1,_x); \
if(likely(!_x)) { BITGET32(bw,br,1,_x);\
if(!_x) { BITGET32(bw,br,BSIZE(_usize_),ol); bitget2(bw,br, N_0, N_1, ot); bitdnorm(bw,br,ip); }\
TEMPLATE2(bitget,_usize_)(bw,br,_usize_ - ol - ot,z,ip);\
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) z=0; unsigned _x,_y; BITGET32(bw,br,1,_x); \
if(likely(!_x)) { BITGET32(bw,br,1,_y); _x = _y << 1 | _x; }\
if(_x < 2) { \
if(!_x) { /*BITGET32(bw,br, 3, ol); ol = c2l(ol);*/BITGET32(bw,br,BSIZE(_usize_),ol); bitget2(bw,br, N_0, N_1, ot); bitdnorm(bw,br,ip); }\
T2(bitget,_usize_)(bw,br,_usize_ - ol - ot,z,ip);\
z<<=ot;\
} op[i] = start = XORDEC(z, start,_usize_); bitdnorm(bw,br,ip);\
} op[_i_] = start = XORDEC(z, start,_usize_); bitdnorm(bw,br,ip);\
}
for(bitdnorm(bw,br,ip),op = out; op != out+(n&~(4-1)); op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); PREFETCH(ip+512,0); }
for( ; op != out+n; op++) FD(0,USIZE);
bitalign(bw,br,ip);
return ip - in;
#undef FD
}
//------ Zigzag of zigzag with bitio for timestamps with bitio ------------------------------------------------------------------------------------------
//--- Chimp: Efficient Lossless Floating Point Compression for Time Series Databases - https://www.vldb.org/pvldb/vol15/p3058-liakos.pdf
size_t T2(fphenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip = in;
int ol = USIZE+1,s;
unsigned char *op = out;
bitdef(bw,br);
if(start) ol = T2(clz,USIZE)(start);
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) z = XORENC(ip[_i_], start,_usize_); start = ip[_i_];\
if(unlikely(!z)) { ol = _usize_+1; bitput(bw,br, 2, 0); }\
else { int t = T2(ctz,_usize_)(z), l = lz2l(T2(clz,_usize_)(z));\
if(t > BSIZE(_usize_)) { s = _usize_ - l - t; ol = _usize_+1; bitput(bw,br, 2, 1); bitput(bw,br, CCODEL, l2c(l)); bitput(bw,br, BSIZE(_usize_), s); z>>=t; \
bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op); \
} else if (l == ol) { s = _usize_ - l; bitput(bw,br, 2, 2);\
bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op);\
} else { s = _usize_ - l; ol = l; bitput(bw,br, 2, 3); bitput(bw,br, CCODEL, l2c(l));\
bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op); }\
} bitenorm(bw,br,op);\
}
for(; ip != in + (n&~(4-1)); ip+=4) { PREFETCH(ip+512,0); FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
for(; ip != in + n ; ip++) FE(0,USIZE);
bitflush(bw,br,op);
return op - out;
#undef FE
}
size_t T2(fphdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op = out;
int ol = 0,s;
unsigned char *ip = in;
bitdef(bw,br);
if(start) ol = T2(clz,USIZE)(start);
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) z=0; unsigned _x; bitdnorm(bw,br,ip); BITGET32(bw,br,2,_x);\
switch(_x) {\
case 3: BITGET32(bw,br, CCODEL, ol); ol = c2l(ol); bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br,(_usize_-ol), z, ip); s = _usize_ == ol?_usize_:_usize_ - ol;\
break;\
case 2: bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br,(_usize_-ol), z, ip); s = _usize_ == ol?_usize_:_usize_ - ol;\
break;\
case 1: BITGET32(bw,br, CCODEL, ol); ol = c2l(ol); BITGET32(bw,br,BSIZE(_usize_), s); s = s?s:_usize_;\
int ot = _usize_ - s - ol;\
bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br, (_usize_ - ol - ot), z, ip); z <<= ot;\
break;\
case 0: ;\
} op[_i_] = start = XORDEC(z, start,_usize_);\
}
bitdnorm(bw,br,ip);
for(; op != out+(n&~(4-1)); op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); PREFETCH(ip+512,0); }
for(; op != out+n; op++) FD(0,USIZE);
bitalign(bw,br,ip);
return ip - in;
#undef FD
}
//-- TurboFloatLz (lz77 like parsing) --------------------------------------
#if USIZE == 64
#define VA_BITS 7 // window size for 64 bits (integer/double)
#define FX_BITS 14
#elif USIZE == 32
#define VA_BITS 4 // 32 bits (intger/float)
#define FX_BITS 12
#else
#define VA_BITS 4 // 16 bits (intger/float)
#define FX_BITS 10
#endif
#define FX_MSK ((1<<FX_BITS)-1)
#define CTZMIN(_usize_) (VA_BITS+BSIZE(_usize_))
#define HASH(_x_) (_x_ & FX_MSK)
// TurboFloatLz encoder, single-entry hash table variant.
// Lz77-like parsing: each value is xored either against a recent value found
// via the hash table (when the match shares > CTZMIN trailing bits) or against
// the immediately preceding value; the residual is then bit-packed.
// in    : input array (n values, USIZE bits each)
// n     : number of values (must be >= 1: the first value is stored verbatim)
// out   : output buffer
// start : unused here (see the commented-out seeding below)
// returns the number of output bytes written
size_t T2(fpc0enc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { // simple hash table
  uint_t *ip = in;
  unsigned char *op = out;
  unsigned htab[1<<FX_BITS] = {0}, ol = USIZE+1;  // htab: value-hash -> last position; ol=USIZE+1 marks "no cached lz length"
  bitdef(bw,br); //if(start) ol = T2(clz,USIZE)(start);
  for(*(uint_t *)op = *ip++, op += sizeof(uint_t); ip != in + n; ip++) { // first value stored raw
    uint_t u = *ip, z;
    unsigned hu = HASH(u), h = htab[hu], ofs = (ip-in) - h - 1, c = 0, ctz;
    htab[hu] = ip - in;
    if(ofs < (1<<VA_BITS)) {                      // hash hit inside the offset window?
      ctz = T2(ctz,USIZE)(z = u ^ in[h]);
      ofs = (c = ctz > CTZMIN(USIZE))?ofs:0;      // accept only if the trailing-zero gain pays for the match header
    }
    z = c?z:u ^ ip[-1];                           // no usable match: xor against previous value
    if(z) {
      unsigned l = lz2l(T2(clz,USIZE)(z)), t;     // quantized leading-zero count
      if(c) {                                     // tag 0: window match -> offset | lz code | significant-bit count, then payload
        t = USIZE - l - ctz;
        bitput(bw,br, (VA_BITS+CCODEL+BSIZE(USIZE)+2), ofs << (CCODEL+BSIZE(USIZE)+2) | (unsigned)l2c(l) << (BSIZE(USIZE)+2) | t<<2);
        bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, t, z >> ctz, op);
        ol = USIZE+1;                             // invalidate cached lz length
      } else {
        if(l == ol) { bitput(bw,br, 2, 1); }      // tag 1: same leading-zero count as previous residual
        else { bitput(bw,br, CCODEL+2, l2c(l)<<2|2); ol = l; } // tag 2: new leading-zero code
        bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, USIZE - l, z, op);
      }
    } else { bitput(bw,br,VA_BITS+2, ofs<<2|3 ); ol = USIZE+1; } // tag 3: exact repeat at offset ofs
    bitenorm(bw,br,op);
  }
  bitflush(bw,br,op);
  return op - out;
}
// TurboFloatLz encoder, two-way (double entry) hash table variant.
// Same format as fpc0enc, but each hash bucket keeps the two most recent
// positions and the candidate sharing more trailing bits wins.
// in    : input array (n values, USIZE bits each)
// n     : number of values (must be >= 1: the first value is stored verbatim)
// out   : output buffer
// start : unused here (see the commented-out seeding below)
// returns the number of output bytes written
size_t T2(fpcenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { // double entry hashtable
  uint_t *ip = in;
  unsigned char *op = out;
  unsigned htab[1<<(FX_BITS+1)] = {0}, ol = USIZE+1; // bucket hu: newest position, hu+1: previous one
  bitdef(bw,br); //if(start) ol = T2(clz,USIZE)(start);
  for(*(uint_t *)op = *ip++, op += sizeof(uint_t); ip != in + n; ip++) { // first value stored raw
    uint_t u = *ip, z;
    unsigned hu = HASH(u)<<1, h = htab[hu], ofs = (ip-in) - h - 1, c = 0, ctz = 0;
    htab[hu+1] = h, htab[hu] = ip - in;           // age the newest entry into the second slot
    if(ofs < (1<<VA_BITS)) {                      // newest candidate inside the offset window?
      ctz = T2(ctz,USIZE)(z = u ^ in[h]);
      unsigned h1 = htab[hu+1], ofs1 = (ip-in) - h1 - 1;
      if(ofs1 < (1<<VA_BITS)) {                   // older candidate: keep whichever shares more trailing bits
        uint_t z1;
        unsigned ctz1 = T2(ctz,USIZE)(z1 = u ^ in[h1]); // NOTE: dead 'c1' removed; 'c' is recomputed unconditionally below
        if(ctz1 > ctz) { ofs = ofs1; z = z1; ctz = ctz1; }
      }
      ofs = (c = ctz > CTZMIN(USIZE))?ofs:0;      // accept only if the trailing-zero gain pays for the match header
    }
    z = c?z:u ^ ip[-1];                           // no usable match: xor against previous value
    if(z) {
      unsigned l = lz2l(T2(clz,USIZE)(z)), t;     // quantized leading-zero count
      if(c) {                                     // tag 0: window match -> offset | lz code | significant-bit count, then payload
        t = USIZE - l - ctz;
        bitput(bw,br, (VA_BITS+CCODEL+BSIZE(USIZE)+2), ofs << (CCODEL+BSIZE(USIZE)+2) | (unsigned)l2c(l) << (BSIZE(USIZE)+2) | t<<2);
        bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, t, z >> ctz, op);
        ol = USIZE+1;                             // invalidate cached lz length
      } else {
        if(l == ol) { bitput(bw,br, 2, 1); }      // tag 1: same leading-zero count as previous residual
        else { bitput(bw,br, CCODEL+2, l2c(l)<<2|2); ol = l; } // tag 2: new leading-zero code
        bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, USIZE - l, z, op);
      }
    } else { bitput(bw,br,VA_BITS+2, ofs<<2|3 ); ol = USIZE+1; } // tag 3: exact repeat at offset ofs
    bitenorm(bw,br,op);
  }
  bitflush(bw,br,op);
  return op - out;
}
// TurboFloatLz decoder: inverse of fpc0enc/fpcenc (both emit the same format).
// Reads the 2-bit tag written by the encoder, reconstructs the xor residual
// and applies it to the selected reference value.
// in    : compressed stream (first value stored raw)
// n     : number of values to decode
// out   : output array (n values, USIZE bits each)
// start : unused here (see the commented-out seeding below)
// returns the number of input bytes consumed (0 when n == 0)
size_t T2(fpcdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
  uint_t *op = out, v = 0, z;
  unsigned char *ip = in;
  unsigned ol = USIZE+1, t;                       // ol: current leading-zero count, t: tag then scratch
  bitdef(bw,br); //if(start) ol = T2(clz,USIZE)(start);
  for(v = *op++ = *(uint_t *)ip, ip += sizeof(uint_t); op != out+n; op++) { // first value read raw
    bitdnorm(bw,br,ip); BITGET32(bw,br, 2, t);
    switch(t) {
      case 0:                                     // window match: offset | lz code | significant-bit count, then payload
        BITGET32(bw,br, VA_BITS+CCODEL+BSIZE(USIZE), t);
        ol = c2l(bextr32(t, BSIZE(USIZE), CCODEL));                  // decode quantized leading-zero count
        v = *(op - bextr32(t, BSIZE(USIZE)+CCODEL, VA_BITS)-1);      // reference value at the encoded offset
        t &= ((1<<BSIZE(USIZE))-1);               // significant-bit count (0 encodes full width)
        t = t?t:USIZE;
        bitdnorm(bw,br,ip); T2(bitget,USIZE)(bw,br, t, z, ip);
        v ^= z << (USIZE - t - ol);               // shift payload back over the trailing zeros
        break;
      case 1: bitdnorm(bw,br,ip); T2(bitget,USIZE)(bw,br, USIZE - ol, z, ip); v ^= z; break; // reuse previous ol
      case 2: BITGET32(bw,br, CCODEL, ol); ol = c2l(ol); bitdnorm(bw,br,ip); T2(bitget,USIZE)(bw,br, USIZE - ol, z, ip); v ^= z; break; // new ol
      case 3: BITGET32(bw,br,VA_BITS,t); v = *(op-t-1); break;       // exact repeat at offset t
    }
    *op = v;
  }
  bitalign(bw,br,ip);
  return ip - in;
}
//------ Zigzag of zigzag for timestamps with bitio ------------------------------------------------------------------------------------------
// Improved Gorilla style compression with sliding zigzag of delta + RLE + overflow handling for timestamps in time series.
// More than 300 times better compression and several times faster
// OVERFLOW: bail out when the encoded stream would exceed the raw size.
// Emits a 1-byte "stored" marker (1<<4) followed by a verbatim memcpy of the
// input, and returns the resulting size (1 + n * sizeof(in[0])).
#define OVERFLOW if(op >= out_) { *out++ = 1<<4; /*bitini(bw,br); bitput(bw,br,4+3,1<<4); bitflush(bw,br,out);*/ memcpy(out,in,n*sizeof(in[0])); return 1+n*sizeof(in[0]); }
size_t TEMPLATE2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
size_t T2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip = in, pd = 0, *pp = in,dd;
unsigned char *op = out, *out_ = out+n*sizeof(in[0]);
@ -494,12 +717,12 @@ size_t TEMPLATE2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
uint64_t _r = _ip_ - _pp_;\
if(_r > NL) { _r -= NL; unsigned _b = (bsr64(_r)+7)>>3; bitput(bw,br,4+3+3,(_b-1)<<(4+3)); bitput64(bw,br,_b<<3, _r, _op_); bitenorm(bw,br,_op_); }\
else while(_r--) { bitput(bw,br,1,1); bitenorm(bw,br,_op_); }\
_d_ = TEMPLATE2(zigzagenc,_usize_)(_d_);\
_d_ = T2(zigzagenc,_usize_)(_d_);\
if(!_d_) bitput(bw,br, 1, 1);\
else if(_d_ < (1<< (N2-1))) bitput(bw,br, N2+2,_d_<<2|2);\
else if(_d_ < (1<< (N3-1))) bitput(bw,br, N3+3,_d_<<3|4);\
else if(_d_ < (1<< (N4-1))) bitput(bw,br, N4+4,_d_<<4|8);\
else { unsigned _b = (TEMPLATE2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); TEMPLATE2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\
else { unsigned _b = (T2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); T2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\
bitenorm(bw,br,_op_);\
} while(0)
@ -528,9 +751,10 @@ size_t TEMPLATE2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
}
bitflush(bw,br,op);
return op - out;
#undef FE
}
size_t TEMPLATE2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
size_t T2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op = out, pd = 0;
unsigned char *ip = in;
@ -557,8 +781,8 @@ size_t TEMPLATE2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
#if (defined(__SSE2__) /*|| defined(__ARM_NEON)*/) && USIZE == 32
__m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4*pd,3*pd,2*pd,1*pd);
for(r += NL, _op = op; op != _op+(r&~7);) {
sv = _mm_add_epi32(sv,cv); _mm_storeu_si128(op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; //_mm_shuffle_epi32(sv, _MM_SHUFFLE(3, 3, 3, 3))->mm_shuffle_nnnn_epi32(sv, 3)
sv = _mm_add_epi32(sv,cv); _mm_storeu_si128(op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4;
sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; //_mm_shuffle_epi32(sv, _MM_SHUFFLE(3, 3, 3, 3))->mm_shuffle_nnnn_epi32(sv, 3)
sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4;
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
#else
@ -576,9 +800,9 @@ size_t TEMPLATE2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
*op = (start+=pd);
continue;
}
TEMPLATE2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip);
T2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip);
}
pd += TEMPLATE2(zigzagdec,USIZE)(dd);
pd += T2(zigzagdec,USIZE)(dd);
*op++ = (start += pd);
bitdnorm(bw,br,ip);
}
@ -587,7 +811,7 @@ size_t TEMPLATE2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
}
//-------- Zigzag with bit/io + RLE --------------------------------------------------------------------------
size_t TEMPLATE2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
size_t T2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip = in, *pp = in,dd;
unsigned char *op = out, *out_ = out+n*sizeof(in[0]);
@ -596,12 +820,12 @@ size_t TEMPLATE2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
uint64_t _r = _ip_ - _pp_;\
if(_r > NL) { _r -= NL; unsigned _b = (bsr64(_r)+7)>>3; bitput(bw,br,4+3+3,(_b-1)<<(4+3)); bitput64(bw,br,_b<<3, _r, _op_); bitenorm(bw,br,_op_); }\
else while(_r--) { bitput(bw,br,1,1); bitenorm(bw,br,_op_); }\
_d_ = TEMPLATE2(zigzagenc,_usize_)(_d_);\
_d_ = T2(zigzagenc,_usize_)(_d_);\
if(!_d_) bitput(bw,br, 1, 1);\
else if(_d_ < (1<< (N2-1))) bitput(bw,br, N2+2,_d_<<2|2);\
else if(_d_ < (1<< (N3-1))) bitput(bw,br, N3+3,_d_<<3|4);\
else if(_d_ < (1<< (N4-1))) bitput(bw,br, N4+4,_d_<<4|8);\
else { unsigned _b = (TEMPLATE2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); TEMPLATE2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\
else { unsigned _b = (T2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); T2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\
bitenorm(bw,br,_op_);\
} while(0)
@ -630,9 +854,10 @@ size_t TEMPLATE2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
}
bitflush(bw,br,op);
return op - out;
#undef FE
}
size_t TEMPLATE2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
size_t T2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op = out;
unsigned char *ip = in;
@ -659,8 +884,8 @@ size_t TEMPLATE2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
#if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32
__m128i sv = _mm_set1_epi32(start);
for(r += NL, _op = op; op != _op+(r&~7);) {
_mm_storeu_si128(op, sv); op += 4;
_mm_storeu_si128(op, sv); op += 4;
_mm_storeu_si128((__m128i *)op, sv); op += 4;
_mm_storeu_si128((__m128i *)op, sv); op += 4;
}
#else
for(r+=NL, _op = op; op != _op+(r&~7); op += 8)
@ -670,9 +895,9 @@ size_t TEMPLATE2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
*op = start;
continue;
}
TEMPLATE2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip);
T2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip);
}
dd = TEMPLATE2(zigzagdec,USIZE)(dd);
dd = T2(zigzagdec,USIZE)(dd);
*op++ = (start += dd);
bitdnorm(bw,br,ip);
}