/**
Copyright (C) powturbo 2013-2023
SPDX-License-Identifier: GPL-2.0-or-later
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Floating Point + Integer Compression (All integer compression functions can be used for float/double and vice versa)"
#ifndef USIZE
#include <string.h>
#include "include_/conf.h"
#include "include_/bitutil.h"
#include "include_/fp.h"
#include "include_/vp4.h"
#include "include_/bitutil_.h"
#pragma warning( disable : 4005)
#pragma warning( disable : 4090)
#pragma warning( disable : 4068)
#define CCODEL 3 // bits used to store a leading-zero code
#define lz2l(_x_) cquant[_x_] // quantize leading zeros
#define l2c( _x_) ccode[_x_] // encode a leading-zero count as a 3-bit code
#define c2l( _x_) dcode[_x_] // decode a 3-bit code back to the quantized count
static unsigned char ccode[] = { // optimized for 32/64 bits
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3,
3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
};
static unsigned char cquant[] = {
0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14,
14, 14, 18, 18, 20, 20, 22, 22, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24
};
static unsigned char dcode[] = {0, 6,10, 14, 18, 20, 22, 24};
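// Example (illustrative): clz = 9 -> lz2l(9) = cquant[9] = 6; quantization only rounds the
// count down, so shifting by it stays safe. l2c(6) = ccode[6] = 1 is the 3-bit code written
// to the stream, and c2l(1) = dcode[1] = 6 recovers the same quantized count when decoding.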
//---------------------- template generation --------------------------------------------
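// fp.c is a self-including template: each "#include "fp.c"" below re-enters this file with
// USIZE defined (8/16/32/64) and instantiates the template functions in the #else branch.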
#define VSIZE 128
#define P4ENC p4enc
#define P4DEC p4dec
#define P4ENCV p4enc
#define P4DECV p4dec
#define NL 18 // RLE threshold: runs up to NL cost one bit each, longer runs get a length code (bvzenc/bvzzenc)
#define N4 17 // must be > 16
#define N_0 3
#define N_1 4
#define N2 3
#define N3 5
#define USIZE 8
#include "fp.c"
#undef P4ENCV
#undef P4DECV
#undef N_0
#undef N_1
#undef N2
#undef N3
#define P4ENCV p4enc128v
#define P4DECV p4dec128v
#define N_0 3
#define N_1 5
#define N2 6
#define N3 12
#define USIZE 16
#include "fp.c"
#undef N_0
#undef N_1
#undef N2
#undef N3
#define N_0 4
#define N_1 6
#define N2 6 // for seconds time series
#define N3 10
#define USIZE 32
#include "fp.c"
#undef N4
#undef N2
#undef N3
#undef N_1
#define N_1 7
#define N2 6 // for seconds/milliseconds,... time series
#define N3 12
#define N4 20 // must be > 16
#define USIZE 64
#include "fp.c"
#else //-------------------------------------- Template functions ------------------------------------------------------------
#define XORENC( _u_, _pu_, _usize_) ((_u_)^(_pu_)) // xor predictor
#define XORDEC( _u_, _pu_, _usize_) ((_u_)^(_pu_))
#define ZZAGENC(_u_, _pu_, _usize_) T2(zigzagenc,_usize_)((_u_)-(_pu_)) //zigzag predictor
#define ZZAGDEC(_u_, _pu_, _usize_) (T2(zigzagdec,_usize_)(_u_)+(_pu_))
#define uint_t T3(uint, USIZE, _t)
#define int_t T3(int, USIZE, _t)
//-------- TurboPFor Zigzag of zigzag for unsorted/sorted integer/floating point array ---------------------------------------
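#if 0 // Scalar sketch (illustrative, not compiled) of the per-element transform applied by
      // p4nzzenc128v below: delta, then delta-of-delta, then zigzag (32-bit variant shown).
static uint32_t zz_of_zz_step(uint32_t u, uint32_t *start, uint32_t *pd) {
uint32_t d  = u - *start;               // first-order delta
uint32_t dd = d - *pd;                  // second-order delta (delta of delta)
*pd = d; *start = u;
return (dd << 1) ^ ((int32_t)dd >> 31); // zigzagenc32: fold the sign into bit 0
}
#endif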
size_t T2(p4nzzenc128v,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t _p[VSIZE+32], *ip, *p, pd = 0; size_t m;
unsigned char *op = out, *out_ = out+n*USIZE/8;
#define FE(_i_,_usize_) { T3(uint, USIZE, _t) u = ip[_i_]; start = u-start; p[_i_] = ZZAGENC(start,pd,_usize_); pd = start; start = u; }
for(ip = in; ip != in + (n&~(VSIZE-1)); ) {
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); if(op >= out_) goto e;
}
if((m = (in+n)-ip) != 0) { // code the tail in m: the raw-copy fallback at label e needs n intact
for(p = _p; p != &_p[m]; p++,ip++) FE(0,USIZE);
op = T2(P4ENC,USIZE)(_p, m, op); if(op >= out_) goto e;
}
if(op >= out_) {
e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op+=n*(USIZE/8);
}
return op - out;
#undef FE
}
size_t T2(p4nzzdec128v,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t _p[VSIZE+32],*p, *op, pd=0;
unsigned char *ip = in;
if(*ip == 0xff) { memcpy(out, in+1, n*(USIZE/8)); return 1+n*(USIZE/8); } // 1 escape byte + raw data
#define FD(_i_,_usize_) { T3(uint, USIZE, _t) u = ZZAGDEC(p[_i_],start+pd,_usize_); op[_i_] = u; pd = u - start; start = u; }
for(op = out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
for(ip = T2(P4DECV,USIZE)(ip, VSIZE, _p), p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
}
if((n = (out+n) - op) != 0)
for(ip = T2(P4DEC,USIZE)(ip, n, _p), p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
return ip - in;
#undef FD
}
/*---------------- TurboFloat XOR: last-value predictor with TurboPFor ---------------------------------------------------------
Compresses floating point values significantly better (115% - 160%) and faster than Facebook's Gorilla algorithm.
BEST results are obtained with LOSSY COMPRESSION (using fprazor32/fprazor64 in bitutil.c).
Algorithm:
1: XOR each value with the previous value. The residual has leading zero bits (from the shared sign/exponent bits) and trailing zero bits from the mantissa.
2: Eliminate the leading zeros common to the whole block by shifting every value in the block to the left.
3: Bit-reverse the values to bring the mantissa trailing zeros to the front for better compression with TurboPFor.
*/
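// Illustrative walk-through for one pair of doubles (values chosen for this example):
// 1.0 = 0x3ff0000000000000, 1.5 = 0x3ff8000000000000
// step 1: 1.0 ^ 1.5 = 0x0008000000000000 -> 12 leading and 51 trailing zero bits
// step 2: b = clz of the OR of all residuals in the block; every value is shifted left by b
// step 3: rbit turns the trailing-zero runs into leading zeros, i.e. small integers for TurboPFor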
size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t _p[VSIZE+32], *ip, *p; size_t m;
unsigned char *op = out, *out_ = out+n*USIZE/8;
#if defined(__AVX2__) && USIZE >= 32
#define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
__m256i sv = T2(_mm256_set1_epi, USIZE)(start);
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
#define _mm_set1_epi64(a) _mm_set1_epi64x(a)
__m128i sv = T2(_mm_set1_epi, USIZE)(start);
#endif
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, start,_usize_); b |= p[_i_]; start = u; }
for(ip = in; ip != in + (n&~(VSIZE-1)); ) { uint_t b = 0;
#if defined(__AVX2__) && USIZE >= 32
__m256i bv = _mm256_setzero_si256();
for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8),ip+=64/(USIZE/8)) {
__m256i v0 = _mm256_loadu_si256((__m256i *) ip);
__m256i v1 = _mm256_loadu_si256((__m256i *)(ip+32/(USIZE/8)));
sv = T2(mm256_xore_epi, USIZE)(v0,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *) p, sv); sv = v0;
sv = T2(mm256_xore_epi, USIZE)(v1,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), sv); sv = v1;
}
start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
b = T2(mm256_hor_epi, USIZE)(bv);
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
__m128i bv = _mm_setzero_si128();
for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),ip+=32/(USIZE/8)) {
__m128i v0 = _mm_loadu_si128((__m128i *) ip);
__m128i v1 = _mm_loadu_si128((__m128i *)(ip+16/(USIZE/8)));
sv = T2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p, sv); sv = v0;
sv = T2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1;
}
start = (uint_t)T2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
b = T2(mm_hor_epi, USIZE)(bv);
#else
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
#endif
*op++ = b = b?T2(clz,USIZE)(b):USIZE; // guard clz(0): a block of identical values gives b == 0
#define TR(_i_,_usize_) p[_i_] = T2(rbit,_usize_)(p[_i_]<<b)
#if defined(__AVX2__) && USIZE >= 32
for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8)) {
__m256i v0 = _mm256_loadu_si256((__m256i *)p);
__m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8)));
v0 = T2(_mm256_slli_epi, USIZE)(v0,b);
v1 = T2(_mm256_slli_epi, USIZE)(v1,b);
v0 = T2( mm256_rbit_epi, USIZE)(v0);
v1 = T2( mm256_rbit_epi, USIZE)(v1);
_mm256_storeu_si256((__m256i *) p, v0);
_mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1);
}
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) {
__m128i v0 = _mm_loadu_si128((__m128i *) p);
__m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
v0 = T2(_mm_slli_epi, USIZE)(v0,b);
v0 = T2( mm_rbit_epi, USIZE)(v0);
v1 = T2(_mm_slli_epi, USIZE)(v1,b);
v1 = T2( mm_rbit_epi, USIZE)(v1);
_mm_storeu_si128((__m128i *) p, v0);
_mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), v1);
}
#else
for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); }
#endif
op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); if(op >= out_) goto e;
}
if((m = (in+n)-ip) != 0) { uint_t b = 0; // code the tail in m: the raw-copy fallback at label e needs n intact
for(p = _p; p != &_p[m]; p++,ip++) FE(0,USIZE);
b = b?T2(clz,USIZE)(b):USIZE; // guard clz(0) as in fpfcmenc
*op++ = b;
for(p = _p; p != &_p[m]; p++) TR(0,USIZE);
op = T2(P4ENC,USIZE)(_p, m, op); if(op >= out_) goto e;
}
if(op >= out_) {
e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op+=n*(USIZE/8);
}
return op - out;
#undef FE
}
size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t *op, _p[VSIZE+32],*p;
unsigned char *ip = in;
if(*ip == 0xff) { memcpy(out, in+1, n*(USIZE/8)); return 1+n*(USIZE/8); } // 1 escape byte + raw data
#if defined(__AVX2__) && USIZE >= 32
#define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
__m256i sv = T2(_mm256_set1_epi, USIZE)(start);
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
#define _mm_set1_epi64(a) _mm_set1_epi64x(a)
__m128i sv = T2(_mm_set1_epi, USIZE)(start);
#endif
#define FD(_i_,_usize_) { T3(uint, USIZE, _t) u = p[_i_]; u = T2(rbit,_usize_)(u)>>b; u = XORDEC(u, start,_usize_); op[_i_] = start = u; }
for(op = out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+384,0);
unsigned b = *ip++; ip = T2(P4DECV,USIZE)(ip, VSIZE, _p);
#if defined(__AVX2__) && USIZE >= 32
for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8),op+=64/(USIZE/8)) {
__m256i v0 = _mm256_loadu_si256((__m256i *)p);
__m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8)));
v0 = T2( mm256_rbit_epi, USIZE)(v0);
v1 = T2( mm256_rbit_epi, USIZE)(v1);
v0 = T2(_mm256_srli_epi, USIZE)(v0,b);
v1 = T2(_mm256_srli_epi, USIZE)(v1,b);
v0 = T2( mm256_xord_epi, USIZE)(v0,sv);
sv = T2( mm256_xord_epi, USIZE)(v1,v0);
_mm256_storeu_si256((__m256i *)op, v0);
_mm256_storeu_si256((__m256i *)(op+32/(USIZE/8)), sv);
}
start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),op+=32/(USIZE/8)) {
__m128i v0 = _mm_loadu_si128((__m128i *)p);
__m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
v0 = T2( mm_rbit_epi, USIZE)(v0);
v0 = T2(_mm_srli_epi, USIZE)(v0,b);
v0 = T2( mm_xord_epi, USIZE)(v0,sv);
v1 = T2( mm_rbit_epi, USIZE)(v1);
v1 = T2(_mm_srli_epi, USIZE)(v1,b);
sv = T2( mm_xord_epi, USIZE)(v1,v0);
_mm_storeu_si128((__m128i *) op, v0);
_mm_storeu_si128((__m128i *)(op+16/(USIZE/8)), sv);
}
start = (uint_t)T2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
#else
for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
#endif
}
if((n = (out+n) - op) != 0) {
uint_t b = *ip++;
for(ip = T2(P4DEC,USIZE)(ip, n, _p), p = _p; p < &_p[n]; p++,op++) FD(0,USIZE);
}
return ip - in;
#undef FD
}
//-------- TurboFloat FCM: Finite Context Method Predictor ---------------------------------------------------------------
#define HBITS 13 //15
#define HASH64(_h_,_u_) (((_h_)<<5 ^ (_u_)>>50) & ((1u<<HBITS)-1))
#define HASH32(_h_,_u_) (((_h_)<<4 ^ (_u_)>>23) & ((1u<<HBITS)-1))
#define HASH16(_h_,_u_) (((_h_)<<3 ^ (_u_)>>12) & ((1u<<HBITS)-1))
#define HASH8( _h_,_u_) (((_h_)<<2 ^ (_u_)>> 5) & ((1u<<HBITS)-1))
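#if 0 // Scalar sketch of the FCM predictor (illustrative, not compiled): the hash of the
      // recent value stream indexes a table holding the last value seen in that context;
      // the codec stores value XOR prediction, which is mostly zero bits on a hit.
static uint64_t fcm_residual(uint64_t u, uint64_t htab[1<<HBITS], unsigned *h) {
uint64_t r = u ^ htab[*h]; // residual against the context's prediction
htab[*h] = u;              // remember the actual value for this context
*h = HASH64(*h, u);        // slide the context hash forward
return r;
}
#endif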
size_t T2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t htab[1<<HBITS] = {0}, _p[VSIZE+64], *ip, *p;
unsigned h = 0, m;
unsigned char *op = out, *out_ = out+n*(USIZE/8);
#if defined(__AVX2__) && USIZE >= 32
#define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
__m256i sv = T2(_mm256_set1_epi, USIZE)(start);
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
#define _mm_set1_epi64(a) _mm_set1_epi64x(a)
__m128i sv = T2(_mm_set1_epi, USIZE)(start);
#endif
for(ip = in; ip != in + (n&~(VSIZE-1)); ) {
uint_t b = 0;
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, htab[h],_usize_); b |= p[_i_]; htab[h] = u; h = T2(HASH,_usize_)(h,u); }
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
*op++ = b = b?T2(clz,USIZE)(b):USIZE; // guard clz(0): a block of predictor hits gives b == 0
#if defined(__AVX2__) && USIZE >= 32
for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8)) {
__m256i v0 = _mm256_loadu_si256((__m256i *)p);
__m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8)));
v0 = T2(_mm256_slli_epi, USIZE)(v0,b);
v1 = T2(_mm256_slli_epi, USIZE)(v1,b);
v0 = T2( mm256_rbit_epi, USIZE)(v0);
v1 = T2( mm256_rbit_epi, USIZE)(v1);
_mm256_storeu_si256((__m256i *) p, v0);
_mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1);
}
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) {
__m128i v0 = _mm_loadu_si128((__m128i *) p);
__m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
v0 = T2(_mm_slli_epi, USIZE)(v0,b);
v0 = T2( mm_rbit_epi, USIZE)(v0);
v1 = T2(_mm_slli_epi, USIZE)(v1,b);
v1 = T2( mm_rbit_epi, USIZE)(v1);
_mm_storeu_si128((__m128i *) p, v0);
_mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), v1);
}
#else
#define TR(_i_,_usize_) p[_i_] = T2(rbit,_usize_)(p[_i_]<<b)
for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); }
#endif
op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); if(op >= out_) goto e;
}
if((m = (in+n)-ip) != 0) {
uint_t b = 0;
for(p = _p; p != &_p[m]; p++,ip++) FE(0,USIZE);
b = b?T2(clz,USIZE)(b):USIZE;
*op++ = b;
for(p = _p; p != &_p[m]; p++) TR(0,USIZE);
op = T2(P4ENC,USIZE)(_p, m, op); if(op >= out_) goto e;
}
if(op >= out_) { e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op += n*(USIZE/8); }
return op - out;
#undef FE
}
size_t T2(fpfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t *op, _p[VSIZE+32], htab[1<<HBITS] = {0}, *p;
unsigned h = 0;
unsigned char *ip = in;
if(*in == 0xff) { memcpy(out, in+1, n*(USIZE/8)); return 1+n*(USIZE/8); }
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = p[_i_]; u = T2(rbit,_usize_)(u)>>b;\
u = XORDEC(u, htab[h], _usize_); op[_i_] = u; htab[h] = u; h = T2(HASH,_usize_)(h,u);\
}
for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
unsigned b = *ip++; ip = T2(P4DECV,USIZE)(ip, VSIZE, _p);
for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
}
if((n = ((uint_t *)out+n) - op) != 0) {
unsigned b = *ip++; ip = T2(P4DEC,USIZE)(ip, n, _p);
for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
}
return ip - in;
#undef FD
}
//-------- TurboFloat DFCM: Differential Finite Context Method Predictor ----------------------------------------------------------
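#if 0 // Scalar sketch (illustrative, not compiled): DFCM stores deltas in the hash table
      // and predicts previous value + context delta, cf. the FE macro below.
static uint64_t dfcm_residual(uint64_t u, uint64_t *prev, uint64_t htab[1<<HBITS], unsigned *h) {
uint64_t r = u ^ (htab[*h] + *prev); // prediction: last value plus the delta stored for this context
uint64_t d = u - *prev;              // actual delta
htab[*h] = d; *h = HASH64(*h, d);    // remember it and advance the context hash
*prev = u;
return r;
}
#endif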
size_t T2(fpdfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip, _p[VSIZE+64], htab[1<<HBITS] = {0}, *p;
unsigned h = 0, m;
unsigned char *op = out, *out_ = out+n*(USIZE/8);
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, (htab[h]+start),_usize_); b |= p[_i_]; \
htab[h] = start = u - start; h = T2(HASH,_usize_)(h,start); start = u;\
}
for(ip = in; ip != in + (n&~(VSIZE-1)); ) {
uint_t b = 0;
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
#define TR(_i_,_usize_) p[_i_] = T2(rbit,_usize_)(p[_i_]<<b)
b = b?T2(clz,USIZE)(b):USIZE;
for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); }
*op++ = b;
op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); if(op >= out_) goto e;
}
if((m = (in+n)-ip) != 0) {
uint_t b = 0;
for(p = _p; p != &_p[m]; p++,ip++) FE(0,USIZE);
b = b?T2(clz,USIZE)(b):USIZE;
for(p = _p; p != &_p[m]; p++) TR(0,USIZE);
*op++ = b; op = T2(P4ENC,USIZE)(_p, m, op); if(op >= out_) goto e;
}
if(op >= out_) { e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op += n*(USIZE/8); }
return op - out;
#undef FE
}
size_t T2(fpdfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t _p[VSIZE+64], htab[1<<HBITS] = {0}, *op, *p;
unsigned h = 0;
unsigned char *ip = in;
if(*in == 0xff) { memcpy(out, in+1, n*(USIZE/8)); return 1+n*(USIZE/8); }
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = T2(rbit,_usize_)(p[_i_])>>b; u = XORDEC(u, (htab[h]+start),_usize_); \
op[_i_] = u; htab[h] = start = u-start; h = T2(HASH,_usize_)(h,start); start = u;\
}
for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
uint_t b = *ip++;
ip = T2(P4DECV,USIZE)(ip, VSIZE, _p);
for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
}
if((n = ((uint_t *)out+n) - op) != 0) {
uint_t b = *ip++;
ip = T2(P4DEC,USIZE)(ip, n, _p);
for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
}
return ip - in;
#undef FD
}
//-------- TurboFloat Double delta DFCM: Differential Finite Context Method Predictor ----------------------------------------------------------
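// Note: the predictor below is lag-2: the residual is u[i] ^ (htab[h] + u[i-2]) and the hash
// context is fed the stride-2 delta u[i] - u[i-2] (see the start/start0 swap in FE).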
size_t T2(fp2dfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip, _p[VSIZE+32], htab[1<<HBITS] = {0}, *p,start0 = start; start = 0;
unsigned h = 0,m;
unsigned char *op = out, *out_ = out+n*USIZE/8;
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, (htab[h]+start),_usize_); b |= p[_i_]; \
htab[h] = start = u - start; h = T2(HASH,_usize_)(h,start); start = start0; start0 = u;\
}
#define TR(_i_,_usize_) p[_i_] = T2(rbit,_usize_)(p[_i_]<<b)
for(ip = in; ip != in + (n&~(VSIZE-1)); ) {
uint_t b = 0;
for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
b = b?T2(clz,USIZE)(b):USIZE;
for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); }
*op++ = b; op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); if(op >= out_) goto e;
}
if((m = (in+n)-ip) != 0) {
uint_t b = 0;
for(p = _p; p != &_p[m]; p++,ip++) FE(0,USIZE);
b = b?T2(clz,USIZE)(b):USIZE; // guard clz(0) as in the main loop above
for(p = _p; p != &_p[m]; p++) TR(0,USIZE);
*op++ = b; op = T2(P4ENC,USIZE)(_p, m, op); if(op >= out_) goto e;
}
if(op >= out_) { e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op+=n*(USIZE/8); }
return op - out;
#undef FE
}
size_t T2(fp2dfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) {
uint_t _p[VSIZE+32], htab[1<<HBITS] = {0}, *op, *p, start0 = start; start = 0;
unsigned h = 0;
unsigned char *ip = in;
if(*ip == 0xff) { memcpy(out, in+1, n*(USIZE/8)); return 1+n*(USIZE/8); } // 1 escape byte + raw data
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = T2(rbit,_usize_)(p[_i_])>>b; u = XORDEC(u, (htab[h]+start),_usize_);\
op[_i_] = u; htab[h] = start = u-start; h = T2(HASH,_usize_)(h,start); start = start0; start0 = u;\
}
for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0);
uint_t b = *ip++;
ip = T2(P4DECV,USIZE)(ip, VSIZE, _p);
for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
}
if((n = ((uint_t *)out+n) - op) != 0) {
uint_t b = *ip++;
ip = T2(P4DEC,USIZE)(ip, n, _p);
for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE);
}
return ip - in;
#undef FD
}
#define BSIZE(_usize_) (_usize_==64?6:(_usize_==32?5:(_usize_==16?4:3)))
//-------- TurboGorilla: improved Gorilla-style floating point compression with bit I/O (see Facebook's Gorilla paper) ------------------------------------
#define bitput2(_bw_,_br_, _n1_, _n2_, _x_) {\
if(!_x_) bitput(_bw_,_br_, 1, 1);/*1*/\
else if( _x_ < (1<< (_n1_-1))) bitput(_bw_,_br_, _n1_+2,_x_<<2|2);/*10*/\
else bitput(_bw_,_br_, _n2_+2,_x_<<2 );/*00*/\
}
#define bitget2(_bw_,_br_, _n1_, _n2_, _x_) { _x_ = bitbw(_bw_,_br_);\
if(_x_ & 1) bitrmv(_bw_,_br_, 0+1), _x_ = 0;\
else if(_x_ & 2) bitrmv(_bw_,_br_,_n1_+2), _x_ = BZHI32(_x_>>2, _n1_);\
else bitrmv(_bw_,_br_,_n2_+2), _x_ = BZHI32(_x_>>2, _n2_);\
}
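// Worked example for the variable-length code above (illustrative; _n1_=N_0=4, _n2_=N_1=6
// in the 32-bit instance): x = 0 -> 1 bit "1"; x = 5 (< 1<<3) -> N_0+2 = 6 bits, tag "10"
// plus a 4-bit payload; x = 23 -> N_1+2 = 8 bits, tag "00" plus a 6-bit payload. The decoder
// tests the low tag bit first (x == 0), then the next bit to pick the payload width.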
size_t T2(fpgenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip = in;
unsigned ol = 0, ot = 0;
unsigned char *op = out, *out_ = out+n*USIZE/8;
bitdef(bw,br);
if(start) { ol = T2(clz,USIZE)(start); ot = T2(ctz,USIZE)(start); }
*op++ = 0;
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) z = XORENC(ip[_i_], start,_usize_); start = ip[_i_];\
if(unlikely(!z)) bitput( bw,br, 2, 2); /*01*/\
else {\
unsigned t = T2(ctz,_usize_)(z), l = /*lz2l[*/T2(clz,_usize_)(z);\
unsigned s = _usize_ - l - t, os = _usize_ - ol - ot;\
if(l >= ol && t >= ot && os < 6+5+s) { bitput( bw,br, 1, 1); /*1*/ T2(bitput,_usize_)(bw,br, os, z>>ot,op); }\
else { /*00*/ /*bitput( bw,br, 2, 0); bitput(bw,br, 3, lz2c(l));*/\
bitput( bw,br, 2+BSIZE(_usize_), l<<2); bitput2(bw,br, N_0, N_1, t); bitenorm(bw,br,op);T2(bitput,_usize_)(bw,br, s, z>>t,op); ol = l; ot = t; }\
} bitenorm(bw,br,op); if(op >= out_) goto e;\
}
for(; ip != in + (n&~(4-1)); ip+=4) { PREFETCH(ip+512,0); FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
for(; ip != in + n ; ip++) FE(0,USIZE);
bitflush(bw,br,op);
if(op >= out_) {
e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op+=n*(USIZE/8);
}
return op - out;
#undef FE
}
size_t T2(fpgdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op;
int ol = 0,ot = 0,x;
unsigned char *ip = in;
bitdef(bw,br);
if(*ip++ == 0xff) { memcpy(out, in+1, n*(USIZE/8)); return 1+n*(USIZE/8); } // 1 escape byte + raw data
if(start) { ol = T2(clz,USIZE)(start); ot = T2(ctz,USIZE)(start); }
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) z=0; unsigned _x,_y; BITGET32(bw,br,1,_x); \
if(likely(!_x)) { BITGET32(bw,br,1,_y); _x = _y << 1 | _x; }\
if(_x < 2) { \
if(!_x) { /*BITGET32(bw,br, 3, ol); ol = c2l(ol);*/BITGET32(bw,br,BSIZE(_usize_),ol); bitget2(bw,br, N_0, N_1, ot); bitdnorm(bw,br,ip); }\
T2(bitget,_usize_)(bw,br,_usize_ - ol - ot,z,ip);\
z<<=ot;\
} op[_i_] = start = XORDEC(z, start,_usize_); bitdnorm(bw,br,ip);\
}
for(bitdnorm(bw,br,ip),op = out; op != out+(n&~(4-1)); op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); PREFETCH(ip+512,0); }
for( ; op != out+n; op++) FD(0,USIZE);
bitalign(bw,br,ip);
return ip - in;
#undef FD
}
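#if 0 // Usage sketch (illustrative, not compiled): round trip through the TurboGorilla codec.
      // fpgenc64/fpgdec64 are the USIZE=64 instances generated by the includes above; the
      // output buffer bound 1+n*8 matches the incompressible fallback.
#include <string.h>
static int fpg_roundtrip64(uint64_t *v, size_t n, unsigned char *tmp /* >= 1+n*8 */, uint64_t *dec) {
size_t clen = fpgenc64(v, n, tmp, 0);   // bytes written
size_t dlen = fpgdec64(tmp, n, dec, 0); // bytes consumed: must equal clen
return clen == dlen && !memcmp(v, dec, n*sizeof(v[0]));
}
#endif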
//--- Chimp: Efficient Lossless Floating Point Compression for Time Series Databases - https://www.vldb.org/pvldb/vol15/p3058-liakos.pdf
size_t T2(fphenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip = in;
int ol = USIZE+1,s;
unsigned char *op = out, *out_ = out+n*USIZE/8;
*op++ = 0;
bitdef(bw,br);
if(start) ol = T2(clz,USIZE)(start);
#define FE(_i_,_usize_) { T3(uint, _usize_, _t) z = XORENC(ip[_i_], start,_usize_); start = ip[_i_];\
if(unlikely(!z)) { ol = _usize_+1; bitput(bw,br, 2, 0); }\
else { int t = T2(ctz,_usize_)(z), l = lz2l(T2(clz,_usize_)(z));\
if(t > BSIZE(_usize_)) { s = _usize_ - l - t; ol = _usize_+1; bitput(bw,br, 2, 1); bitput(bw,br, CCODEL, l2c(l)); bitput(bw,br, BSIZE(_usize_), s); z>>=t; \
bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op); \
} else if (l == ol) { s = _usize_ - l; bitput(bw,br, 2, 2);\
bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op);\
} else { s = _usize_ - l; ol = l; bitput(bw,br, 2, 3); bitput(bw,br, CCODEL, l2c(l));\
bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op); }\
} bitenorm(bw,br,op); if(op >= out_) goto e;\
}
for(; ip != in + (n&~(4-1)); ip+=4) { PREFETCH(ip+512,0); FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
for(; ip != in + n ; ip++) FE(0,USIZE);
bitflush(bw,br,op);
if(op >= out_) {
e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op+=n*(USIZE/8);
}
return op - out;
#undef FE
}
size_t T2(fphdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op = out;
int ol = 0,s;
unsigned char *ip = in;
bitdef(bw,br);
if(*ip++ == 0xff) { memcpy(out, in+1, n*(USIZE/8)); return 1+n*(USIZE/8); } // 1 escape byte + raw data
if(start) ol = T2(clz,USIZE)(start);
#define FD(_i_,_usize_) { T3(uint, _usize_, _t) z=0; unsigned _x; bitdnorm(bw,br,ip); BITGET32(bw,br,2,_x);\
switch(_x) {\
case 3: BITGET32(bw,br, CCODEL, ol); ol = c2l(ol); bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br,(_usize_-ol), z, ip); s = _usize_ == ol?_usize_:_usize_ - ol;\
break;\
case 2: bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br,(_usize_-ol), z, ip); s = _usize_ == ol?_usize_:_usize_ - ol;\
break;\
case 1: BITGET32(bw,br, CCODEL, ol); ol = c2l(ol); BITGET32(bw,br,BSIZE(_usize_), s); s = s?s:_usize_;\
int ot = _usize_ - s - ol;\
bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br, (_usize_ - ol - ot), z, ip); z <<= ot;\
break;\
case 0: ;\
} op[_i_] = start = XORDEC(z, start,_usize_);\
}
bitdnorm(bw,br,ip);
for(; op != out+(n&~(4-1)); op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); PREFETCH(ip+512,0); }
for(; op != out+n; op++) FD(0,USIZE);
bitalign(bw,br,ip);
return ip - in;
#undef FD
}
//-- TurboFloatLz (LZ77-like parsing) --------------------------------------
#if USIZE == 64
#define VA_BITS 7 // window size for 64 bits (integer/double)
#define FX_BITS 14
#elif USIZE == 32
#define VA_BITS 4 // 32 bits (integer/float)
#define FX_BITS 12
#else
#define VA_BITS 4 // 16 bits (integer/float)
#define FX_BITS 10
#endif
#define CTZMIN(_usize_) (VA_BITS+BSIZE(_usize_))
#define HASH(_x_) (_x_ & ((1<<FX_BITS)-1))
#define HASH16(_h_,_u_) HASH(_u_)
#define HASH32(_h_,_u_) HASH(_u_) //(((_h_) ^ (_u_)>>23) & ((1<<FX_BITS)-1))
#define HASH64(_h_,_u_) HASH(_u_) //(((_h_) ^ (_u_)>>50) & ((1<<FX_BITS)-1))
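// Match search sketch (illustrative): each value is hashed to find an earlier occurrence
// within the last (1<<VA_BITS) positions; if the XOR with that candidate has more than
// CTZMIN trailing zero bits, an (offset, residual) pair is coded, otherwise the value is
// XORed with its immediate predecessor as in the Gorilla-style paths above.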
size_t T2(fpc0enc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { if(!n) return 0; // simple hash table; guard n==0: the loop below preloads *ip++
uint_t *ip = in;
unsigned char *op = out, *out_ = out+n*USIZE/8;
unsigned htab[1<<FX_BITS] = {0}, ol = USIZE+1;
bitdef(bw,br); //if(start) ol = T2(clz,USIZE)(start);
for(*op++ = 0,*(uint_t *)op = *ip++, op += sizeof(uint_t); ip != in + n; ip++) {
uint_t u = *ip, z;
unsigned hu = T2(HASH,USIZE)(u,u), h = htab[hu], ofs = (ip-in) - h - 1, c = 0, ctz;
htab[hu] = ip - in;
if(ofs < (1<<VA_BITS)) {
ctz = T2(ctz,USIZE)(z = u ^ in[h]);
ofs = (c = ctz > CTZMIN(USIZE))?ofs:0;
}
z = c?z:u ^ ip[-1];
if(z) {
unsigned l = lz2l(T2(clz,USIZE)(z)), t;
if(c) {
t = USIZE - l - ctz;
bitput(bw,br, (VA_BITS+CCODEL+BSIZE(USIZE)+2), ofs << (CCODEL+BSIZE(USIZE)+2) | (unsigned)l2c(l) << (BSIZE(USIZE)+2) | t<<2);
bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, t, z >> ctz, op);
ol = USIZE+1;
} else {
if(l == ol) { bitput(bw,br, 2, 1); }
else { bitput(bw,br, CCODEL+2, l2c(l)<<2|2); ol = l; }
bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, USIZE - l, z, op);
}
} else { bitput(bw,br,VA_BITS+2, ofs<<2|3 ); ol = USIZE+1; }
bitenorm(bw,br,op); if(op >= out_) goto e;
}
bitflush(bw,br,op);
if(op >= out_) {
e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op+=n*(USIZE/8);
}
return op - out;
}
size_t T2(fpcenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { if(!n) return 0; // double-entry hash table: two candidate matches per bucket
uint_t *ip = in;
unsigned char *op = out, *out_ = out+n*(USIZE/8);
unsigned htab[1<<(FX_BITS+1)] = {0}, ol = USIZE+1;
bitdef(bw,br); //if(start) ol = T2(clz,USIZE)(start);
for(*op++ = 0,*(uint_t *)op = *ip++, op += sizeof(uint_t); ip != in + n; ip++) {
uint_t u = *ip, z;
unsigned hu = T2(HASH,USIZE)(u,u)<<1, h = htab[hu], ofs = (ip-in) - h - 1, c = 0, ctz = 0;
htab[hu+1] = h, htab[hu] = ip - in;
if(ofs < (1<<VA_BITS)) {
ctz = T2(ctz,USIZE)(z = u ^ in[h]);
unsigned h1 = htab[hu+1], ofs1 = (ip-in) - h1 - 1;
if(ofs1 < (1<<VA_BITS)) {
uint_t z1;
unsigned ctz1 = T2(ctz,USIZE)(z1 = u ^ in[h1]), c1 = 0;
if(ctz1 > ctz) { ofs = ofs1; c = c1; z = z1; ctz = ctz1; }
}
ofs = (c = ctz > CTZMIN(USIZE))?ofs:0;
}
z = c?z:u ^ ip[-1];
if(z) {
unsigned l = lz2l(T2(clz,USIZE)(z)), t;
if(c) {
t = USIZE - l - ctz;
bitput(bw,br, (VA_BITS+CCODEL+BSIZE(USIZE)+2), ofs << (CCODEL+BSIZE(USIZE)+2) | (unsigned)l2c(l) << (BSIZE(USIZE)+2) | t<<2);
bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, t, z >> ctz, op);
ol = USIZE+1;
} else {
if(l == ol) { bitput(bw,br, 2, 1); }
else { bitput(bw,br, CCODEL+2, l2c(l)<<2|2); ol = l; }
bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, USIZE - l, z, op);
}
} else { bitput(bw,br,VA_BITS+2, ofs<<2|3 ); ol = USIZE+1; }
bitenorm(bw,br,op); if(op >= out_) goto e;
}
bitflush(bw,br,op);
if(op >= out_) {
e:op = out; *op++ = 0xff; memcpy(op, in, n*(USIZE/8)); op+=n*(USIZE/8);
}
return op - out;
}
size_t T2(fpcdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op = out, v = 0, z;
unsigned char *ip = in;
unsigned ol = USIZE+1, t;
bitdef(bw,br); //if(start) ol = T2(clz,USIZE)(start);
if(*ip++ == 0xff) { memcpy(out, in+1, n*(USIZE/8)); return 1+n*(USIZE/8); } // 1 escape byte + raw data
for(v = *op++ = *(uint_t *)ip, ip += sizeof(uint_t); op != out+n; op++) {
bitdnorm(bw,br,ip); BITGET32(bw,br, 2, t);
switch(t) {
case 0:
BITGET32(bw,br, VA_BITS+CCODEL+BSIZE(USIZE), t);
ol = c2l(bextr32(t, BSIZE(USIZE), CCODEL));
v = *(op - bextr32(t, BSIZE(USIZE)+CCODEL, VA_BITS)-1);
t &= ((1<<BSIZE(USIZE))-1);
t = t?t:USIZE;
bitdnorm(bw,br,ip); T2(bitget,USIZE)(bw,br, t, z, ip);
v ^= z << (USIZE - t - ol);
break;
case 1: bitdnorm(bw,br,ip); T2(bitget,USIZE)(bw,br, USIZE - ol, z, ip); v ^= z; break;
case 2: BITGET32(bw,br, CCODEL, ol); ol = c2l(ol); bitdnorm(bw,br,ip); T2(bitget,USIZE)(bw,br, USIZE - ol, z, ip); v ^= z; break;
case 3: BITGET32(bw,br,VA_BITS,t); v = *(op-t-1); break;
}
*op = v;
}
bitalign(bw,br,ip);
return ip - in;
}
//------ Zigzag of zigzag for timestamps with bitio ------------------------------------------------------------------------------------------
// Improved Gorilla-style compression: sliding zigzag of delta + RLE + overflow handling for timestamps in time series.
// More than 300 times better compression than Gorilla-style timestamp encoding, and several times faster.
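// Worked example (illustrative): timestamps 1000,1060,1120,1180,... have a constant
// first-order delta of 60, so the zigzagged delta-of-delta stream is 0,0,0,...; runs up
// to NL cost one bit per value and longer runs collapse into a single length code.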
#define OVERFLOW if(op >= out_) { *out++ = 1<<4; /*bitini(bw,br); bitput(bw,br,4+3,1<<4); bitflush(bw,br,out);*/ memcpy(out,in,n*sizeof(in[0])); return 1+n*sizeof(in[0]); }
size_t T2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip = in, pd = 0, *pp = in,dd;
unsigned char *op = out, *out_ = out+n*sizeof(in[0]);
bitdef(bw,br);
#define FE(_pp_, _ip_, _d_, _op_,_usize_) do {\
uint64_t _r = _ip_ - _pp_;\
if(_r > NL) { _r -= NL; unsigned _b = (bsr64(_r)+7)>>3; bitput(bw,br,4+3+3,(_b-1)<<(4+3)); bitput64(bw,br,_b<<3, _r, _op_); bitenorm(bw,br,_op_); }\
else while(_r--) { bitput(bw,br,1,1); bitenorm(bw,br,_op_); }\
_d_ = T2(zigzagenc,_usize_)(_d_);\
if(!_d_) bitput(bw,br, 1, 1);\
else if(_d_ < (1<< (N2-1))) bitput(bw,br, N2+2,_d_<<2|2);\
else if(_d_ < (1<< (N3-1))) bitput(bw,br, N3+3,_d_<<3|4);\
else if(_d_ < (1<< (N4-1))) bitput(bw,br, N4+4,_d_<<4|8);\
else { unsigned _b = (T2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); T2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\
bitenorm(bw,br,_op_);\
} while(0)
if(n > 4)
for(; ip < in+(n-1-4);) {
start = ip[0] - start; dd = start-pd; pd = start; start = ip[0]; if(dd) goto a; ip++;
start = ip[0] - start; dd = start-pd; pd = start; start = ip[0]; if(dd) goto a; ip++;
start = ip[0] - start; dd = start-pd; pd = start; start = ip[0]; if(dd) goto a; ip++;
start = ip[0] - start; dd = start-pd; pd = start; start = ip[0]; if(dd) goto a; ip++; PREFETCH(ip+256,0);
continue;
a:;
FE(pp,ip, dd, op,USIZE);
pp = ++ip; OVERFLOW;
}
for(;ip < in+n;) {
start = ip[0] - start; dd = start-pd; pd = start; start = ip[0]; if(dd) goto b; ip++;
continue;
b:;
FE(pp,ip, dd, op,USIZE);
pp = ++ip; OVERFLOW;
}
if(ip > pp) { // flush the pending zero-dd run; dd = 0 avoids reading ip[0] == in[n] out of bounds
dd = 0;
FE(pp, ip, dd, op, USIZE); OVERFLOW;
}
bitflush(bw,br,op);
return op - out;
#undef FE
}
size_t T2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op = out, pd = 0;
unsigned char *ip = in;
bitdef(bw,br);
for(bitdnorm(bw,br,ip); op < out+n; ) { PREFETCH(ip+384,0);
#if USIZE == 64
uint_t dd = bitbw(bw,br);
#else
uint32_t dd = bitbw(bw,br);
#endif
if(dd & 1) bitrmv(bw,br, 0+1), dd = 0;
else if(dd & 2) bitrmv(bw,br,N2+2), dd = BZHI32(dd>>2, N2);
else if(dd & 4) bitrmv(bw,br,N3+3), dd = BZHI32(dd>>3, N3);
else if(dd & 8) bitrmv(bw,br,N4+4), dd = BZHI32(dd>>4, N4);
else {
unsigned b; uint_t *_op; uint64_t r;
BITGET32(bw,br, 4+3, b);
if((b>>=4) <= 1) {
if(b==1) { // No compression, because of overflow
memcpy(out,in+1, n*sizeof(out[0]));
return 1+n*sizeof(out[0]);
}
BITGET32(bw,br,3,b); bitget32(bw,br,(b+1)<<3,r,ip); bitdnorm(bw,br,ip);//RLE //r+=NL; while(r--) *op++=(start+=pd);
#if (defined(__SSE2__) /*|| defined(__ARM_NEON) || defined(__loongarch_lp64)*/) && USIZE == 32
__m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4*pd,3*pd,2*pd,1*pd);
for(r += NL, _op = op; op != _op+(r&~7);) {
sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; //_mm_shuffle_epi32(sv, _MM_SHUFFLE(3, 3, 3, 3))->mm_shuffle_nnnn_epi32(sv, 3)
sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4;
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
#else
for(r+=NL, _op = op; op != _op+(r&~7); op += 8)
op[0]=(start+=pd),
op[1]=(start+=pd),
op[2]=(start+=pd),
op[3]=(start+=pd),
op[4]=(start+=pd),
op[5]=(start+=pd),
op[6]=(start+=pd),
op[7]=(start+=pd);
#endif
for(; op != _op+r; op++)
*op = (start+=pd);
continue;
}
T2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip);
}
pd += T2(zigzagdec,USIZE)(dd);
*op++ = (start += pd);
bitdnorm(bw,br,ip);
}
bitalign(bw,br,ip);
return ip - in;
}
//-------- Zigzag with bit/io + RLE --------------------------------------------------------------------------
size_t T2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) {
uint_t *ip = in, *pp = in,dd;
unsigned char *op = out, *out_ = out+n*sizeof(in[0]);
bitdef(bw,br);
#define FE(_pp_, _ip_, _d_, _op_,_usize_) do {\
uint64_t _r = _ip_ - _pp_;\
if(_r > NL) { _r -= NL; unsigned _b = (bsr64(_r)+7)>>3; bitput(bw,br,4+3+3,(_b-1)<<(4+3)); bitput64(bw,br,_b<<3, _r, _op_); bitenorm(bw,br,_op_); }\
else while(_r--) { bitput(bw,br,1,1); bitenorm(bw,br,_op_); }\
_d_ = T2(zigzagenc,_usize_)(_d_);\
if(!_d_) bitput(bw,br, 1, 1);\
else if(_d_ < (1<< (N2-1))) bitput(bw,br, N2+2,_d_<<2|2);\
else if(_d_ < (1<< (N3-1))) bitput(bw,br, N3+3,_d_<<3|4);\
else if(_d_ < (1<< (N4-1))) bitput(bw,br, N4+4,_d_<<4|8);\
else { unsigned _b = (T2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); T2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\
bitenorm(bw,br,_op_);\
} while(0)
if(n > 4)
for(; ip < in+(n-1-4);) {
dd = ip[0] - start; start = ip[0]; if(dd) goto a; ip++;
dd = ip[0] - start; start = ip[0]; if(dd) goto a; ip++;
dd = ip[0] - start; start = ip[0]; if(dd) goto a; ip++;
dd = ip[0] - start; start = ip[0]; if(dd) goto a; ip++; PREFETCH(ip+256,0);
continue;
a:;
FE(pp,ip, dd, op,USIZE);
pp = ++ip; OVERFLOW;
}
for(;ip < in+n;) {
dd = ip[0] - start; start = ip[0]; if(dd) goto b; ip++;
continue;
b:;
FE(pp,ip, dd, op,USIZE);
pp = ++ip; OVERFLOW;
}
if(ip > pp) { // flush the pending zero-delta run; dd = 0 avoids reading ip[0] == in[n] out of bounds
dd = 0;
FE(pp, ip, dd, op, USIZE); OVERFLOW;
}
bitflush(bw,br,op);
return op - out;
#undef FE
}
size_t T2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0;
uint_t *op = out;
unsigned char *ip = in;
bitdef(bw,br);
for(bitdnorm(bw,br,ip); op < out+n; ) { PREFETCH(ip+384,0);
#if USIZE == 64
uint_t dd = bitbw(bw,br);
#else
uint32_t dd = bitbw(bw,br);
#endif
if(dd & 1) bitrmv(bw,br, 0+1), dd = 0;
else if(dd & 2) bitrmv(bw,br,N2+2), dd = BZHI32(dd>>2, N2);
else if(dd & 4) bitrmv(bw,br,N3+3), dd = BZHI32(dd>>3, N3);
else if(dd & 8) bitrmv(bw,br,N4+4), dd = BZHI32(dd>>4, N4);
else {
unsigned b; uint_t *_op; uint64_t r;
BITGET32(bw,br, 4+3, b);
if((b>>=4) <= 1) {
if(b==1) { // No compression, because of overflow
memcpy(out,in+1, n*sizeof(out[0]));
return 1+n*sizeof(out[0]);
}
BITGET32(bw,br,3,b); bitget32(bw,br,(b+1)<<3,r,ip); bitdnorm(bw,br,ip);//RLE //r+=NL; while(r--) *op++=(start+=pd);
#if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
__m128i sv = _mm_set1_epi32(start);
for(r += NL, _op = op; op != _op+(r&~7);) {
_mm_storeu_si128((__m128i *)op, sv); op += 4;
_mm_storeu_si128((__m128i *)op, sv); op += 4;
}
#else
for(r+=NL, _op = op; op != _op+(r&~7); op += 8)
op[0]=op[1]=op[2]=op[3]=op[4]=op[5]=op[6]=op[7]=start;
#endif
for(; op != _op+r; op++)
*op = start;
continue;
}
T2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip);
}
dd = T2(zigzagdec,USIZE)(dd);
*op++ = (start += dd);
bitdnorm(bw,br,ip);
}
bitalign(bw,br,ip);
return ip - in;
}
#undef USIZE
#endif