diff --git a/fp.c b/fp.c index 36f4852..c61b6d9 100644 --- a/fp.c +++ b/fp.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2019 + Copyright (C) powturbo 2013-2023 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,17 +21,39 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// "Floating Point + Integer Compression (All integer compression functions can be used for float/double and vice versa)" +// "Floating Point + Integer Compression (All integer compression functions can be used for float/double and vice versa)" #ifndef USIZE +#include +#include "include_/conf.h" +#include "include_/bitutil.h" +#include "include_/fp.h" +#include "include_/vp4.h" + +#include "include_/bitutil_.h" + #pragma warning( disable : 4005) #pragma warning( disable : 4090) #pragma warning( disable : 4068) -#define BITUTIL_IN -#include "conf.h" -#include "vp4.h" -#include "bitutil.h" -#include "fp.h" +#define CCODEL 3 +#define lz2l(_x_) cquant[_x_] // quantize leading zeros +#define l2c( _x_) ccode[_x_] // encode +#define c2l( _x_) dcode[_x_] // decode + +static unsigned char ccode[] = { // optimized for 32/64 bits + 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, + 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +}; +static unsigned char cquant[] = { + 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 10, 10, 10, 10, 14, 14, + 14, 14, 18, 18, 20, 20, 22, 22, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 +}; +static unsigned char dcode[] = {0, 6,10, 14, 18, 20, 22, 24}; + //---------------------- template generation -------------------------------------------- #define VSIZE 128 @@ -50,6 +72,12 @@ #define N3 5 #define USIZE 8 #include "fp.c" +#undef P4ENCV +#undef P4DECV +#undef N_0 +#undef N_1 +#undef N2 +#undef N3 #define P4ENCV p4enc128v #define P4DECV p4dec128v @@ -61,6 +89,10 @@ #define N3 12 #define USIZE 16 #include "fp.c" +#undef N_0 +#undef N_1 +#undef N2 +#undef N3 #define N_0 4 #define N_1 6 @@ -69,6 +101,10 @@ #define N3 10 #define USIZE 32 #include "fp.c" +#undef N4 +#undef N1 +#undef N3 +#undef N_1 #define N_1 7 #define N2 6 // for seconds/milliseconds,... time series @@ -81,95 +117,98 @@ #define XORENC( _u_, _pu_, _usize_) ((_u_)^(_pu_)) // xor predictor #define XORDEC( _u_, _pu_, _usize_) ((_u_)^(_pu_)) -#define ZZAGENC(_u_, _pu_, _usize_) TEMPLATE2(zigzagenc,_usize_)((_u_)-(_pu_)) //zigzag predictor -#define ZZAGDEC(_u_, _pu_, _usize_) (TEMPLATE2(zigzagdec,_usize_)(_u_)+(_pu_)) +#define ZZAGENC(_u_, _pu_, _usize_) T2(zigzagenc,_usize_)((_u_)-(_pu_)) //zigzag predictor +#define ZZAGDEC(_u_, _pu_, _usize_) (T2(zigzagdec,_usize_)(_u_)+(_pu_)) -#define uint_t TEMPLATE3(uint, USIZE, _t) -#define int_t TEMPLATE3(int, USIZE, _t) +#define uint_t T3(uint, USIZE, _t) +#define int_t T3(int, USIZE, _t) //-------- TurboPFor Zigzag of zigzag for unsorted/sorted integer/floating point array --------------------------------------- -size_t TEMPLATE2(p4nzzenc128v,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { +size_t T2(p4nzzenc128v,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { uint_t _p[VSIZE+32], *ip, *p, pd = 0; unsigned char *op = out; - #define FE(i,_usize_) { TEMPLATE3(uint, USIZE, _t) u = ip[i]; start = u-start; p[i] = ZZAGENC(start,pd,_usize_); pd = start; start = u; } + #define FE(_i_,_usize_) { T3(uint, USIZE, _t) u = ip[_i_]; start = u-start; p[_i_] = ZZAGENC(start,pd,_usize_); pd = start; start = u; } for(ip = in; ip != in + (n&~(VSIZE-1)); ) { for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); } - op = TEMPLATE2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); + op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); } if(n = (in+n)-ip) { for(p = _p; p != &_p[n]; p++,ip++) FE(0,USIZE); - op = TEMPLATE2(P4ENC,USIZE)(_p, n, op); + op = T2(P4ENC,USIZE)(_p, n, op); } return op - out; + #undef FE } -size_t TEMPLATE2(p4nzzdec128v,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { +size_t T2(p4nzzdec128v,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { uint_t _p[VSIZE+32],*p, *op, pd=0; unsigned char *ip = in; - #define FD(i,_usize_) { TEMPLATE3(uint, USIZE, _t) u = ZZAGDEC(p[i],start+pd,_usize_); op[i] = u; pd = u - start; start = u; } + #define FD(_i_,_usize_) { T3(uint, USIZE, _t) u = ZZAGDEC(p[_i_],start+pd,_usize_); op[_i_] = u; pd = u - start; start = u; } for(op = out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0); - for(ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p), p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); } + for(ip = T2(P4DECV,USIZE)(ip, VSIZE, _p), p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); } } if(n = (out+n) - op) - for(ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p), p = _p; p != &_p[n]; p++,op++) FD(0,USIZE); + for(ip = T2(P4DEC,USIZE)(ip, n, _p), p = _p; p != &_p[n]; p++,op++) FD(0,USIZE); return ip - in; + #undef FD } /*---------------- TurboFloat XOR: last value Predictor with TurboPFor --------------------------------------------------------- - Compress significantly (115% - 160%) better than Facebook's Gorilla algorithm for values + Compress significantly (115% - 160%) better and faster than facebook's Gorilla algorithm for values BEST results are obtained with LOSSY COMPRESSION (using fppad32/fppad64 in bitutil.c) + Algorithm: 1: XOR value with previous value. We have now leading (for common sign/exponent bits) + mantissa trailing zero bits 2: Eliminate the common block leading zeros of sign/exponent by shifting all values in the block to left 3: reverse values to bring the mantissa trailing zero bits to left for better compression with TurboPFor */ -size_t TEMPLATE2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { +size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { uint_t _p[VSIZE+32], *ip, *p; unsigned char *op = out; #if defined(__AVX2__) && USIZE >= 32 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a) - __m256i sv = TEMPLATE2(_mm256_set1_epi, USIZE)(start); + __m256i sv = T2(_mm256_set1_epi, USIZE)(start); #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) #define _mm_set1_epi64(a) _mm_set1_epi64x(a) - __m128i sv = TEMPLATE2(_mm_set1_epi, USIZE)(start); + __m128i sv = T2(_mm_set1_epi, USIZE)(start); #endif - #define FE(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = ip[i]; p[i] = XORENC(u, start,_usize_); b |= p[i]; start = u; } + #define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, start,_usize_); b |= p[_i_]; start = u; } for(ip = in; ip != in + (n&~(VSIZE-1)); ) { uint_t b = 0; #if defined(__AVX2__) && USIZE >= 32 __m256i bv = _mm256_setzero_si256(); for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8),ip+=64/(USIZE/8)) { __m256i v0 = _mm256_loadu_si256((__m256i *) ip); __m256i v1 = _mm256_loadu_si256((__m256i *)(ip+32/(USIZE/8))); - sv = TEMPLATE2(mm256_xore_epi, USIZE)(v0,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *) p, sv); sv = v0; - sv = TEMPLATE2(mm256_xore_epi, USIZE)(v1,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), sv); sv = v1; + sv = T2(mm256_xore_epi, USIZE)(v0,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *) p, sv); sv = v0; + sv = T2(mm256_xore_epi, USIZE)(v1,sv); bv = _mm256_or_si256(bv, sv); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), sv); sv = v1; } - start = (uint_t)TEMPLATE2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1); - b = TEMPLATE2(mm256_hor_epi, USIZE)(bv); + start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1); + b = T2(mm256_hor_epi, USIZE)(bv); #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) __m128i bv = _mm_setzero_si128(); for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),ip+=32/(USIZE/8)) { __m128i v0 = _mm_loadu_si128((__m128i *) ip); __m128i v1 = _mm_loadu_si128((__m128i *)(ip+16/(USIZE/8))); - sv = TEMPLATE2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p, sv); sv = v0; - sv = TEMPLATE2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1; + sv = T2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p, sv); sv = v0; + sv = T2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1; } - start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8)); - b = TEMPLATE2(mm_hor_epi, USIZE)(bv); + start = (uint_t)T2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8)); + b = T2(mm_hor_epi, USIZE)(bv); #else for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); } #endif - *op++ = b = TEMPLATE2(clz,USIZE)(b); - #define TR(i,_usize_) p[i] = TEMPLATE2(rbit,_usize_)(p[i]<= 32 for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8)) { __m256i v0 = _mm256_loadu_si256((__m256i *)p); __m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8))); - v0 = TEMPLATE2(_mm256_slli_epi, USIZE)(v0,b); - v1 = TEMPLATE2(_mm256_slli_epi, USIZE)(v1,b); - v0 = TEMPLATE2( mm256_rbit_epi, USIZE)(v0); - v1 = TEMPLATE2( mm256_rbit_epi, USIZE)(v1); + v0 = T2(_mm256_slli_epi, USIZE)(v0,b); + v1 = T2(_mm256_slli_epi, USIZE)(v1,b); + v0 = T2( mm256_rbit_epi, USIZE)(v0); + v1 = T2( mm256_rbit_epi, USIZE)(v1); _mm256_storeu_si256((__m256i *) p, v0); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1); } @@ -177,79 +216,81 @@ size_t TEMPLATE2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) { __m128i v0 = _mm_loadu_si128((__m128i *) p); __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8))); - v0 = TEMPLATE2(_mm_slli_epi, USIZE)(v0,b); - v0 = TEMPLATE2( mm_rbit_epi, USIZE)(v0); - v1 = TEMPLATE2(_mm_slli_epi, USIZE)(v1,b); - v1 = TEMPLATE2( mm_rbit_epi, USIZE)(v1); + v0 = T2(_mm_slli_epi, USIZE)(v0,b); + v0 = T2( mm_rbit_epi, USIZE)(v0); + v1 = T2(_mm_slli_epi, USIZE)(v1,b); + v1 = T2( mm_rbit_epi, USIZE)(v1); _mm_storeu_si128((__m128i *) p, v0); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), v1); } #else for(p = _p; p != &_p[VSIZE]; p+=4) { TR(0,USIZE); TR(1,USIZE); TR(2,USIZE); TR(3,USIZE); } #endif - op = TEMPLATE2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); + op = T2(P4ENCV,USIZE)(_p, VSIZE, op); PREFETCH(ip+512,0); } if(n = (in+n)-ip) { uint_t b = 0; for(p = _p; p != &_p[n]; p++,ip++) FE(0,USIZE); - b = TEMPLATE2(clz,USIZE)(b); + b = T2(clz,USIZE)(b); *op++ = b; for(p = _p; p != &_p[n]; p++) TR(0,USIZE); - op = TEMPLATE2(P4ENC,USIZE)(_p, n, op); + op = T2(P4ENC,USIZE)(_p, n, op); } return op - out; + #undef FE } -size_t TEMPLATE2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { +size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { uint_t *op, _p[VSIZE+32],*p; unsigned char *ip = in; #if defined(__AVX2__) && USIZE >= 32 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a) - __m256i sv = TEMPLATE2(_mm256_set1_epi, USIZE)(start); + __m256i sv = T2(_mm256_set1_epi, USIZE)(start); #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) #define _mm_set1_epi64(a) _mm_set1_epi64x(a) - __m128i sv = TEMPLATE2(_mm_set1_epi, USIZE)(start); + __m128i sv = T2(_mm_set1_epi, USIZE)(start); #endif - #define FD(i,_usize_) { TEMPLATE3(uint, USIZE, _t) u = p[i]; u = TEMPLATE2(rbit,_usize_)(u)>>b; u = XORDEC(u, start,_usize_); op[i] = start = u; } - for(op = out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0); - unsigned b = *ip++; ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p); + #define FD(_i_,_usize_) { T3(uint, USIZE, _t) u = p[_i_]; u = T2(rbit,_usize_)(u)>>b; u = XORDEC(u, start,_usize_); op[_i_] = start = u; } + for(op = out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+384,0); + unsigned b = *ip++; ip = T2(P4DECV,USIZE)(ip, VSIZE, _p); #if defined(__AVX2__) && USIZE >= 32 for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8),op+=64/(USIZE/8)) { __m256i v0 = _mm256_loadu_si256((__m256i *)p); __m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8))); - v0 = TEMPLATE2( mm256_rbit_epi, USIZE)(v0); - v1 = TEMPLATE2( mm256_rbit_epi, USIZE)(v1); - v0 = TEMPLATE2(_mm256_srli_epi, USIZE)(v0,b); - v1 = TEMPLATE2(_mm256_srli_epi, USIZE)(v1,b); - v0 = TEMPLATE2( mm256_xord_epi, USIZE)(v0,sv); - sv = TEMPLATE2( mm256_xord_epi, USIZE)(v1,v0); + v0 = T2( mm256_rbit_epi, USIZE)(v0); + v1 = T2( mm256_rbit_epi, USIZE)(v1); + v0 = T2(_mm256_srli_epi, USIZE)(v0,b); + v1 = T2(_mm256_srli_epi, USIZE)(v1,b); + v0 = T2( mm256_xord_epi, USIZE)(v0,sv); + sv = T2( mm256_xord_epi, USIZE)(v1,v0); _mm256_storeu_si256((__m256i *)op, v0); _mm256_storeu_si256((__m256i *)(op+32/(USIZE/8)), sv); } - start = (uint_t)TEMPLATE2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1); + start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1); #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),op+=32/(USIZE/8)) { __m128i v0 = _mm_loadu_si128((__m128i *)p); __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8))); - v0 = TEMPLATE2( mm_rbit_epi, USIZE)(v0); - v0 = TEMPLATE2(_mm_srli_epi, USIZE)(v0,b); - v0 = TEMPLATE2( mm_xord_epi, USIZE)(v0,sv); - v1 = TEMPLATE2( mm_rbit_epi, USIZE)(v1); - v1 = TEMPLATE2(_mm_srli_epi, USIZE)(v1,b); - sv = TEMPLATE2( mm_xord_epi, USIZE)(v1,v0); + v0 = T2( mm_rbit_epi, USIZE)(v0); + v0 = T2(_mm_srli_epi, USIZE)(v0,b); + v0 = T2( mm_xord_epi, USIZE)(v0,sv); + v1 = T2( mm_rbit_epi, USIZE)(v1); + v1 = T2(_mm_srli_epi, USIZE)(v1,b); + sv = T2( mm_xord_epi, USIZE)(v1,v0); _mm_storeu_si128((__m128i *) op, v0); _mm_storeu_si128((__m128i *)(op+16/(USIZE/8)), sv); } - start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8)); + start = (uint_t)T2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8)); #else for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); } #endif } if(n = (out+n) - op) { uint_t b = *ip++; - for(ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p), p = _p; p < &_p[n]; p++,op++) FD(0,USIZE); + for(ip = T2(P4DEC,USIZE)(ip, n, _p), p = _p; p < &_p[n]; p++,op++) FD(0,USIZE); } return ip - in; + #undef FD } //-------- TurboFloat FCM: Finite Context Method Predictor --------------------------------------------------------------- @@ -259,30 +300,30 @@ size_t TEMPLATE2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t #define HASH16(_h_,_u_) (((_h_)<<3 ^ (_u_)>>12) & ((1u<> 5) & ((1u<= 32 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a) - __m256i sv = TEMPLATE2(_mm256_set1_epi, USIZE)(start); + __m256i sv = T2(_mm256_set1_epi, USIZE)(start); #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) #define _mm_set1_epi64(a) _mm_set1_epi64x(a) - __m128i sv = TEMPLATE2(_mm_set1_epi, USIZE)(start); + __m128i sv = T2(_mm_set1_epi, USIZE)(start); #endif for(ip = in; ip != in + (n&~(VSIZE-1)); ) { uint_t b = 0; - #define FE(i,_usize_) { TEMPLATE3(uint, _usize_, _t) u = ip[i]; p[i] = XORENC(u, htab[h],_usize_); b |= p[i]; htab[h] = u; h = TEMPLATE2(HASH,_usize_)(h,u); } + #define FE(_i_,_usize_) { T3(uint, _usize_, _t) u = ip[_i_]; p[_i_] = XORENC(u, htab[h],_usize_); b |= p[_i_]; htab[h] = u; h = T2(HASH,_usize_)(h,u); } for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); } - *op++ = b = TEMPLATE2(clz,USIZE)(b); + *op++ = b = T2(clz,USIZE)(b); #if defined(__AVX2__) && USIZE >= 32 for(p = _p; p != &_p[VSIZE]; p+=64/(USIZE/8)) { __m256i v0 = _mm256_loadu_si256((__m256i *)p); __m256i v1 = _mm256_loadu_si256((__m256i *)(p+32/(USIZE/8))); - v0 = TEMPLATE2(_mm256_slli_epi, USIZE)(v0,b); - v1 = TEMPLATE2(_mm256_slli_epi, USIZE)(v1,b); - v0 = TEMPLATE2( mm256_rbit_epi, USIZE)(v0); - v1 = TEMPLATE2( mm256_rbit_epi, USIZE)(v1); + v0 = T2(_mm256_slli_epi, USIZE)(v0,b); + v1 = T2(_mm256_slli_epi, USIZE)(v1,b); + v0 = T2( mm256_rbit_epi, USIZE)(v0); + v1 = T2( mm256_rbit_epi, USIZE)(v1); _mm256_storeu_si256((__m256i *) p, v0); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1); } @@ -290,141 +331,148 @@ size_t TEMPLATE2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_ for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) { __m128i v0 = _mm_loadu_si128((__m128i *) p); __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8))); - v0 = TEMPLATE2(_mm_slli_epi, USIZE)(v0,b); - v0 = TEMPLATE2( mm_rbit_epi, USIZE)(v0); - v1 = TEMPLATE2(_mm_slli_epi, USIZE)(v1,b); - v1 = TEMPLATE2( mm_rbit_epi, USIZE)(v1); + v0 = T2(_mm_slli_epi, USIZE)(v0,b); + v0 = T2( mm_rbit_epi, USIZE)(v0); + v1 = T2(_mm_slli_epi, USIZE)(v1,b); + v1 = T2( mm_rbit_epi, USIZE)(v1); _mm_storeu_si128((__m128i *) p, v0); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), v1); } #else - #define TR(i,_usize_) p[i] = TEMPLATE2(rbit,_usize_)(p[i]<>b;\ - u = XORDEC(u, htab[h], _usize_); op[i] = u; htab[h] = u; h = TEMPLATE2(HASH,_usize_)(h,u);\ + #define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = p[_i_]; u = T2(rbit,_usize_)(u)>>b;\ + u = XORDEC(u, htab[h], _usize_); op[_i_] = u; htab[h] = u; h = T2(HASH,_usize_)(h,u);\ } for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0); - unsigned b = *ip++; ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p); + unsigned b = *ip++; ip = T2(P4DECV,USIZE)(ip, VSIZE, _p); for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); } } if(n = ((uint_t *)out+n) - op) { - unsigned b = *ip++; ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p); + unsigned b = *ip++; ip = T2(P4DEC,USIZE)(ip, n, _p); for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE); } return ip - in; + #undef FD } //-------- TurboFloat DFCM: Differential Finite Context Method Predictor ---------------------------------------------------------- -size_t TEMPLATE2(fpdfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { +size_t T2(fpdfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { uint_t *ip, _p[VSIZE+32], h = 0, *p, htab[1<>b; u = XORDEC(u, (htab[h]+start),_usize_); \ - op[i] = u; htab[h] = start = u-start; h = TEMPLATE2(HASH,_usize_)(h,start); start = u;\ + #define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = T2(rbit,_usize_)(p[_i_])>>b; u = XORDEC(u, (htab[h]+start),_usize_); \ + op[_i_] = u; htab[h] = start = u-start; h = T2(HASH,_usize_)(h,start); start = u;\ } for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0); uint_t b = *ip++; - ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p); + ip = T2(P4DECV,USIZE)(ip, VSIZE, _p); for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); } } if(n = ((uint_t *)out+n) - op) { uint_t b = *ip++; - ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p); + ip = T2(P4DEC,USIZE)(ip, n, _p); for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE); } return ip - in; + #undef FD } -//-------- TurboFloat 2D DFCM: Differential Finite Context Method Predictor ---------------------------------------------------------- -size_t TEMPLATE2(fp2dfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { +//-------- TurboFloat Double delta DFCM: Differential Finite Context Method Predictor ---------------------------------------------------------- +size_t T2(fp2dfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { uint_t *ip, _p[VSIZE+32], h = 0, *p, htab[1<>b; u = XORDEC(u, (htab[h]+start),_usize_);\ - op[i] = u; htab[h] = start = u-start; h = TEMPLATE2(HASH,_usize_)(h,start); start = start0; start0 = u;\ + #define FD(_i_,_usize_) { T3(uint, _usize_, _t) u = T2(rbit,_usize_)(p[_i_])>>b; u = XORDEC(u, (htab[h]+start),_usize_);\ + op[_i_] = u; htab[h] = start = u-start; h = T2(HASH,_usize_)(h,start); start = start0; start0 = u;\ } for(op = (uint_t*)out; op != out+(n&~(VSIZE-1)); ) { PREFETCH(ip+512,0); uint_t b = *ip++; - ip = TEMPLATE2(P4DECV,USIZE)(ip, VSIZE, _p); + ip = T2(P4DECV,USIZE)(ip, VSIZE, _p); for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); } } if(n = ((uint_t *)out+n) - op) { uint_t b = *ip++; - ip = TEMPLATE2(P4DEC,USIZE)(ip, n, _p); + ip = T2(P4DEC,USIZE)(ip, n, _p); for(p = _p; p != &_p[n]; p++,op++) FD(0,USIZE); } return ip - in; + #undef FD } +#define BSIZE(_usize_) (_usize_==64?6:(_usize_==32?5:(_usize_==16?4:3))) //-------- TurboGorilla : Improved Gorilla style (see Facebook paper) Floating point compression with bitio ------------------------------------ #define bitput2(_bw_,_br_, _n1_, _n2_, _x_) {\ if(!_x_) bitput(_bw_,_br_, 1, 1);/*1*/\ @@ -438,54 +486,229 @@ size_t TEMPLATE2(fp2dfcmdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uin else bitrmv(_bw_,_br_,_n2_+2), _x_ = BZHI32(_x_>>2, _n2_);\ } -#define BSIZE(_usize_) (_usize_==64?6:(_usize_==32?5:(_usize_==16?4:3))) -size_t TEMPLATE2(fpgenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { - uint_t *ip; - unsigned ol = 0,ot = 0; +size_t T2(fpgenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { + uint_t *ip = in; + unsigned ol = 0, ot = 0; unsigned char *op = out; bitdef(bw,br); - if(start) { ol = TEMPLATE2(clz,USIZE)(start); ot = TEMPLATE2(ctz,USIZE)(start); } + if(start) { ol = T2(clz,USIZE)(start); ot = T2(ctz,USIZE)(start); } - #define FE(i,_usize_) { TEMPLATE3(uint, _usize_, _t) z = XORENC(ip[i], start,_usize_); start = ip[i];\ - if(likely(!z)) bitput( bw,br, 1, 1);\ - else { unsigned t = TEMPLATE2(ctz,_usize_)(z), l = TEMPLATE2(clz,_usize_)(z);\ + #define FE(_i_,_usize_) { T3(uint, _usize_, _t) z = XORENC(ip[_i_], start,_usize_); start = ip[_i_];\ + if(unlikely(!z)) bitput( bw,br, 2, 2); /*01*/\ + else {\ + unsigned t = T2(ctz,_usize_)(z), l = /*lz2l[*/T2(clz,_usize_)(z);\ unsigned s = _usize_ - l - t, os = _usize_ - ol - ot;\ - if(l >= ol && t >= ot && os < 6+5+s) { bitput( bw,br, 2, 2); TEMPLATE2(bitput,_usize_)(bw,br, os, z>>ot,op); }\ - else { bitput( bw,br, 2+BSIZE(_usize_), l<<2); bitput2(bw,br, N_0, N_1, t); bitenorm(bw,br,op);TEMPLATE2(bitput,_usize_)(bw,br, s, z>>t,op); ol = l; ot = t; }\ + if(l >= ol && t >= ot && os < 6+5+s) { bitput( bw,br, 1, 1); /*1*/ T2(bitput,_usize_)(bw,br, os, z>>ot,op); }\ + else { /*00*/ /*bitput( bw,br, 2, 0); bitput(bw,br, 3, lz2c(l));*/\ + bitput( bw,br, 2+BSIZE(_usize_), l<<2); bitput2(bw,br, N_0, N_1, t); bitenorm(bw,br,op);T2(bitput,_usize_)(bw,br, s, z>>t,op); ol = l; ot = t; }\ } bitenorm(bw,br,op);\ } - for(ip = in; ip != in + (n&~(4-1)); ip+=4) { PREFETCH(ip+512,0); FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); } - for( ; ip != in + n ; ip++) FE(0,USIZE); + for(; ip != in + (n&~(4-1)); ip+=4) { PREFETCH(ip+512,0); FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); } + for(; ip != in + n ; ip++) FE(0,USIZE); bitflush(bw,br,op); return op - out; + #undef FE } -size_t TEMPLATE2(fpgdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0; +size_t T2(fpgdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0; uint_t *op; - unsigned ol = 0,ot = 0,x; + int ol = 0,ot = 0,x; unsigned char *ip = in; bitdef(bw,br); - if(start) { ol = TEMPLATE2(clz,USIZE)(start); ot = TEMPLATE2(ctz,USIZE)(start); } + if(start) { ol = T2(clz,USIZE)(start); ot = T2(ctz,USIZE)(start); } - #define FD(i,_usize_) { TEMPLATE3(uint, _usize_, _t) z=0; unsigned _x; BITGET32(bw,br,1,_x); \ - if(likely(!_x)) { BITGET32(bw,br,1,_x);\ - if(!_x) { BITGET32(bw,br,BSIZE(_usize_),ol); bitget2(bw,br, N_0, N_1, ot); bitdnorm(bw,br,ip); }\ - TEMPLATE2(bitget,_usize_)(bw,br,_usize_ - ol - ot,z,ip);\ + #define FD(_i_,_usize_) { T3(uint, _usize_, _t) z=0; unsigned _x,_y; BITGET32(bw,br,1,_x); \ + if(likely(!_x)) { BITGET32(bw,br,1,_y); _x = _y << 1 | _x; }\ + if(_x < 2) { \ + if(!_x) { /*BITGET32(bw,br, 3, ol); ol = c2l(ol);*/BITGET32(bw,br,BSIZE(_usize_),ol); bitget2(bw,br, N_0, N_1, ot); bitdnorm(bw,br,ip); }\ + T2(bitget,_usize_)(bw,br,_usize_ - ol - ot,z,ip);\ z<<=ot;\ - } op[i] = start = XORDEC(z, start,_usize_); bitdnorm(bw,br,ip);\ + } op[_i_] = start = XORDEC(z, start,_usize_); bitdnorm(bw,br,ip);\ } for(bitdnorm(bw,br,ip),op = out; op != out+(n&~(4-1)); op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); PREFETCH(ip+512,0); } for( ; op != out+n; op++) FD(0,USIZE); bitalign(bw,br,ip); return ip - in; + #undef FD } -//------ Zigzag of zigzag with bitio for timestamps with bitio ------------------------------------------------------------------------------------------ +//--- Chimp: Efficient Lossless Floating Point Compression for Time Series Databases - https://www.vldb.org/pvldb/vol15/p3058-liakos.pdf +size_t T2(fphenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { + uint_t *ip = in; + int ol = USIZE+1,s; + unsigned char *op = out; + bitdef(bw,br); + if(start) ol = T2(clz,USIZE)(start); + #define FE(_i_,_usize_) { T3(uint, _usize_, _t) z = XORENC(ip[_i_], start,_usize_); start = ip[_i_];\ + if(unlikely(!z)) { ol = _usize_+1; bitput(bw,br, 2, 0); }\ + else { int t = T2(ctz,_usize_)(z), l = lz2l(T2(clz,_usize_)(z));\ + if(t > BSIZE(_usize_)) { s = _usize_ - l - t; ol = _usize_+1; bitput(bw,br, 2, 1); bitput(bw,br, CCODEL, l2c(l)); bitput(bw,br, BSIZE(_usize_), s); z>>=t; \ + bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op); \ + } else if (l == ol) { s = _usize_ - l; bitput(bw,br, 2, 2);\ + bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op);\ + } else { s = _usize_ - l; ol = l; bitput(bw,br, 2, 3); bitput(bw,br, CCODEL, l2c(l));\ + bitenorm(bw,br,op); T2(bitput,_usize_)(bw,br, s, z, op); }\ + } bitenorm(bw,br,op);\ + } + for(; ip != in + (n&~(4-1)); ip+=4) { PREFETCH(ip+512,0); FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); } + for(; ip != in + n ; ip++) FE(0,USIZE); + bitflush(bw,br,op); + return op - out; + #undef FE +} + +size_t T2(fphdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0; + uint_t *op = out; + int ol = 0,s; + unsigned char *ip = in; + bitdef(bw,br); + if(start) ol = T2(clz,USIZE)(start); + + #define FD(_i_,_usize_) { T3(uint, _usize_, _t) z=0; unsigned _x; bitdnorm(bw,br,ip); BITGET32(bw,br,2,_x);\ + switch(_x) {\ + case 3: BITGET32(bw,br, CCODEL, ol); ol = c2l(ol); bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br,(_usize_-ol), z, ip); s = _usize_ == ol?_usize_:_usize_ - ol;\ + break;\ + case 2: bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br,(_usize_-ol), z, ip); s = _usize_ == ol?_usize_:_usize_ - ol;\ + break;\ + case 1: BITGET32(bw,br, CCODEL, ol); ol = c2l(ol); BITGET32(bw,br,BSIZE(_usize_), s); s = s?s:_usize_;\ + int ot = _usize_ - s - ol;\ + bitdnorm(bw,br,ip); T2(bitget,_usize_)(bw,br, (_usize_ - ol - ot), z, ip); z <<= ot;\ + break;\ + case 0: ;\ + } op[_i_] = start = XORDEC(z, start,_usize_);\ + } + bitdnorm(bw,br,ip); + for(; op != out+(n&~(4-1)); op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); PREFETCH(ip+512,0); } + for(; op != out+n; op++) FD(0,USIZE); + bitalign(bw,br,ip); + return ip - in; + #undef FD +} + +//-- TurboFloatLz (lz77 like parsing) -------------------------------------- + #if USIZE == 64 +#define VA_BITS 7 // window size for 64 bits (integer/double) +#define FX_BITS 14 + #elif USIZE == 32 +#define VA_BITS 4 // 32 bits (intger/float) +#define FX_BITS 12 + #else +#define VA_BITS 4 // 16 bits (intger/float) +#define FX_BITS 10 + #endif + +#define FX_MSK ((1< CTZMIN(USIZE))?ofs:0; + } + z = c?z:u ^ ip[-1]; + if(z) { + unsigned l = lz2l(T2(clz,USIZE)(z)), t; + if(c) { + t = USIZE - l - ctz; + bitput(bw,br, (VA_BITS+CCODEL+BSIZE(USIZE)+2), ofs << (CCODEL+BSIZE(USIZE)+2) | (unsigned)l2c(l) << (BSIZE(USIZE)+2) | t<<2); + bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, t, z >> ctz, op); + ol = USIZE+1; + } else { + if(l == ol) { bitput(bw,br, 2, 1); } + else { bitput(bw,br, CCODEL+2, l2c(l)<<2|2); ol = l; } + bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, USIZE - l, z, op); + } + } else { bitput(bw,br,VA_BITS+2, ofs<<2|3 ); ol = USIZE+1; } + bitenorm(bw,br,op); + } + bitflush(bw,br,op); + return op - out; +} + +size_t T2(fpcenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { // double entry hashtable + uint_t *ip = in; + unsigned char *op = out; + unsigned htab[1<<(FX_BITS+1)] = {0}, ol = USIZE+1; + bitdef(bw,br); //if(start) ol = T2(clz,USIZE)(start); + + for(*(uint_t *)op = *ip++, op += sizeof(uint_t); ip != in + n; ip++) { + uint_t u = *ip, z; + unsigned hu = HASH(u)<<1, h = htab[hu], ofs = (ip-in) - h - 1, c = 0, ctz = 0; + htab[hu+1] = h, htab[hu] = ip - in; + if(ofs < (1< ctz) { ofs = ofs1; c = c1; z = z1; ctz = ctz1; } + } + ofs = (c = ctz > CTZMIN(USIZE))?ofs:0; + } + z = c?z:u ^ ip[-1]; + if(z) { + unsigned l = lz2l(T2(clz,USIZE)(z)), t; + if(c) { + t = USIZE - l - ctz; + bitput(bw,br, (VA_BITS+CCODEL+BSIZE(USIZE)+2), ofs << (CCODEL+BSIZE(USIZE)+2) | (unsigned)l2c(l) << (BSIZE(USIZE)+2) | t<<2); + bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, t, z >> ctz, op); + ol = USIZE+1; + } else { + if(l == ol) { bitput(bw,br, 2, 1); } + else { bitput(bw,br, CCODEL+2, l2c(l)<<2|2); ol = l; } + bitenorm(bw,br,op); T2(bitput,USIZE)(bw,br, USIZE - l, z, op); + } + } else { bitput(bw,br,VA_BITS+2, ofs<<2|3 ); ol = USIZE+1; } + bitenorm(bw,br,op); + } + bitflush(bw,br,op); + return op - out; +} + +size_t T2(fpcdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0; + uint_t *op = out, v = 0, z; + unsigned char *ip = in; + unsigned ol = USIZE+1, t; + bitdef(bw,br); //if(start) ol = T2(clz,USIZE)(start); + + for(v = *op++ = *(uint_t *)ip, ip += sizeof(uint_t); op != out+n; op++) { + bitdnorm(bw,br,ip); BITGET32(bw,br, 2, t); + switch(t) { + case 0: + BITGET32(bw,br, VA_BITS+CCODEL+BSIZE(USIZE), t); + ol = c2l(bextr32(t, BSIZE(USIZE), CCODEL)); + v = *(op - bextr32(t, BSIZE(USIZE)+CCODEL, VA_BITS)-1); + t &= ((1<= out_) { *out++ = 1<<4; /*bitini(bw,br); bitput(bw,br,4+3,1<<4); bitflush(bw,br,out);*/ memcpy(out,in,n*sizeof(in[0])); return 1+n*sizeof(in[0]); } -size_t TEMPLATE2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { +size_t T2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { uint_t *ip = in, pd = 0, *pp = in,dd; unsigned char *op = out, *out_ = out+n*sizeof(in[0]); @@ -494,12 +717,12 @@ size_t TEMPLATE2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t uint64_t _r = _ip_ - _pp_;\ if(_r > NL) { _r -= NL; unsigned _b = (bsr64(_r)+7)>>3; bitput(bw,br,4+3+3,(_b-1)<<(4+3)); bitput64(bw,br,_b<<3, _r, _op_); bitenorm(bw,br,_op_); }\ else while(_r--) { bitput(bw,br,1,1); bitenorm(bw,br,_op_); }\ - _d_ = TEMPLATE2(zigzagenc,_usize_)(_d_);\ + _d_ = T2(zigzagenc,_usize_)(_d_);\ if(!_d_) bitput(bw,br, 1, 1);\ else if(_d_ < (1<< (N2-1))) bitput(bw,br, N2+2,_d_<<2|2);\ else if(_d_ < (1<< (N3-1))) bitput(bw,br, N3+3,_d_<<3|4);\ else if(_d_ < (1<< (N4-1))) bitput(bw,br, N4+4,_d_<<4|8);\ - else { unsigned _b = (TEMPLATE2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); TEMPLATE2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\ + else { unsigned _b = (T2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); T2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\ bitenorm(bw,br,_op_);\ } while(0) @@ -528,9 +751,10 @@ size_t TEMPLATE2(bvzzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t } bitflush(bw,br,op); return op - out; + #undef FE } -size_t TEMPLATE2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0; +size_t T2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0; uint_t *op = out, pd = 0; unsigned char *ip = in; @@ -557,8 +781,8 @@ size_t TEMPLATE2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t #if (defined(__SSE2__) /*|| defined(__ARM_NEON)*/) && USIZE == 32 __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4*pd,3*pd,2*pd,1*pd); for(r += NL, _op = op; op != _op+(r&~7);) { - sv = _mm_add_epi32(sv,cv); _mm_storeu_si128(op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; //_mm_shuffle_epi32(sv, _MM_SHUFFLE(3, 3, 3, 3))->mm_shuffle_nnnn_epi32(sv, 3) - sv = _mm_add_epi32(sv,cv); _mm_storeu_si128(op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; + sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; //_mm_shuffle_epi32(sv, _MM_SHUFFLE(3, 3, 3, 3))->mm_shuffle_nnnn_epi32(sv, 3) + sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); #else @@ -576,9 +800,9 @@ size_t TEMPLATE2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t *op = (start+=pd); continue; } - TEMPLATE2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip); + T2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip); } - pd += TEMPLATE2(zigzagdec,USIZE)(dd); + pd += T2(zigzagdec,USIZE)(dd); *op++ = (start += pd); bitdnorm(bw,br,ip); } @@ -587,7 +811,7 @@ size_t TEMPLATE2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t } //-------- Zigzag with bit/io + RLE -------------------------------------------------------------------------- -size_t TEMPLATE2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { +size_t T2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) { uint_t *ip = in, *pp = in,dd; unsigned char *op = out, *out_ = out+n*sizeof(in[0]); @@ -596,12 +820,12 @@ size_t TEMPLATE2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t uint64_t _r = _ip_ - _pp_;\ if(_r > NL) { _r -= NL; unsigned _b = (bsr64(_r)+7)>>3; bitput(bw,br,4+3+3,(_b-1)<<(4+3)); bitput64(bw,br,_b<<3, _r, _op_); bitenorm(bw,br,_op_); }\ else while(_r--) { bitput(bw,br,1,1); bitenorm(bw,br,_op_); }\ - _d_ = TEMPLATE2(zigzagenc,_usize_)(_d_);\ + _d_ = T2(zigzagenc,_usize_)(_d_);\ if(!_d_) bitput(bw,br, 1, 1);\ else if(_d_ < (1<< (N2-1))) bitput(bw,br, N2+2,_d_<<2|2);\ else if(_d_ < (1<< (N3-1))) bitput(bw,br, N3+3,_d_<<3|4);\ else if(_d_ < (1<< (N4-1))) bitput(bw,br, N4+4,_d_<<4|8);\ - else { unsigned _b = (TEMPLATE2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); TEMPLATE2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\ + else { unsigned _b = (T2(bsr,_usize_)(_d_)+7)>>3; bitput(bw,br,4+3,(_b-1)<<4); T2(bitput,_usize_)(bw,br, _b<<3, _d_,_op_); }\ bitenorm(bw,br,_op_);\ } while(0) @@ -630,9 +854,10 @@ size_t TEMPLATE2(bvzenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t } bitflush(bw,br,op); return op - out; + #undef FE } -size_t TEMPLATE2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0; +size_t T2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) { if(!n) return 0; uint_t *op = out; unsigned char *ip = in; @@ -659,8 +884,8 @@ size_t TEMPLATE2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t #if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32 __m128i sv = _mm_set1_epi32(start); for(r += NL, _op = op; op != _op+(r&~7);) { - _mm_storeu_si128(op, sv); op += 4; - _mm_storeu_si128(op, sv); op += 4; + _mm_storeu_si128((__m128i *)op, sv); op += 4; + _mm_storeu_si128((__m128i *)op, sv); op += 4; } #else for(r+=NL, _op = op; op != _op+(r&~7); op += 8) @@ -670,9 +895,9 @@ size_t TEMPLATE2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t *op = start; continue; } - TEMPLATE2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip); + T2(bitget,USIZE)(bw,br,(b+1)<<3,dd,ip); } - dd = TEMPLATE2(zigzagdec,USIZE)(dd); + dd = T2(zigzagdec,USIZE)(dd); *op++ = (start += dd); bitdnorm(bw,br,ip); }