BitUtil: Delta, ZigZag, NumBits, Floating Point,...

2019-12-21 14:06:33 +01:00
parent 0a02aa9ad3
commit 9113347ee4

bitutil.c (212 changed lines)

@@ -22,6 +22,7 @@
 - email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Integer Compression" utility - delta, for, zigzag / Floating point compression
+#include <math.h> //nan
#include "conf.h"
#define BITUTIL_IN
#include "bitutil.h"
@@ -42,13 +43,13 @@ uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
#if defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
        vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();
for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
  __m128i v0 = _mm_loadu_si128((__m128i *) ip);
  __m128i v1 = _mm_loadu_si128((__m128i *)(ip+8));
  vo0 = _mm_or_si128(vo0, v0);
  vo1 = _mm_or_si128(vo1, v1);
  vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
  vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
}
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
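The bit16/bit32 routines above compute two reductions per block: o ORs every value together, so the position of its highest set bit is the bit width needed to pack the block, while x ORs each value XORed with the first one and is therefore zero exactly when the block is constant. A scalar sketch of the same reduction (illustration only, not code from this commit):

#include <stdint.h>

/* scalar reference for bit32: o -> bit width, x -> constant-block detector */
static uint32_t bit32_ref(const uint32_t *in, unsigned n, uint32_t *px) {
  uint32_t o = 0, x = 0, u0 = in[0];
  for(unsigned i = 0; i < n; i++) {
    o |= in[i];       /* union of all set bits                 */
    x |= in[i] ^ u0;  /* zero iff every value equals the first */
  }
  if(px) *px = x;
  return o;
}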
@@ -65,26 +66,26 @@ uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px) {
#ifdef __AVX2__
__m256i vb0 = _mm256_set1_epi32(*in), vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
        vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256();
for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
  __m256i v0 = _mm256_loadu_si256((__m256i *) ip);
  __m256i v1 = _mm256_loadu_si256((__m256i *)(ip+8));
  vo0 = _mm256_or_si256(vo0, v0);
  vo1 = _mm256_or_si256(vo1, v1);
  vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
  vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
}
vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
#elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi32(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
        vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128();
for(ip = in; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
  __m128i v0 = _mm_loadu_si128((__m128i *) ip);
  __m128i v1 = _mm_loadu_si128((__m128i *)(ip+4));
  vo0 = _mm_or_si128(vo0, v0);
  vo1 = _mm_or_si128(vo1, v1);
  vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
  vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
}
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
@@ -112,23 +113,23 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
#if defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi16(u0),
        vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
        vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start);
for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
  __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
  __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+8)); __m128i v0 = mm_delta_epi16(vi0,vs); vs = vi0;
  __m128i v1 = mm_delta_epi16(vi1,vs); vs = vi1;
  vo0 = _mm_or_si128(vo0, v0);
  vo1 = _mm_or_si128(vo1, v1);
  vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
  vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
#else
ip = in; o = x = 0;
#endif
for(;ip != in+n; ip++) {
  uint16_t u = *ip - start; start = *ip;
  o |= u;
  x |= u ^ u0;
}
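bitd16/bitd32 apply the same OR/XOR reduction to successive deltas instead of raw values, carrying the running predecessor across SIMD blocks in vs and finishing any odd tail with the scalar loop shown above. A scalar equivalent (sketch, using the same wrap-around subtraction as the SIMD path):

#include <stdint.h>

static uint16_t bitd16_ref(const uint16_t *in, unsigned n, uint16_t start) {
  uint16_t o = 0;
  for(unsigned i = 0; i < n; i++) {
    uint16_t u = (uint16_t)(in[i] - start); /* delta, mod 2^16                  */
    start = in[i];
    o |= u;                                 /* bit width = highest set bit of o */
  }
  return o;
}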
@@ -141,38 +142,38 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
#ifdef __AVX2__
__m256i vb0 = _mm256_set1_epi32(u0),
        vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
        vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start);
for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
  __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
  __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0;
  __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1;
  vo0 = _mm256_or_si256(vo0, v0);
  vo1 = _mm256_or_si256(vo1, v1);
  vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
  vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
} start = (unsigned)_mm256_extract_epi32(vs, 7);
vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
#elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi32(u0),
        vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
        vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start);
for(ip = in; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
  __m128i vi0 = _mm_loadu_si128((__m128i *)ip);
  __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4)); __m128i v0 = mm_delta_epi32(vi0,vs); vs = vi0;
  __m128i v1 = mm_delta_epi32(vi1,vs); vs = vi1;
  vo0 = _mm_or_si128(vo0, v0);
  vo1 = _mm_or_si128(vo1, v1);
  vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
  vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
} start = _mm_cvtsi128_si32(_mm_srli_si128(vs,12));
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
#else
ip = in; o = x = 0;
#endif
for(;ip != in+n; ip++) {
  uint32_t u = *ip - start; start = *ip;
  o |= u;
  x |= u ^ u0;
}
@@ -196,26 +197,26 @@ void bitddec32(uint32_t *p, unsigned n, unsigned start) {
unsigned *ip;
for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
  __m256i v = _mm256_loadu_si256((__m256i *)ip);
  vs = mm256_scan_epi32(v,vs);
  _mm256_storeu_si256((__m256i *)ip, vs);
}
start = (unsigned)_mm256_extract_epi32(vs, 7);
while(ip != p+n) {
  *ip = (start += (*ip));
  ip++;
}
#elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vs = _mm_set1_epi32(start);
unsigned *ip;
for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
  __m128i v = _mm_loadu_si128((__m128i *)ip);
  vs = mm_scan_epi32(v, vs);
  _mm_storeu_si128((__m128i *)ip, vs);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
while(ip != p+n) {
  *ip = (start += (*ip));
  ip++;
}
#else
BITDD(uint32_t, p, n, 0);
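bitddec32 undoes delta coding with an in-register inclusive prefix sum. One common way to implement the 4-lane scan that mm_scan_epi32 performs (an assumption about the helper's behavior, not its actual definition in bitutil.h):

#include <emmintrin.h>

static inline __m128i mm_scan_epi32_sketch(__m128i v, __m128i vs) {
  v = _mm_add_epi32(v, _mm_slli_si128(v, 4)); /* [a, a+b, b+c, c+d]       */
  v = _mm_add_epi32(v, _mm_slli_si128(v, 8)); /* [a, a+b, a+b+c, a+b+c+d] */
  /* carry in the running total: broadcast the last lane of the previous result */
  return _mm_add_epi32(v, _mm_shuffle_epi32(vs, _MM_SHUFFLE(3,3,3,3)));
}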
@@ -258,38 +259,38 @@ uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
#ifdef __AVX2__
__m256i vb0 = _mm256_set1_epi32(u0),
        vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
        vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
  __m256i vi0 = _mm256_loadu_si256((__m256i *)ip);
  __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = _mm256_sub_epi32(mm256_delta_epi32(vi0,vs),cv); vs = vi0;
  __m256i v1 = _mm256_sub_epi32(mm256_delta_epi32(vi1,vs),cv); vs = vi1;
  vo0 = _mm256_or_si256(vo0, v0);
  vo1 = _mm256_or_si256(vo1, v1);
  vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
  vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
} start = (unsigned)_mm256_extract_epi32(vs, 7);
vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
#elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi32(u0),
        vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
        vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(1);
for(ip = in; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
  __m128i vi0 = _mm_loadu_si128((__m128i *)ip);
  __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4)); __m128i v0 = _mm_sub_epi32(mm_delta_epi32(vi0,vs),cv); vs = vi0;
  __m128i v1 = _mm_sub_epi32(mm_delta_epi32(vi1,vs),cv); vs = vi1;
  vo0 = _mm_or_si128(vo0, v0);
  vo1 = _mm_or_si128(vo1, v1);
  vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
  vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
} start = _mm_cvtsi128_si32(_mm_srli_si128(vs,12));
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
#else
ip = in; o = x = 0;
#endif
for(;ip != in+n; ip++) {
  uint32_t u = ip[0] - start-1; start = *ip;
  o |= u;
  x |= u ^ u0;
}
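bitd132 targets strictly increasing sequences (e.g. sorted document identifiers): it measures delta-1, so a run of consecutive integers produces all-zero residuals that pack into zero bits. Scalar sketch of the tail loop above:

#include <stdint.h>

static uint32_t bitd132_ref(const uint32_t *in, unsigned n, uint32_t start) {
  uint32_t o = 0;
  for(unsigned i = 0; i < n; i++) {
    uint32_t u = in[i] - start - 1; /* delta minus the implied minimum step of 1 */
    start = in[i];
    o |= u;
  }
  return o;
}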
@@ -302,14 +303,14 @@ uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
unsigned *ip,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi16(start), cv = _mm_set1_epi16(8);
for(ip = in; ip != in+(n&~(4-1)); ip += 4) {
  __m128i iv = _mm_loadu_si128((__m128i *)ip);
  bv = _mm_or_si128(bv,_mm_sub_epi16(SUBI16x8(iv,vs),cv));
  vs = iv;
}
start = (unsigned short)_mm_cvtsi128_si32(_mm_srli_si128(vs,14));
b = mm_hor_epi16(bv);
if(px) *px = 0;
return b;
#endif
}
unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
@@ -317,14 +318,14 @@ unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
unsigned *ip,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(4);
for(ip = in; ip != in+(n&~(4-1)); ip += 4) {
  __m128i iv = _mm_loadu_si128((__m128i *)ip);
  bv = _mm_or_si128(bv,_mm_sub_epi32(SUBI32x4(iv,vs),cv));
  vs = iv;
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
b = mm_hor_epi32(bv);
if(px) *px = 0;
return b;
#endif
}
void bitd1dec8( uint8_t *p, unsigned n, uint8_t start) { BITDD(uint8_t, p, n, 1); }
@@ -335,26 +336,26 @@ void bitd1dec32(uint32_t *p, unsigned n, uint32_t start) {
__m256i vs = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1);
unsigned *ip;
for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
  __m256i v = _mm256_loadu_si256((__m256i *)ip); vs = mm256_scani_epi32(v, vs, cv);
  _mm256_storeu_si256((__m256i *)ip, vs);
}
start = (unsigned)_mm256_extract_epi32(vs, 7);
while(ip != p+n) {
  *ip = (start += (*ip) + 1);
  ip++;
}
#elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vs = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
unsigned *ip;
for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
  __m128i v = _mm_loadu_si128((__m128i *)ip);
  vs = mm_scani_epi32(v, vs, cv);
  _mm_storeu_si128((__m128i *)ip, vs);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
while(ip != p+n) {
  *ip = (start += (*ip) + 1);
  ip++;
}
#else
BITDD(uint32_t, p, n, 1);
@@ -383,17 +384,17 @@ uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
__m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(mindelta), dv;
for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) {
  __m128i iv = _mm_loadu_si128((__m128i *)ip);
  bv = _mm_or_si128(bv, dv = _mm_sub_epi32(mm_delta_epi32(iv,vs),cv));
  vs = iv;
  _mm_storeu_si128((__m128i *)op, dv);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
b = mm_hor_epi32(bv);
while(ip != in+n) {
  unsigned x = *ip-start-mindelta;
  start = *ip++;
  b |= x;
  *op++ = x;
}
#else
uint32_t b = 0,*op = out, x, *_ip;
@@ -437,12 +438,12 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
  __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
  __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+8)); __m128i v0 = mm_delta_epi16(vi0,vs); vs = vi0; v0 = mm_zzage_epi16(v0);
  __m128i v1 = mm_delta_epi16(vi1,vs); vs = vi1; v1 = mm_zzage_epi16(v1);
  vo0 = _mm_or_si128(vo0, v0);
  vo1 = _mm_or_si128(vo1, v1);
  vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
  vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
#else
@@ -450,8 +451,8 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
#endif
for(;ip != in+n; ip++) {
  uint16_t u = zigzagenc16((int)ip[0] - (int)start); //int i = ((int)(*ip) - (int)start); i = (i << 1) ^ (i >> 15);
  start = *ip;
  o |= u;
  x |= u ^ u0;
}
if(px) *px = x;
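The commented-out formula above is the classic zigzag transform: it interleaves negative and positive deltas (0,-1,1,-2,2 -> 0,1,2,3,4) so the sign bit no longer forces a full-width code. A sketch consistent with that comment (zigzagenc16 in bitutil.h may differ in detail):

#include <stdint.h>

static inline uint16_t zigzagenc16_ref(int16_t i) {
  /* the arithmetic right shift smears the sign bit across all 16 positions */
  return (uint16_t)(((uint16_t)i << 1) ^ (uint16_t)(i >> 15));
}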
@@ -463,39 +464,39 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
#ifdef __AVX2__
__m256i vb0 = _mm256_set1_epi32(u0), vo0 = _mm256_setzero_si256(), vx0 = _mm256_setzero_si256(),
        vo1 = _mm256_setzero_si256(), vx1 = _mm256_setzero_si256(); __m256i vs = _mm256_set1_epi32(start);
for(ip = in; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
  __m256i vi0 = _mm256_loadu_si256((__m256i *) ip);
  __m256i vi1 = _mm256_loadu_si256((__m256i *)(ip+8)); __m256i v0 = mm256_delta_epi32(vi0,vs); vs = vi0; v0 = mm256_zzage_epi32(v0);
  __m256i v1 = mm256_delta_epi32(vi1,vs); vs = vi1; v1 = mm256_zzage_epi32(v1);
  vo0 = _mm256_or_si256(vo0, v0);
  vo1 = _mm256_or_si256(vo1, v1);
  vx0 = _mm256_or_si256(vx0, _mm256_xor_si256(v0, vb0));
  vx1 = _mm256_or_si256(vx1, _mm256_xor_si256(v1, vb0));
} start = (unsigned)_mm256_extract_epi32(vs, 7);
vo0 = _mm256_or_si256(vo0, vo1); o = mm256_hor_epi32(vo0);
vx0 = _mm256_or_si256(vx0, vx1); x = mm256_hor_epi32(vx0);
#elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vb0 = _mm_set1_epi32(u0),
        vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
        vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start);
for(ip = in; ip != in+(n&~(8-1)); ip += 8) { PREFETCH(ip+512,0);
  __m128i vi0 = _mm_loadu_si128((__m128i *) ip);
  __m128i vi1 = _mm_loadu_si128((__m128i *)(ip+4)); __m128i v0 = mm_delta_epi32(vi0,vs); vs = vi0; v0 = mm_zzage_epi32(v0);
  __m128i v1 = mm_delta_epi32(vi1,vs); vs = vi1; v1 = mm_zzage_epi32(v1);
  vo0 = _mm_or_si128(vo0, v0);
  vo1 = _mm_or_si128(vo1, v1);
  vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
  vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,12));
vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
#else
ip = in; o = x = 0; //uint32_t u; BITDE(uint32_t, in, n, 0, o |= u; x |= u^u0);
#endif
for(;ip != in+n; ip++) {
  uint32_t u = zigzagenc32((int)ip[0] - (int)start); start = *ip; //((int)(*ip) - (int)start); //i = (i << 1) ^ (i >> 31);
  o |= u;
  x |= u ^ u0;
}
if(px) *px = x;
@@ -511,19 +512,19 @@ uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint
__m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), dv;
for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) {
  __m128i iv = _mm_loadu_si128((__m128i *)ip);
  dv = mm_delta_epi32(iv,vs); vs = iv;
  dv = mm_zzage_epi32(dv);
  bv = _mm_or_si128(bv, dv);
  _mm_storeu_si128((__m128i *)op, dv);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
b = mm_hor_epi32(bv);
while(ip != in+n) {
  int x = ((int)(*ip)-(int)start);
  x = (x << 1) ^ (x >> 31);
  start = *ip++;
  b |= x;
  *op++ = x;
}
#else
uint32_t b = 0, *op = out,x;
@@ -548,13 +549,13 @@ void bitzdec16(uint16_t *p, unsigned n, uint16_t start) {
for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
  __m128i iv = _mm_loadu_si128((__m128i *)ip);
  iv = mm_zzagd_epi16(iv);
  vs = mm_scan_epi16(iv, vs);
  _mm_storeu_si128((__m128i *)ip, vs);
}
start = (uint16_t)_mm_cvtsi128_si32(_mm_srli_si128(vs,14));
while(ip != p+n) {
  uint16_t z = *ip;
  *ip++ = (start += (z >> 1 ^ -(z & 1)));
}
#else
BITZDEC(uint16_t, 16, p, n);
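The scalar tails above invert zigzag with z >> 1 ^ -(z & 1): the low bit selects the sign, the remaining bits carry the magnitude. As a standalone sketch:

#include <stdint.h>

static inline uint32_t zigzagdec32_ref(uint32_t z) {
  return (z >> 1) ^ (uint32_t)-(int32_t)(z & 1); /* 0,1,2,3,4 -> 0,-1,1,-2,2 (mod 2^32) */
}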
@@ -568,13 +569,13 @@ void bitzdec32(unsigned *p, unsigned n, unsigned start) {
for(ip = p; ip != p+(n&~(8-1)); ip += 8) {
  __m256i iv = _mm256_loadu_si256((__m256i *)ip);
  iv = mm256_zzagd_epi32(iv);
  vs = mm256_scan_epi32(iv,vs);
  _mm256_storeu_si256((__m256i *)ip, vs);
}
start = (unsigned)_mm256_extract_epi32(_mm256_srli_si256(vs,12), 4);
while(ip != p+n) {
  unsigned z = *ip;
  *ip++ = (start += (z >> 1 ^ -(z & 1)));
}
#elif defined(__SSE2__) || defined(__ARM_NEON)
__m128i vs = _mm_set1_epi32(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
@@ -582,13 +583,13 @@ void bitzdec32(unsigned *p, unsigned n, unsigned start) {
for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
  __m128i iv = _mm_loadu_si128((__m128i *)ip);
  iv = mm_zzagd_epi32(iv);
  vs = mm_scan_epi32(iv, vs);
  _mm_storeu_si128((__m128i *)ip, vs);
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(vs,12));
while(ip != p+n) {
  unsigned z = *ip;
  *ip++ = (start += zigzagdec32(z));
}
#else
BITZDEC(uint32_t, 32, p, n);
@@ -630,7 +631,6 @@ uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t *px, uint32_t *pmin) { uint
uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t *px, uint64_t *pmin) { uint64_t mi,mx; BITFM(uint64_t, in, n); *pmin = mi; if(px) *px = 0; return mx - mi; }
//----------- Lossy floating point conversion: pad the trailing mantissa bits with zero bits according to the relative error e (ex. 0.00001) ----------
-#include <math.h> //nan
#ifdef USE_FLOAT16
// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
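The padding idea: a float's mantissa is a relative quantity, so zeroing its trailing bits bounds the relative error while creating long zero runs that later compression stages exploit. A deliberately simplified sketch for normal, finite inputs (the committed _fppad32/_fppad64 additionally derive the kept-bit count from the exponent and lg2e and handle sign and NaN):

#include <stdint.h>
#include <string.h>

static float fppad32_sketch(float d, int keep) { /* keep = mantissa bits retained, 1..23 */
  uint32_t u;
  memcpy(&u, &d, sizeof u);                /* type-pun without aliasing violations  */
  u &= ~((1u << (23 - keep)) - 1u);        /* zero the 23-keep trailing mantissa bits */
  memcpy(&d, &u, sizeof d);
  return d;                                /* relative error below 2^-keep          */
}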
@@ -658,7 +658,7 @@ static inline float _fppad32(float d, float e, int lg2e) {
uint32_t u, du = ctou32(&d), sign;
int b = (du>>23 & 0xff)-0x7e;
if((b = 25 - b - lg2e) <= 0)
  return d;
b = b > 23?23:b;
sign = du & (1<<31);
du &= 0x7fffffffu;
@@ -675,7 +675,7 @@ static inline double _fppad64(double d, double e, int lg2e) { if(isnan(d)) retur
uint64_t sign;
int b = (du.u>>52 & 0x7ff)-0x3fe;
if((b = 54 - b - lg2e) <= 0)
  return d;
b = b > 52?52:b;
sign = du.u & (1ull<<63); du.u &= 0x7fffffffffffffffull;
int _b = b;
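How these primitives typically fit together: an encoder first calls one of the bit*/bitd*/bitz* scans to learn the residual bit width, then dispatches to a bit packer. Hypothetical driver (bits_needed32 and the packing step are illustrative, not part of this commit):

#include <stdint.h>

uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start);

static unsigned bits_needed32(uint32_t o) { /* position of the highest set bit */
  unsigned b = 0;
  while(o) { b++; o >>= 1; }
  return b;
}

void encode_block(unsigned *in, unsigned n) {
  uint32_t x;
  uint32_t o = bitz32(in, n, &x, 0); /* OR of zigzag deltas; x flags constant strides */
  unsigned b = bits_needed32(o);     /* pack every delta into b bits downstream       */
  (void)b;
}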