diff --git a/README.md b/README.md
index 926ba27..5ca8901 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ TurboPFor: Fastest Integer Compression [ implementation
+ - :new: **now up to 25% faster**
+ **Simple family**
- :sparkles: **Novel** **"Variable Simple"** (incl. **RLE**) faster and more efficient than simple16, simple-8b
@@ -71,7 +72,7 @@ CPU: Sandy bridge i7-2600k at 4.2GHz, gcc 5.1, ubuntu 15.04, single thread.
| 99.910.930| 24.98| 7.99| 2524.50|1943.41|[SIMDPack FPF](#FastPFor)|
| 99.910.930| 24.98| 7.99| 1883.21|1898.11|**TurboPack**|
| 99.910.930| 24.98| 7.99| 1877.25| 935.83|**TurboForDA**|
-|102.074.663| 25.52| 8.17| 1621.64|1694.64|**TurboVbyte**|
+|102.074.663| 25.52| 8.17| 1993.95|1827.04|**TurboVbyte**|
|102.074.663| 25.52|8.17|1214.12|1688.95|[MaskedVByte](#MaskedVByte)|
|102.074.663| 25.52| 8.17| 1178.72| 949.59|[Vbyte FPF](#FastPFor)|
|103.035.930| 25.76| 8.24| 1480.47|1746.51|[libfor](#libfor)|
@@ -90,16 +91,16 @@ CPU: Skylake i7-6700 w/ only 3.7GHz
| 63392801| 15.85| 5.07| 387.30| 243.62|**TurboPForDA**|
| 65359916| 16.34| 5.23| 7.58| 609.12|OptPFD|
| 73477088| 18.37| 5.88| 101.68| 621.37|Simple16|
-| 78514276| 19.63| 6.28|256.83|676.45|**VSimple**|
-| 95915096| 23.98| 7.67| 211.79|954.62|Simple-8b|
+| 78514276| 19.63| 6.28|258.31|691.48|**VSimple**|
+| 95915096| 23.98| 7.67| 211.79|957.62|Simple-8b|
| 98546814| 24.64| 7.88| 70.85|**2349.71**|[QMX](#QMX)|
| 99910930| 24.98| 7.99|**3537.57**|**3081.79**|**TurboPackV**|
| 99910930| 24.98| 7.99| 3099.52|3071.77|SIMDPack FPF|
-| 99910930| 24.98| 7.99| 2050.47|2402.54|**TurboPack**|
+| 99910930| 24.98| 7.99| 2095.79|2495.22|**TurboPack**|
| 99910930| 24.98| 7.99| 2049.85|2364.52|**TurboFor**|
| 99910930| 24.98| 7.99| 2049.70|1124.12|**TurboForDA**|
|102074663| 25.52| 8.17| 1354.42|1745.69|MaskedVByte|
-|102074663| 25.52| 8.17| 1660.76|1626.67|**TurboVbyte**|
+|102074663| 25.52| 8.17| 1825.64|1844.34|**TurboVbyte**|
|102074663| 25.52| 8.17| 1249.77|1051.85|Vbyte FPF|
|112500000| 28.12| 9.00| 466.94|3003.70|VarintG8IU|
|128125000| 32.03| 10.25| 1109.67|1271.32|[StreamVbyte FPF](#FastPFor)|
@@ -310,4 +311,4 @@ header files to use with documentation:
- [On Inverted Index Compression for Search Engine Efficiency](http://www.dcs.gla.ac.uk/~craigm/publications/catena14compression.pdf)
- [Google's Group Varint Encoding](http://static.googleusercontent.com/media/research.google.com/de//people/jeff/WSDM09-keynote.pdf)
-Last update: 27 MAR 2016
+Last update: 08 APR 2016
diff --git a/bitunpack.c b/bitunpack.c
index dbfe47d..0dca5f7 100644
--- a/bitunpack.c
+++ b/bitunpack.c
@@ -35,17 +35,17 @@
#define DSTI(__op)
#define BPI(__w, __x, __parm) __w
#include __FILE__
-unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return ip; }
-unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return ip; }
-unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK64(in, n, b, out, 0); return ip; }
+unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return (unsigned char *)ip; }
+unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return (unsigned char *)ip; }
+unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK64(in, n, b, out, 0); return (unsigned char *)ip; }
#undef BPI
#undef DSTI
//-----------------------------------------------------------------------------------------------------------------
#define DSTI(__op)
#define BPI(__w, __x, __parm) (__parm += (__w) + 1)
#include __FILE__
-unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
-unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
+unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
#undef BPI
#undef DSTI
@@ -53,8 +53,8 @@ unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, uns
#define DSTI(__op)
#define BPI(__w, __x, __parm) (__parm += (__w))
#include __FILE__
-unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
-unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
+unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
#undef BPI
#undef DSTI
@@ -63,7 +63,7 @@ unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, uns
#define DSTI(__op)
#define BPI(__w, __x, __parm) (__parm += zigzagdec32(__w))
#include __FILE__
-unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
//unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
#undef BPI
#undef DSTI
@@ -73,8 +73,8 @@ unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, uns
#define BPI(__w, __x, __parm) (__parm + (__w))
#include __FILE__
-unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
-unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
+unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
#undef BPI
#undef DSTI
@@ -82,8 +82,8 @@ unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, uns
#define DSTI(__op) start += 32
#define BPI(__w, __x, __parm) (__parm + (__w)+__x+1)
#include __FILE__
-unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
-unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
+unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
#undef BPI
#undef DSTI
diff --git a/bitunpackv.c b/bitunpackv.c
index 29ad14b..7405626 100644
--- a/bitunpackv.c
+++ b/bitunpackv.c
@@ -78,7 +78,7 @@ unsigned char *bitunpackv32( const unsigned char *__restrict in, unsigned n, uns
const unsigned char *ip = in+PAD8(n*b);
__m128i sv;
BITUNPACKV32(in, n, b, out, sv);
- return ip;
+ return (unsigned char *)ip;
}
#undef VSTO
#undef VSTO0
@@ -116,7 +116,7 @@ unsigned char *_bitunpackv32( const unsigned char *__restrict in, unsigned n, un
const unsigned char *ip = in+PAD8(n*b); unsigned m;
__m128i sv;
BITUNPACKV32(in, n, b, out, sv);
- return ip;
+ return (unsigned char *)ip;
}
#undef VSTO
#undef VSTO0
@@ -134,7 +134,7 @@ unsigned char *bitzunpackv32( const unsigned char *__restrict in, unsigned n, un
const unsigned char *ip = in+PAD8(n*b);
__m128i sv = _mm_set1_epi32(start);
BITUNPACKV32(in, n, b, out, sv);
- return ip;
+ return (unsigned char *)ip;
}
#undef VSTO
#undef BITUNPACK0
@@ -149,7 +149,7 @@ unsigned char *bitdunpackv32( const unsigned char *__restrict in, unsigned n, un
const unsigned char *ip = in+PAD8(n*b);
__m128i sv = _mm_set1_epi32(start);
BITUNPACKV32(in, n, b, out, sv);
- return ip;
+ return (unsigned char *)ip;
}
#undef VSTO
#undef VSTO0
@@ -171,7 +171,7 @@ unsigned char *_bitdunpackv32( const unsigned char *__restrict in, unsigned n, u
const unsigned char *ip = in+PAD8(n*b); unsigned m;
__m128i sv = _mm_set1_epi32(start);
BITUNPACKV32(in, n, b, out, sv);
- return ip;
+ return (unsigned char *)ip;
}
#undef VSTO
#undef VSTO0
@@ -188,7 +188,7 @@ unsigned char *bitd1unpackv32( const unsigned char *__restrict in, unsigned n, u
const unsigned char *ip = in+PAD8(n*b);
__m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
BITUNPACKV32(in, n, b, out, sv);
- return ip;
+ return (unsigned char *)ip;
}
#undef VSTO
#undef VSTO0
@@ -209,7 +209,7 @@ unsigned char *_bitd1unpackv32( const unsigned char *__restrict in, unsigned n,
const unsigned char *ip = in+PAD8(n*b); unsigned m;
__m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
BITUNPACKV32(in, n, b, out, sv);
- return ip;
+ return (unsigned char *)ip;
}
#undef VSTO
#undef VSTO0
diff --git a/bitutil.c b/bitutil.c
index a0c543f..4b35125 100644
--- a/bitutil.c
+++ b/bitutil.c
@@ -1,5 +1,5 @@
/**
- Copyright (C) powturbo 2013-2015
+ Copyright (C) powturbo 2013-2016
GPL v2 License
This program is free software; you can redistribute it and/or modify
@@ -33,19 +33,22 @@
_x = (*_p)-__start-__inc; __start = *_p++; __act;\
_x = (*_p)-__start-__inc; __start = *_p++; __act;\
}\
- while(_p < __p+__n) { \
+ while(_p != __p+__n) { \
_x = *_p-__start-__inc; __start = *_p++; __act;\
}\
}
-#define BITUNDELTA(__p, __n, __start, __inc) { typeof(__p[0]) *_p;\
+#define BITUNDELTA(__p, __n, __start, __inc) {\
+ typeof(__p[0]) *_p;\
for(_p = __p; _p != __p+(__n&~(4-1)); ) {\
*_p = (__start += (*_p) + __inc); _p++;\
*_p = (__start += (*_p) + __inc); _p++;\
*_p = (__start += (*_p) + __inc); _p++;\
*_p = (__start += (*_p) + __inc); _p++;\
}\
- while(_p < __p+__n) { *_p = (__start += (*_p) + __inc); _p++; }\
+ while(_p != __p+__n) {\
+ *_p = (__start += (*_p) + __inc); _p++;\
+ }\
}
#define BITMINMAX(__p,__n, __mi, __mx) {\
@@ -56,7 +59,7 @@
if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \
if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \
}\
- while(_p < __p+__n) { \
+ while(_p != __p+__n) { \
if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \
}\
}
@@ -65,29 +68,36 @@ unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start, uns
#ifdef __SSE2__
unsigned *ip,b,*op = out;
__m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(inc), dv;
- for(ip = in; ip != in+(n&~(4-1)); ip += 4) {
+ for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) {
__m128i iv = _mm_loadu_si128((__m128i *)ip);
bv = _mm_or_si128(bv, dv = _mm_sub_epi32(DELTA128_32(iv,sv),cv));
sv = iv;
_mm_storeu_si128((__m128i *)op, dv);
- op += 4;
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
HOR128_32(bv, b);
- while(ip < in+n) { unsigned x = *ip-start-inc; start = *ip++; b |= x; *op++ = x; }
+ while(ip != in+n) {
+ unsigned x = *ip-start-inc;
+ start = *ip++;
+ b |= x;
+ *op++ = x;
+ }
#else
- typeof(in[0]) b = 0,*op = out; BITDELTA(in, n, inc, start, b |= _x;*op++ = _x);
+ typeof(in[0]) b = 0,*op = out;
+ BITDELTA(in, n, inc, start, b |= _x;*op++ = _x);
#endif
return bsr32(b);
}
unsigned bitdelta64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, unsigned inc) {
- typeof(in[0]) b = 0,*op = out; BITDELTA(in, n, inc, start, b |= _x; *op++ = _x);
+ typeof(in[0]) b = 0,*op = out;
+ BITDELTA(in, n, inc, start, b |= _x; *op++ = _x);
return bsr64(b);
}
unsigned bit32(unsigned *in, unsigned n) {
- typeof(in[0]) b; BITSIZE32(in, n, b);
+ typeof(in[0]) b;
+ BITSIZE32(in, n, b);
return b;
}
@@ -119,13 +129,14 @@ unsigned bitd32(unsigned *in, unsigned n, unsigned start) {
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
HOR128_32(bv, b);
- while(ip < in+n) {
+ while(ip != in+n) {
unsigned x = *ip-start;
start = *ip++;
b |= x;
}
#else
- typeof(in[0]) b = 0; BITDELTA(in,n, 0, start, b |= _x);
+ typeof(in[0]) b = 0;
+ BITDELTA(in,n, 0, start, b |= _x);
#endif
return bsr32(b);
}
@@ -141,13 +152,14 @@ unsigned bitd132(unsigned *in, unsigned n, unsigned start) {
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
HOR128_32(bv, b);
- while(ip < in+n) {
+ while(ip != in+n) {
unsigned x = *ip-start-1;
start = *ip++;
b |= x;
}
#else
- typeof(in[0]) b = 0; BITDELTA(in, n, 1, start, b |= _x);
+ typeof(in[0]) b = 0;
+ BITDELTA(in, n, 1, start, b |= _x);
#endif
return bsr32(b);
}
@@ -159,14 +171,13 @@ void bitund132(unsigned *p, unsigned n, unsigned x) {
#ifdef __SSE2__
__m128i sv = _mm_set1_epi32(x), cv = _mm_set_epi32(4,3,2,1);
unsigned *ip;
- for(ip = p; ip != p+(n&~(4-1)); ) {
+ for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
__m128i v = _mm_loadu_si128((__m128i *)ip);
SCANI128_32(v, sv, cv);
_mm_storeu_si128((__m128i *)ip, sv);
- ip += 4;
}
x = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
- while(ip < p+n) {
+ while(ip != p+n) {
*ip = (x += (*ip) + 1);
ip++;
}
@@ -188,18 +199,21 @@ void bitundx64(uint64_t *p, unsigned n, uint64_t x, unsigned inc) { BITUNDELTA(p
_x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\
}\
while(_p != __p+__n) { \
- _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\
+ _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\
}\
}
-#define BITUNZIGZAG(__p, __n, __start) { typeof(__p[0]) *_p, _z;\
+#define BITUNZIGZAG(__p, __n, __start) {\
+ typeof(__p[0]) *_p, _z;\
for(_p = __p; _p != __p+(__n&~(4-1)); ) {\
_z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
_z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
_z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
_z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
}\
- while(_p != __p+__n) { _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++; }\
+ while(_p != __p+__n) {\
+ _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
+ }\
}
unsigned bitz32(unsigned *in, unsigned n, unsigned start) {
@@ -216,10 +230,15 @@ unsigned bitz32(unsigned *in, unsigned n, unsigned start) {
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
HOR128_32(bv, b);
while(ip != in+n) {
- int x = ((int)(*ip)-(int)start); x = (x << 1) ^ (x >> 31); start = *ip++; b |= x;
+ int x = ((int)(*ip)-(int)start);
+ x = (x << 1) ^ (x >> 31);
+ start = *ip++;
+ b |= x;
}
#else
- typeof(in[0]) b = 0,*op = out; int _x; BITZIGZAG(in, n, start, b |= (unsigned)_x);
+ typeof(in[0]) b = 0,*op = out;
+ int _x;
+ BITZIGZAG(in, n, start, b |= (unsigned)_x);
#endif
return bsr32(b);
}
@@ -228,22 +247,27 @@ unsigned bitzigzag32(unsigned *in, unsigned n, unsigned *out, unsigned start) {
#ifdef __SSE2__
unsigned *ip,b,*op = out;
__m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), dv;
- for(ip = in; ip != in+(n&~(4-1)); ip += 4) {
+ for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) {
__m128i iv = _mm_loadu_si128((__m128i *)ip);
dv = DELTA128_32(iv,sv);
sv = iv;
dv = ZIGZAG128_32(dv);
bv = _mm_or_si128(bv, dv);
_mm_storeu_si128((__m128i *)op, dv);
- op += 4;
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
HOR128_32(bv, b);
while(ip != in+n) {
- int x = ((int)(*ip)-(int)start); x = (x << 1) ^ (x >> 31); start = *ip++; b |= x; *op++ = x;
+ int x = ((int)(*ip)-(int)start);
+ x = (x << 1) ^ (x >> 31);
+ start = *ip++;
+ b |= x;
+ *op++ = x;
}
#else
- typeof(in[0]) b = 0,*op = out; int _x; BITZIGZAG(in, n, start, b |= (unsigned)_x; *op++ = _x);
+ typeof(in[0]) b = 0, *op = out;
+ int _x;
+ BITZIGZAG(in, n, start, b |= (unsigned)_x; *op++ = _x);
#endif
return bsr32(b);
}
@@ -252,61 +276,81 @@ void bitunzigzag32(unsigned *p, unsigned n, unsigned start) {
#ifdef __SSE2__
__m128i sv = _mm_set1_epi32(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
unsigned *ip;
- for(ip = p; ip != p+(n&~(4-1)); ) {
+ for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
__m128i iv = _mm_loadu_si128((__m128i *)ip);
iv = UNZIGZAG128_32(iv);
SCAN128_32(iv, sv);
_mm_storeu_si128((__m128i *)ip, sv);
- ip += 4;
}
start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
while(ip != p+n) {
- unsigned z = *ip; *ip = (start += (z >> 1 ^ -(z & 1))); ip++;
+ unsigned z = *ip;
+ *ip++ = (start += (z >> 1 ^ -(z & 1)));
}
#else
BITUNZIGZAG(p, n, start);
#endif
}
-unsigned bitzigzag64(unsigned *in, unsigned n, unsigned *out, unsigned start) {
- typeof(in[0]) b = 0,*op = out; long long _x; BITZIGZAG(in, n, start, b |= (unsigned long long)_x; *op++ = _x);
+unsigned bitzigzag64(uint64_t *in, unsigned n, uint64_t *out, unsigned start) {
+ typeof(in[0]) b = 0,*op = out;
+ long long _x;
+ BITZIGZAG(in, n, start, b |= (unsigned long long)_x; *op++ = _x);
return bsr32(b);
}
-void bitunzigzag64(unsigned *p, unsigned n, unsigned start) {
+void bitunzigzag64(uint64_t *p, unsigned n, unsigned start) {
BITUNZIGZAG(p, n, start);
}
//------------------- De-/Compose Floating Point -----------------------------------------
-void bitdouble(double *in, unsigned n, unsigned *sgn, unsigned *expo, uint64_t *mant) {
+void bitdouble(double *in, unsigned n, int *expo, uint64_t *mant) {
double *ip;
- uint64_t u;
for(ip = in; ip < in+n; ip++) {
- u = *(uint64_t *)ip; BITFLOAT(u, *sgn++, *expo++, *mant++, DMANT_BITS, 1ull);
+ uint64_t u = *(uint64_t *)ip;
+ *expo++ = FLTEXPO(u, DMANT_BITS, 1ull);
+ *mant++ = FLTMANT(u, DMANT_BITS, 1ull);
}
}
-void bitundouble(unsigned *sgn, unsigned *expo, uint64_t *mant, unsigned n, double *out) {
+void bitundouble(int *expo, uint64_t *mant, unsigned n, double *out) {
double *op;
uint64_t u;
- for(op = out; op < out+n; op++) {
- BITUNFLOAT((uint64_t)(*sgn++), (uint64_t)(*expo++), *mant++, u, DMANT_BITS); *op = *(double *)&u;
+ for(op = out; op < out+n; ) {
+ BITUNFLOAT( (int64_t)(*expo++), *mant++, u, DMANT_BITS); *op++ = *(double *)&u;
}
}
-void bitfloat(float *in, unsigned n, unsigned *sgn, unsigned *expo, unsigned *mant) {
- float *ip;
- unsigned u;
+void bitzdouble(double *in, unsigned n, int *expo, uint64_t *mant) {
+ double *ip;
for(ip = in; ip < in+n; ip++) {
- u = *(unsigned *)ip; BITFLOAT(u, *sgn++, *expo++, *mant++, FMANT_BITS, 1u);
+ uint64_t u = *(uint64_t *)ip;
+ *expo++ = zigzagenc32((int)FLTEXPO(u, DZMANT_BITS, 1ull)-1023);
+ *mant++ = FLTMANT(u, DZMANT_BITS, 1ull);
}
}
-void bitunfloat(unsigned *sgn, unsigned *expo, unsigned *mant, unsigned n, float *out) {
+void bitzundouble(int *expo, uint64_t *mant, unsigned n, double *out) {
+ double *op;
+ uint64_t u;
+ for(op = out; op < out+n; ) {
+ BITUNFLOAT( (int64_t)zigzagdec32(*expo++)+1023, *mant++, u, DZMANT_BITS); *op++ = *(double *)&u;
+ }
+}
+
+void bitfloat(float *in, unsigned n, int *expo, unsigned *mant) {
+ float *ip;
+ for(ip = in; ip < in+n; ip++) {
+ unsigned u = *(unsigned *)ip;
+ *expo++ = FLTEXPO(u, FMANT_BITS, 1u);
+ *mant++ = FLTMANT(u, FMANT_BITS, 1u);
+ }
+}
+
+void bitunfloat(int *expo, unsigned *mant, unsigned n, float *out) {
float *op;
unsigned u;
for(op = out; op < out+n; op++) {
- BITUNFLOAT((*sgn++), (*expo++), *mant++, u, FMANT_BITS); *op = *(float *)&u;
+ BITUNFLOAT( (*expo++), *mant++, u, FMANT_BITS); *op = *(float *)&u;
}
}
-
diff --git a/bitutil.h b/bitutil.h
index 4cccece..ae4e667 100644
--- a/bitutil.h
+++ b/bitutil.h
@@ -1,5 +1,5 @@
/**
- Copyright (C) powturbo 2013-2015
+ Copyright (C) powturbo 2013-2016
GPL v2 License
This program is free software; you can redistribute it and/or modify
@@ -24,89 +24,103 @@
// bitutil.h - "Integer Compression"
#include
-#define _BITFORZERO(out, n, start, inc) do {\
- for(i = 0; i != (n&~3); ) {\
- out[i] = start+i*inc; i++;\
- out[i] = start+i*inc; i++;\
- out[i] = start+i*inc; i++;\
- out[i] = start+i*inc; i++;\
- }\
- while(i < n) out[i] = start+i*inc,++i;\
+#define _BITFORZERO(_out_, _n_, _start_, _inc_) do { unsigned _i;\
+ for(_i = 0; _i != (_n_&~3); ) {\
+ _out_[_i] = _start_+_i*_inc_; _i++;\
+ _out_[_i] = _start_+_i*_inc_; _i++;\
+ _out_[_i] = _start_+_i*_inc_; _i++;\
+ _out_[_i] = _start_+_i*_inc_; _i++;\
+ }\
+ while(_i != _n_)\
+ _out_[_i] = _start_+_i*_inc_, ++_i;\
} while(0)
-#define BITSIZE(__in, __n, __b, __usize) { typeof(__in[0]) *_ip;\
- for(__b=0,_ip = __in; _ip != __in+(__n&~(4-1)); )\
- __b |= *_ip++ | *_ip++ | *_ip++ | *_ip++;\
- while(_ip != __in+__n) __b |= *_ip++;\
- __b = TEMPLATE(bsr, __usize)(__b);\
+#define BITSIZE(_in_, _n_, _b_, _usize_) { typeof(_in_[0]) *_ip;\
+ for(_b_=0,_ip = _in_; _ip != _in_+(_n_&~(4-1)); )\
+ _b_ |= *_ip++ | *_ip++ | *_ip++ | *_ip++;\
+ while(_ip != _in_+_n_) \
+ _b_ |= *_ip++;\
+ _b_ = TEMPLATE(bsr, _usize_)(_b_);\
}
-static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; }
-static inline unsigned zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); }
+static inline unsigned zigzagenc31(int x) { x = (x << 2 | ((x>>30)& 2)) ^ x >> 31; return x; }
+static inline unsigned zigzagdec31(unsigned x) { return (x >> 2 | (x& 2)<<30 ) ^ -(x & 1); }
+
+static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; }
+static inline unsigned zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); }
+
+static inline uint64_t zigzagenc64(int64_t x) { return x << 1 ^ x >> 63; }
+static inline uint64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); }
#ifdef __SSE2__
#include
+// SIMD Delta
+#define DELTA128_32(_v_, _sv_) _mm_sub_epi32(_v_, _mm_or_si128(_mm_srli_si128(_sv_, 12), _mm_slli_si128(_v_, 4)))
-#define DELTA128_32(__v, __sv) _mm_sub_epi32(__v, _mm_or_si128(_mm_srli_si128(__sv, 12), _mm_slli_si128(__v, 4)))
+// SIMD Scan ( prefix sum )
+#define SCAN128_32( _v_, _sv_) _v_ = _mm_add_epi32(_v_, _mm_slli_si128(_v_, 4)); _sv_ = _mm_add_epi32(_mm_shuffle_epi32(_sv_, _MM_SHUFFLE(3, 3, 3, 3)), _mm_add_epi32(_mm_slli_si128(_v_, 8), _v_) )
+#define SCANI128_32(_v_, _sv_, _vi_) SCAN128_32(_v_, _sv_); _sv_ = _mm_add_epi32(_sv_, _vi_)
-#define SCAN128_32( __v, __sv) __v = _mm_add_epi32(__v, _mm_slli_si128(__v, 4)); __sv = _mm_add_epi32(_mm_shuffle_epi32(__sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm_add_epi32(_mm_slli_si128(__v, 8), __v) )
-#define SCANI128_32(__v, __sv, __vi) SCAN128_32(__v, __sv); __sv = _mm_add_epi32(__sv, __vi)
+// SIMD ZigZag
+#define ZIGZAG128_32(_v_) _mm_xor_si128(_mm_slli_epi32(_v_,1), _mm_srai_epi32(_v_,31))
+#define UNZIGZAG128_32(_v_) _mm_xor_si128(_mm_srli_epi32(_v_,1), _mm_srai_epi32(_mm_slli_epi32(_v_,31),31) ) //_mm_sub_epi32(cz, _mm_and_si128(iv,c1))
-#define ZIGZAG128_32(__v) _mm_xor_si128(_mm_slli_epi32(__v,1), _mm_srai_epi32(__v,31))
-#define UNZIGZAG128_32(__v) _mm_xor_si128(_mm_srli_epi32(__v,1), _mm_srai_epi32(_mm_slli_epi32(__v,31),31) ) //_mm_sub_epi32(cz, _mm_and_si128(iv,c1))
// SIMD Horizontal OR
-#define HOR128_32(__v,__b) __v = _mm_or_si128(__v, _mm_srli_si128(__v, 8)); __v = _mm_or_si128(__v, _mm_srli_si128(__v, 4)); __b = (unsigned)_mm_cvtsi128_si32(__v)
+#define HOR128_32(_v_,_b_) _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 8)); _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 4)); _b_ = (unsigned)_mm_cvtsi128_si32(_v_)
-#define BITSIZE32(__in, __n, __b) { typeof(__in[0]) *_ip; __m128i v = _mm_setzero_si128();\
- for(_ip = __in; _ip != __in+(__n&~(4-1)); _ip+=4) v = _mm_or_si128(v, _mm_loadu_si128((__m128i*)_ip));\
- HOR128_32(v,__b);\
- while(_ip != __in+__n) __b |= *_ip++;\
- __b = bsr32(__b);\
+#define BITSIZE32(_in_, _n_, _b_) { typeof(_in_[0]) *_ip; __m128i _v = _mm_setzero_si128();\
+ for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip+=4)\
+ _v = _mm_or_si128(_v, _mm_loadu_si128((__m128i*)_ip));\
+ HOR128_32(_v,_b_);\
+ while(_ip != _in_+_n_)\
+ _b_ |= *_ip++;\
+ _b_ = bsr32(_b_);\
}
-
-#define BITZERO32(out, n, start) do {\
- __m128i sv = _mm_set1_epi32(start), *ov = (__m128i *)(out), *ove = (__m128i *)(out + n);\
- do { _mm_storeu_si128(ov++, sv); } while(ov < ove); \
+// SIMD set value
+#define BITZERO32(_out_, _n_, _start_) do {\
+ __m128i _sv_ = _mm_set1_epi32(_start_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
+ do _mm_storeu_si128(_ov++, _sv_); while(_ov < _ove); \
} while(0)
-#define BITFORZERO32(out, n, start, inc) do {\
- __m128i sv = _mm_set1_epi32(start), *ov=(__m128i *)(out), *ove = (__m128i *)(out + n), cv = _mm_set_epi32(3*inc,2*inc,1*inc,0); \
- sv = _mm_add_epi32(sv, cv);\
- cv = _mm_set1_epi32(4);\
- do { _mm_storeu_si128(ov++, sv); sv = _mm_add_epi32(sv, cv); } while(ov < ove);\
+#define BITFORZERO32(_out_, _n_, _start_, _inc_) do {\
+ __m128i _sv = _mm_set1_epi32(_start_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_), _cv = _mm_set_epi32(3*_inc_,2*_inc_,1*_inc_,0); \
+ _sv = _mm_add_epi32(_sv, _cv);\
+ _cv = _mm_set1_epi32(4);\
+ do { _mm_storeu_si128(_ov++, _sv); _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
} while(0)
-#define BITDIZERO32(out, n, start, inc) do { __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(3+inc,2+inc,1+inc,inc), *ov=(__m128i *)(out), *ove = (__m128i *)(out + n);\
- sv = _mm_add_epi32(sv, cv); cv = _mm_set1_epi32(4*inc); do { _mm_storeu_si128(ov++, sv), sv = _mm_add_epi32(sv, cv); } while(ov < ove);\
+#define BITDIZERO32(_out_, _n_, _start_, _inc_) do { __m128i _sv = _mm_set1_epi32(_start_), _cv = _mm_set_epi32(3+_inc_,2+_inc_,1+_inc_,_inc_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
+ _sv = _mm_add_epi32(_sv, _cv); _cv = _mm_set1_epi32(4*_inc_); do { _mm_storeu_si128(_ov++, _sv), _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
} while(0)
#else
-#define BITSIZE32(__in, __n, __b) BITSIZE(__in, __n, __b, 32)
-#define BITFORZERO32(out, n, start, inc) _BITFORZERO(out, n, start, inc)
-#define BITZERO32(out, n, start) _BITFORZERO(out, n, start, 0)
+#define BITSIZE32(_in_, _n_, _b_) BITSIZE(_in_, _n_, _b_, 32)
+#define BITFORZERO32(_out_, _n_, _start_, _inc_) _BITFORZERO(_out_, _n_, _start_, _inc_)
+#define BITZERO32(_out_, _n_, _start_) _BITFORZERO(_out_, _n_, _start_, 0)
#endif
-
-#define DELTR( __in, __n, __mode, __out) { unsigned _v; for( __out[0]=__in[0],_v = 1; _v < __n; _v++) __out[_v] = (__in[_v] - __out[0]) - _v*__mode; }
-#define DELTRB(__in, __n, __mode, __b, __out) { unsigned _v; for(__b=0,__out[0]=__in[0],_v = 1; _v < __n; _v++) __out[_v] = (__in[_v] - __out[0]) - _v*__mode, __b |= __out[_v]; __b = bsr32(__b); }
+#define DELTR( _in_, _n_, _mode_, _out_) { unsigned _v; for( _out_[0]=_in_[0],_v = 1; _v < _n_; _v++) _out_[_v] = (_in_[_v] - _out_[0]) - _v*_mode_; }
+#define DELTRB(_in_, _n_, _mode_, _b_, _out_) { unsigned _v; for(_b_=0,_out_[0]=_in_[0],_v = 1; _v < _n_; _v++) _out_[_v] = (_in_[_v] - _out_[0]) - _v*_mode_, _b_ |= _out_[_v]; _b_ = bsr32(_b_); }
#ifdef __cplusplus
extern "C" {
#endif
-// get maximum bit length of the elements in the integer array
+//------------- get maximum bit length of the elements in the integer array -----------------------
unsigned bit32( unsigned *in, unsigned n);
-// transform sorted integer array to delta array. inc = increment
+//------------- Delta for sorted integer array ----------------------------------------------------
+//-- transform sorted integer array to delta array. inc = increment: out[i] = in[i] - in[i-1] - inc
unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start, unsigned inc);
unsigned bitdelta64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, unsigned inc);
-// get delta maximum bit length of the non decreasing integer array
+//-- get delta maximum bit length of the non decreasing integer array. out[i] = in[i] - in[i-1]
unsigned bitd32( unsigned *in, unsigned n, unsigned start);
-// get delta maximum bit length of the non strictly decreasing integer array
+//-- get delta maximum bit length of the strictly increasing integer array. out[i] = in[i] - in[i-1] - 1
unsigned bitd132( unsigned *in, unsigned n, unsigned start);
+//-- in-place reverse delta transform
void bitund32( unsigned *p, unsigned n, unsigned x);
void bitund64( uint64_t *p, unsigned n, uint64_t x);
@@ -115,32 +129,47 @@ void bitundx64( uint64_t *p, unsigned n, uint64_t x, unsigned inc);
void bitund132( unsigned *p, unsigned n, unsigned x);
-// for
+//------------- FOR array bit length: out[i] = in[i] - start -------------------------------------
+
unsigned bitf32( unsigned *in, unsigned n, unsigned start); // sorted
unsigned bitf132( unsigned *in, unsigned n, unsigned start);
unsigned bitfm32( unsigned *in, unsigned n, unsigned *pmin); // unsorted
unsigned bitf1m32( unsigned *in, unsigned n, unsigned *pmin);
-// zigzag encoding for unsorted integer lists
+//------------- Zigzag encoding for unsorted integer lists: out[i] = zigzag(in[i] - in[i-1]) -----
+
+//-- get maximum zigzag bit length of the integer array
unsigned bitz32( unsigned *in, unsigned n, unsigned start);
+
+//-- Zigzag transform
unsigned bitzigzag32(unsigned *in, unsigned n, unsigned *out, unsigned start);
-unsigned bitzigzag64(unsigned *in, unsigned n, unsigned *out, unsigned start);
+unsigned bitzigzag64(uint64_t *in, unsigned n, uint64_t *out, unsigned start);
+
+//-- Zigzag reverse transform
void bitunzigzag32( unsigned *p, unsigned n, unsigned start);
-void bitunzigzag64( unsigned *p, unsigned n, unsigned start);
+void bitunzigzag64( uint64_t *p, unsigned n, unsigned start);
//---- Floating point to Integer de-/composition ---------------------------------
+#define FMANT_BITS 16
+#define DMANT_BITS 32
+#define DZMANT_BITS 36
-#define FMANT_BITS 23
-#define DMANT_BITS 52
-#define BITFLOAT(__u, __sgn, __expo, __mant, __mantbits, __one) __sgn = __u >> (sizeof(__u)*8-1); __expo = ((__u >> (__mantbits)) & ( (__one<<(sizeof(__u)*8 - 1 - __mantbits)) -1)); __mant = __u & ((__one<<__mantbits)-1);
-#define BITUNFLOAT( __sgn, __expo, __mant, __u, __mantbits) __u = (__sgn) << (sizeof(__u)*8-1) | (__expo) << __mantbits | (__mant)
+#define FLTEXPO(__u,__mantbits, __one) ( ((__u) >> __mantbits) & ( (__one<<(sizeof(__u)*8 - __mantbits)) - 1 ) )
+#define FLTMANT(__u,__mantbits, __one) ((__u) & ((__one<<__mantbits)-1))
+
+#define BITUNFLOAT(__expo, __mant, __u, __mantbits) __u = ((__expo) << __mantbits) | (__mant)//>>1 | (__mant)<<(sizeof(__u)*8 - 1)
+
+/*#define BITFLOAT(__u, __sgn, __expo, __mant, __mantbits, __one) __sgn = __u >> (sizeof(__u)*8-1); __expo = EXPO(__u,__mantbits; __mant = __u & ((__one<<__mantbits)-1)
+#define BITUNFLOAT( __sgn, __expo, __mant, __u, __mantbits) __u = (__sgn) << (sizeof(__u)*8-1) | (__expo) << __mantbits | (__mant) */
// De-/Compose floating point array to/from integer arrays (sign,exponent,mantissa) for using with "Integer Compression" functions ------------
-void bitdouble( double *in, unsigned n, unsigned *sgn, unsigned *expo, uint64_t *mant);
-void bitundouble( unsigned *sgn, unsigned *expo, uint64_t *mant, unsigned n, double *out);
-void bitfloat( float *in, unsigned n, unsigned *sgn, unsigned *expo, unsigned *mant);
-void bitunfloat( unsigned *sgn, unsigned *expo, unsigned *mant, unsigned n, float *out);
+void bitdouble( double *in, unsigned n, int *expo, uint64_t *mant);
+void bitundouble( int *expo, uint64_t *mant, unsigned n, double *out);
+void bitzdouble( double *in, unsigned n, int *expo, uint64_t *mant);
+void bitzundouble( int *expo, uint64_t *mant, unsigned n, double *out);
+void bitfloat( float *in, unsigned n, int *expo, unsigned *mant);
+void bitunfloat( int *expo, unsigned *mant, unsigned n, float *out);
#ifdef __cplusplus
}
diff --git a/conf.h b/conf.h
index 73f7cd9..27c7a96 100644
--- a/conf.h
+++ b/conf.h
@@ -38,13 +38,19 @@
#define popcnt64(_x_) __builtin_popcountll(_x_)
#if defined(__i386__) || defined(__x86_64__)
-static inline int __bsr32(int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
-static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
-static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; }
-#define bsr16(_x_) bsr32(_x_)
+static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
+static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
+static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; }
+#define bsr16(_x_) bsr32(_x_)
+
+static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+
#else
-static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; }
-static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; }
+static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; }
+static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; }
+static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); }
+static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); }
#endif
#define ctz64(_x_) __builtin_ctzll(_x_)
@@ -65,6 +71,8 @@ static inline int bsr64(unsigned long long x) { unsigned long z = 0; _BitScanFor
static inline int ctz64(unsigned long long x) { unsigned long z = 0; _BitScanForward64(&z, x); return z; }
#endif
static inline int ctz32(unsigned x) { unsigned z = 0; _BitScanForward(&z, x); return z; }
+#define rol32(x,s) _lrotl(x, s)
+#define ror32(x,s) _lrotr(x, s)
#define fseeko _fseeki64
#define ftello _ftelli64
#define sleep(x) Sleep(x/1000)
diff --git a/icbench.c b/icbench.c
index a4ac427..3e8e09f 100644
--- a/icbench.c
+++ b/icbench.c
@@ -1,5 +1,5 @@
/**
- Copyright (C) powturbo 2013-2015
+ Copyright (C) powturbo 2013-2016
GPL v2 License
This program is free software; you can redistribute it and/or modify
@@ -1080,8 +1080,8 @@ int main(int argc, char *argv[]) { int r;
uint64_t *mantissa = malloc(n*sizeof(mantissa[0]));
unsigned *sign = malloc(n*sizeof(sign[0]));
unsigned *exp = malloc(n*sizeof(exp[0])); if(!mantissa || !exp || !sign || !dcpy) die("alloc error\n");
- bitdouble( din, n, sign, exp, mantissa);
- bitundouble( sign, exp, mantissa, n, dcpy);
+ bitdouble( din, n, exp, mantissa);
+ bitundouble( exp, mantissa, n, dcpy);
int i; for(i=0;i < n; i++) { printf("%d,%d,%llu,%e,%e\n", sign[i], exp[i],(long long unsigned int)mantissa[i], din[i], dcpy[i]); if(din[i]!=dcpy[i]) die("check error at %d %e %e\n", i, din[i], dcpy[i]); }
free(din); free(mantissa); free(exp); free(sign); free(dcpy);
exit(0);
diff --git a/vint.c b/vint.c
index 55698d2..2737426 100644
--- a/vint.c
+++ b/vint.c
@@ -1,5 +1,5 @@
/**
- Copyright (C) powturbo 2013-2015
+ Copyright (C) powturbo 2013-2016
GPL v2 License
This program is free software; you can redistribute it and/or modify
@@ -22,67 +22,45 @@
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// vint.c - "Integer Compression" variable byte
-#include
+#include
#include "conf.h"
#include "vint.h"
#include "bitutil.h"
-#define _vbputu32(__op, __x, __act) {\
- if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\
- else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\
- else if(likely(__x < (1<<21))) { *(unsigned *)__op = __x << 3 | 0x03; __op += 3; __act;}\
- else if(likely(__x < (1<<28))) { *(unsigned *)__op = __x << 4 | 0x07; __op += 4; __act;}\
- else { *(unsigned *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\
-}
-
-#define _vbgetu32(__ip, __x, __act) do {\
- if(!((__x = *__ip) & (1<<0))) { __ip++; __x >>= 1; __act;}\
- else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\
- else if(!(__x & (1<<2))) { __x = (*(unsigned *)__ip & 0xffffffu) >> 3; __ip += 3; __act;}\
- else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\
- else { __x = (unsigned long long)(*(unsigned *)__ip) >> 4 | (unsigned long long)(__ip[4]) << 28; __ip += 5; __act;}\
-} while(0)
-
-#define vbputu32(__op, __x) { unsigned _x_ = __x; _vbputu32(__op, _x_, ;); }
-
//-------------------------------------- variable byte : 32 bits ----------------------------------------------------------------
- #if defined(__AVX2__) && defined(__AVX2__VINT)
-#include
-#define M1 0xfeull //7
-#define M2 0xfffcull //14
-#define M3 0xfffff8ull //21
-#define M4 0xfffffff0ull //28
-#define M5 0xfffffffff0ull //36
-
- //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
-unsigned long long mtab[] = { M1, M2, M1, M3, M1, M2, M1, M4, M1, M2, M1, M3, M1, M2, M1, M5 };
- #endif
-//------------------------------------------------------------------------------------------------------------------------
//0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
-unsigned char vtab[] = { 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5 };
+unsigned char vtab[] = { 1, 1, 1, 1, 1, 1, 1, 1, 5, 4, 3, 3, 2, 2, 2, 2 };
// decompress buffer into an array of n unsigned values. Return value = end of decompressed buffer in
-unsigned char *vbdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out) { unsigned x,*op;
- for(op = out; op != out+(n&~(4-1)); op += 4) {
- _vbgetu32(in, x, op[0] = x);
- _vbgetu32(in, x, op[1] = x);
- _vbgetu32(in, x, op[2] = x);
- _vbgetu32(in, x, op[3] = x);
+unsigned char *vbdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out) { register unsigned x, *op;
+ for(op = out; op != out+(n&~(8-1)); op += 8) {
+ _vbget32(in, x, op[0] = x);
+ _vbget32(in, x, op[1] = x);
+ _vbget32(in, x, op[2] = x);
+ _vbget32(in, x, op[3] = x); __builtin_prefetch(in+256, 0);
+ _vbget32(in, x, op[4] = x);
+ _vbget32(in, x, op[5] = x);
+ _vbget32(in, x, op[6] = x);
+ _vbget32(in, x, op[7] = x);
}
- while(op != out+n) { _vbgetu32(in, x, ; ); *op++ = x; }
+ while(op != out+n) _vbget32(in, x, *op++ = x );
return in;
}
// encode array with n unsigned (32 bits in[n]) values to the buffer out. Return value = end of compressed buffer out
-unsigned char *vbenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out) { unsigned *ip;
- for(ip = in; ip != in+(n&~(4-1)); ) {
- vbputu32(out, *ip++);
- vbputu32(out, *ip++);
- vbputu32(out, *ip++);
- vbputu32(out, *ip++);
+unsigned char *vbenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out) { register unsigned x, *ip;
+ for(ip = in; ip != in+(n&~(8-1)); ip += 8) { __builtin_prefetch(ip+128, 0);
+ x = ip[0]; _vbput32(out, x, ;);
+ x = ip[1]; _vbput32(out, x, ;);
+ x = ip[2]; _vbput32(out, x, ;);
+ x = ip[3]; _vbput32(out, x, ;);
+ x = ip[4]; _vbput32(out, x, ;);
+ x = ip[5]; _vbput32(out, x, ;);
+ x = ip[6]; _vbput32(out, x, ;);
+ x = ip[7]; _vbput32(out, x, ;);
}
- while(ip != in+n) vbputu32(out, *ip++);
+ while(ip != in+n) { x = *ip++; _vbput32(out, x, ;); }
return out;
}
@@ -113,28 +91,28 @@ unsigned char *vbenc64(uint64_t *__restrict in, unsigned n, unsigned char *__res
unsigned char *vbdenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start) {
unsigned *ip,v;
for(ip = in; ip != in+(n&~(4-1)); ) {
- v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;);
- v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;);
- v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;);
- v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;);
+ v = (*ip)-start; start=*ip++; _vbput32(out, v, ;);
+ v = (*ip)-start; start=*ip++; _vbput32(out, v, ;);
+ v = (*ip)-start; start=*ip++; _vbput32(out, v, ;);
+ v = (*ip)-start; start=*ip++; _vbput32(out, v, ;);
}
- while(ip < in+n) { v = (*ip)-start; start = *ip++; _vbputu32(out, v, ;); }
+ while(ip < in+n) { v = (*ip)-start; start = *ip++; _vbput32(out, v, ;); }
return out;
}
unsigned char *vbddec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) {
unsigned x,*op;
for(op = out; op != out+(n&~(8-1)); ) {
- _vbgetu32(in, x, ;); *op++ = (start += x);
- _vbgetu32(in, x, ;); *op++ = (start += x);
- _vbgetu32(in, x, ;); *op++ = (start += x);
- _vbgetu32(in, x, ;); *op++ = (start += x);
- _vbgetu32(in, x, ;); *op++ = (start += x);
- _vbgetu32(in, x, ;); *op++ = (start += x);
- _vbgetu32(in, x, ;); *op++ = (start += x);
- _vbgetu32(in, x, ;); *op++ = (start += x);
+ _vbget32(in, x, ;); *op++ = (start += x);
+ _vbget32(in, x, ;); *op++ = (start += x);
+ _vbget32(in, x, ;); *op++ = (start += x);
+ _vbget32(in, x, ;); *op++ = (start += x);
+ _vbget32(in, x, ;); *op++ = (start += x);
+ _vbget32(in, x, ;); *op++ = (start += x);
+ _vbget32(in, x, ;); *op++ = (start += x);
+ _vbget32(in, x, ;); *op++ = (start += x);
}
- while(op != out+n) _vbgetu32(in, x, *op++ = (start += x));
+ while(op != out+n) _vbget32(in, x, *op++ = (start += x));
return in;
}
@@ -147,21 +125,21 @@ unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__r
v = in[0] - start - 1;
unsigned long long u = (unsigned long long)v<<1;
if(n == 1) u |= 1;
- _vbputu32(op, u, ;);
+ _vbput32(op, u, ;);
if(!--n) return op;
start = *in++;
#endif
for(ip = in; ip != in + (n&~(4-1)); ) {
- v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v;
- v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v;
- v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v;
- v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v;
+ v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v;
+ v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v;
+ v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v;
+ v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v;
}
- while(ip != in+n) { v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v; }
+ while(ip != in+n) { v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v; }
#ifdef VINT_Z
if(!b) {
u = (unsigned long long)in[-1] << 1 | 1;
- _vbputu32(out, u, ;);
+ _vbput32(out, u, ;);
return out;
}
#endif
@@ -171,7 +149,7 @@ unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__r
unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) {
unsigned x,*op;
#ifdef VINT_Z
- unsigned long long u; _vbgetu32(in, u, ;); x = u>>1; *out = (start += x+1);
+ unsigned long long u; _vbget32(in, u, ;); x = u>>1; *out = (start += x+1);
if(u & 1) {
#ifdef __SSE2__
out++; --n; BITDIZERO32(out, n, start, 1);
@@ -184,16 +162,16 @@ unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__r
#endif
for(op = out; op != out+(n&~(8-1)); ) {
- _vbgetu32(in, x, ++x); *op++ = (start += x);
- _vbgetu32(in, x, ++x); *op++ = (start += x);
- _vbgetu32(in, x, ++x); *op++ = (start += x);
- _vbgetu32(in, x, ++x); *op++ = (start += x);
- _vbgetu32(in, x, ++x); *op++ = (start += x);
- _vbgetu32(in, x, ++x); *op++ = (start += x);
- _vbgetu32(in, x, ++x); *op++ = (start += x);
- _vbgetu32(in, x, ++x); *op++ = (start += x);
+ _vbget32(in, x, ++x); *op++ = (start += x);
+ _vbget32(in, x, ++x); *op++ = (start += x);
+ _vbget32(in, x, ++x); *op++ = (start += x);
+ _vbget32(in, x, ++x); *op++ = (start += x);
+ _vbget32(in, x, ++x); *op++ = (start += x);
+ _vbget32(in, x, ++x); *op++ = (start += x);
+ _vbget32(in, x, ++x); *op++ = (start += x);
+ _vbget32(in, x, ++x); *op++ = (start += x);
}
- while(op != out+n) { _vbgetu32(in, x, ++x); *op++ = (start += x); }
+ while(op != out+n) { _vbget32(in, x, ++x); *op++ = (start += x); }
return in;
}
@@ -208,27 +186,51 @@ unsigned char *vbdec16(unsigned char *__restrict in, unsigned n, unsigned short
unsigned char *vbzenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start) {
unsigned *ip,v;
for(ip = in; ip != in+(n&~(4-1)); ) {
- v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;);
- v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;);
- v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;);
- v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;);
+ v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;);
+ v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;);
+ v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;);
+ v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;);
}
- while(ip < in+n) { v = zigzagenc32((*ip)-start); start = *ip++; _vbputu32(out, v, ;); }
+ while(ip < in+n) { v = zigzagenc32((*ip)-start); start = *ip++; _vbput32(out, v, ;); }
return out;
}
unsigned char *vbzdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) {
unsigned x,*op;
for(op = out; op != out+(n&~(8-1)); ) {
- _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
- _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
- _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
- _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
- _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
- _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
- _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
- _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
+ _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+ _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+ _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+ _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+ _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+ _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+ _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+ _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
}
- while(op != out+n) _vbgetu32(in, x, *op++ = (start += zigzagdec32(x)));
+ while(op != out+n) _vbget32(in, x, *op++ = (start += zigzagdec32(x)));
+ return in;
+}
+
+unsigned char *vbzenc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start) {
+ uint64_t *ip,v;
+ for(ip = in; ip != in+(n&~(4-1)); ) {
+ v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;);
+ v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;);
+ v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;);
+ v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;);
+ }
+ while(ip < in+n) { v = zigzagenc64((*ip)-start); start = *ip++; _vbput64(out, v, ;); }
+ return out;
+}
+
+unsigned char *vbzdec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start) {
+ uint64_t x,*op;
+ for(op = out; op != out+(n&~(4-1)); ) {
+ _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x));
+ _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x));
+ _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x));
+ _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x));
+ }
+ while(op != out+n) _vbget64(in, x, *op++ = (start += zigzagdec64(x)));
return in;
}
diff --git a/vint.h b/vint.h
index b8f47e6..ec9426c 100644
--- a/vint.h
+++ b/vint.h
@@ -1,5 +1,5 @@
/**
- Copyright (C) powturbo 2013-2015
+ Copyright (C) powturbo 2013-2016
GPL v2 License
This program is free software; you can redistribute it and/or modify
@@ -31,39 +31,31 @@
extern "C" {
#endif
-//--------- 32 bits ------------------
+//--------------------------- 32 bits ---------------------------------------------------------------------------------------
extern unsigned char vtab[];
-#define vbvlen32(__x) vtab[(__x)&0xf]
+#define vbvlen32(__x) vtab[((unsigned char)(__x))>>4]
#define _vbput32(__op, __x, __act) {\
- if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\
- else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\
- else if(likely(__x < (1<<21))) { *(unsigned short *)__op = __x << 3 | 0x03; __op += 2; *__op++ = __x >> 13; __act;}\
- else if(likely(__x < (1<<28))) { *(unsigned *)__op = __x << 4 | 0x07; __op += 4; __act;}\
- else { *(unsigned *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\
+ if(likely(__x < (1<< 7))) { *__op++ = __x; __act;}\
+ else if(likely(__x < (1<<14))) { ctou16(__op) = __x << 8 | __x >> 8 | 0x80; __op += 2; __act;}\
+ else if(likely(__x < (1<<21))) { *__op++ = __x >> 16 | 0xc0; ctou32(__op) = __x; __op += 2; __act;}\
+ else if(likely(__x < (1<<28))) { ctou32(__op) = rol32(__x,8) | 0xe0; __op += 4; __act;}\
+ else { *__op++ = (unsigned long long)__x >> 32 | 0xf0; ctou32(__op) = __x; __op += 4; __act;}\
}
-//#define __AVX2__VINT
- #if defined(__AVX2__) && defined(__AVX2__VINT)
-#include
-
-extern unsigned long long mtab[];
-
-#define _vbget32(__ip, __x, __act) do { unsigned _vdx=(*__ip)&0xf; __x = _pext_u64(*(unsigned long long *)__ip, mtab[_vdx]); __ip+=vtab[_vdx]; __act; } while(0)
- #else
-#define _vbget32(__ip, __x, __act) do {\
- if(!((__x = *__ip) & (1<<0))) { __ip++; __x >>= 1; __act;}\
- else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\
- else if(!(__x & (1<<2))) { __x = (*(unsigned short *)__ip) >> 3 | (unsigned)(*(__ip+2)) << 13; __ip += 3; __act;}\
- else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\
- else { __x = (unsigned long long)(*(unsigned *)__ip) >> 4 | (unsigned long long)(__ip[4]) << 28; __ip += 5; __act;}\
+#define _vbget32(__ip, __x, __act) do { __x = *__ip++;\
+ if(!(__x & 0x80)) { __act;}\
+ else if(!(__x & 0x40)) { __x = (__x & 0x3f)<< 8 | *__ip++; __act;}\
+ else if(!(__x & 0x20)) { __x = (__x & 0x1f)<<16 | ctou16(__ip); __ip += 2; __act;}\
+ else if(!(__x & 0x10)) { __x = ror32(ctou32(__ip-1),8) & 0xfffffff; __ip += 3; __act;}\
+ else { __x = (unsigned long long)(__x & 0x07)<<32 | ctou32(__ip); __ip += 4; __act;}\
} while(0)
- #endif
-//----------------- 16 bits --------------------------
+//----------------- 16 bits -------------------------------------------------------------------------------------------------------
#define _vbput16(__op, __x) _vbput32(__op, __x)
#define _vbget16(__ip, __x, __act) _vbget32(__ip, __x, __act)
-//----------------- 64 bits --------------------------
+
+//----------------- 64 bits -------------------------------------------------------------------------------------------------------
#define _vbput64(__op, __x, __act) {\
if(__x < 1 << 7) { *__op++ = __x << 1; __act;}\
else if(__x < 1 <<14) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\
@@ -96,8 +88,8 @@ extern unsigned long long mtab[];
#define vbput16(__op, __x) vbput32(__op, __x)
#define vbget16(__ip) vbget32(__ip)
-#define vbput32(__op, __x) { unsigned _x_ = __x; _vbput32(__op, _x_, ;); }
-#define vbget32(__ip) ({ unsigned _x_; _vbget32(__ip, _x_, ;); _x_; })
+#define vbput32(__op, __x) { register unsigned _x_ = __x; _vbput32(__op, _x_, ;); }
+#define vbget32(__ip) ({ register unsigned _x_; _vbget32(__ip, _x_, ;); _x_; })
#define vbput64(__op, __x) { unsigned long long _x_ = __x; _vbput64(__op, _x_, ;); }
#define vbget64(__ip) ({ unsigned long long _x_; _vbget64(__ip, _x_, ;); _x_; })
@@ -122,6 +114,8 @@ unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned
//------ zigzag encoding integer lists -------------------------------------------------------------
unsigned char *vbzenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *vbzdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
+unsigned char *vbzenc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
+unsigned char *vbzdec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
//--- 15 bits integer lists ------------
#define vbput15(__op, __x) do { unsigned _x = __x; if(likely(_x < 0x80)) *__op++ = _x; else { *__op++ = (_x) >> 8 | 0x80; *__op++ = _x; } } while(0)