diff --git a/README.md b/README.md index 926ba27..5ca8901 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/po

+ **Variable byte** - :sparkles: Scalar **"Variable Byte"** faster and more efficient than **ANY** other (incl. SIMD MaskedVByte) implementation + - :new: **now up to 25% faster**

+ **Simple family** - :sparkles: **Novel** **"Variable Simple"** (incl. **RLE**) faster and more efficient than simple16, simple-8b @@ -71,7 +72,7 @@ CPU: Sandy bridge i7-2600k at 4.2GHz, gcc 5.1, ubuntu 15.04, single thread. | 99.910.930| 24.98| 7.99| 2524.50|1943.41|[SIMDPack FPF](#FastPFor)| | 99.910.930| 24.98| 7.99| 1883.21|1898.11|**TurboPack**| | 99.910.930| 24.98| 7.99| 1877.25| 935.83|**TurboForDA**| -|102.074.663| 25.52| 8.17| 1621.64|1694.64|**TurboVbyte**| +|102.074.663| 25.52| 8.17| 1993.95|1827.04|**TurboVbyte**| |102.074.663| 25.52|8.17|1214.12|1688.95|[MaskedVByte](#MaskedVByte)| |102.074.663| 25.52| 8.17| 1178.72| 949.59|[Vbyte FPF](#FastPFor)| |103.035.930| 25.76| 8.24| 1480.47|1746.51|[libfor](#libfor)| @@ -90,16 +91,16 @@ CPU: Skylake i7-6700 w/ only 3.7GHz | 63392801| 15.85| 5.07| 387.30| 243.62|**TurboPForDA**| | 65359916| 16.34| 5.23| 7.58| 609.12|OptPFD| | 73477088| 18.37| 5.88| 101.68| 621.37|Simple16| -| 78514276| 19.63| 6.28|256.83|676.45|**VSimple**| -| 95915096| 23.98| 7.67| 211.79|954.62|Simple-8b| +| 78514276| 19.63| 6.28|258.31|691.48|**VSimple**| +| 95915096| 23.98| 7.67| 211.79|957.62|Simple-8b| | 98546814| 24.64| 7.88| 70.85|**2349.71**|[QMX](#QMX)| | 99910930| 24.98| 7.99|**3537.57**|**3081.79**|**TurboPackV**| | 99910930| 24.98| 7.99| 3099.52|3071.77|SIMDPack FPF| -| 99910930| 24.98| 7.99| 2050.47|2402.54|**TurboPack**| +| 99910930| 24.98| 7.99| 2095.79|2495.22|**TurboPack**| | 99910930| 24.98| 7.99| 2049.85|2364.52|**TurboFor**| | 99910930| 24.98| 7.99| 2049.70|1124.12|**TurboForDA**| |102074663| 25.52| 8.17| 1354.42|1745.69|MaskedVByte| -|102074663| 25.52| 8.17| 1660.76|1626.67|**TurboVbyte**| +|102074663| 25.52| 8.17| 1825.64|1844.34|**TurboVbyte**| |102074663| 25.52| 8.17| 1249.77|1051.85|Vbyte FPF| |112500000| 28.12| 9.00| 466.94|3003.70|VarintG8IU| |128125000| 32.03| 10.25| 1109.67|1271.32|[StreamVbyte FPF](#FastPFor)| @@ -310,4 +311,4 @@ header files to use with documentation:
- [On Inverted Index Compression for Search Engine Efficiency](http://www.dcs.gla.ac.uk/~craigm/publications/catena14compression.pdf) - [Google's Group Varint Encoding](http://static.googleusercontent.com/media/research.google.com/de//people/jeff/WSDM09-keynote.pdf) -Last update: 27 MAR 2016 +Last update: 08 APR 2016 diff --git a/bitunpack.c b/bitunpack.c index dbfe47d..0dca5f7 100644 --- a/bitunpack.c +++ b/bitunpack.c @@ -35,17 +35,17 @@ #define DSTI(__op) #define BPI(__w, __x, __parm) __w #include __FILE__ -unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return ip; } -unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return ip; } -unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK64(in, n, b, out, 0); return ip; } +unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return (unsigned char *)ip; } +unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return (unsigned char *)ip; } +unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK64(in, n, b, out, 0); return (unsigned char *)ip; } #undef BPI #undef DSTI //----------------------------------------------------------------------------------------------------------------- #define DSTI(__op) #define BPI(__w, __x, __parm) (__parm += (__w) + 1) 
#include __FILE__ -unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } -unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } +unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } #undef BPI #undef DSTI @@ -53,8 +53,8 @@ unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, uns #define DSTI(__op) #define BPI(__w, __x, __parm) (__parm += (__w)) #include __FILE__ -unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } -unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } +unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const 
unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } #undef BPI #undef DSTI @@ -63,7 +63,7 @@ unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, uns #define DSTI(__op) #define BPI(__w, __x, __parm) (__parm += zigzagdec32(__w)) #include __FILE__ -unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } //unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } #undef BPI #undef DSTI @@ -73,8 +73,8 @@ unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, uns #define BPI(__w, __x, __parm) (__parm + (__w)) #include __FILE__ -unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } -unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } +unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned 
start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } #undef BPI #undef DSTI @@ -82,8 +82,8 @@ unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, uns #define DSTI(__op) start += 32 #define BPI(__w, __x, __parm) (__parm + (__w)+__x+1) #include __FILE__ -unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } -unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } +unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; } #undef BPI #undef DSTI diff --git a/bitunpackv.c b/bitunpackv.c index 29ad14b..7405626 100644 --- a/bitunpackv.c +++ b/bitunpackv.c @@ -78,7 +78,7 @@ unsigned char *bitunpackv32( const unsigned char *__restrict in, unsigned n, uns const unsigned char *ip = in+PAD8(n*b); __m128i sv; BITUNPACKV32(in, n, b, out, sv); - return ip; + return (unsigned char *)ip; } #undef VSTO #undef VSTO0 @@ -116,7 +116,7 @@ unsigned char *_bitunpackv32( const unsigned char *__restrict in, unsigned n, un const unsigned char *ip = in+PAD8(n*b); unsigned m; __m128i sv; BITUNPACKV32(in, n, b, out, sv); - return ip; + return (unsigned char *)ip; } #undef VSTO #undef VSTO0 @@ -134,7 +134,7 @@ unsigned char *bitzunpackv32( const 
unsigned char *__restrict in, unsigned n, un const unsigned char *ip = in+PAD8(n*b); __m128i sv = _mm_set1_epi32(start); BITUNPACKV32(in, n, b, out, sv); - return ip; + return (unsigned char *)ip; } #undef VSTO #undef BITUNPACK0 @@ -149,7 +149,7 @@ unsigned char *bitdunpackv32( const unsigned char *__restrict in, unsigned n, un const unsigned char *ip = in+PAD8(n*b); __m128i sv = _mm_set1_epi32(start); BITUNPACKV32(in, n, b, out, sv); - return ip; + return (unsigned char *)ip; } #undef VSTO #undef VSTO0 @@ -171,7 +171,7 @@ unsigned char *_bitdunpackv32( const unsigned char *__restrict in, unsigned n, u const unsigned char *ip = in+PAD8(n*b); unsigned m; __m128i sv = _mm_set1_epi32(start); BITUNPACKV32(in, n, b, out, sv); - return ip; + return (unsigned char *)ip; } #undef VSTO #undef VSTO0 @@ -188,7 +188,7 @@ unsigned char *bitd1unpackv32( const unsigned char *__restrict in, unsigned n, u const unsigned char *ip = in+PAD8(n*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); BITUNPACKV32(in, n, b, out, sv); - return ip; + return (unsigned char *)ip; } #undef VSTO #undef VSTO0 @@ -209,7 +209,7 @@ unsigned char *_bitd1unpackv32( const unsigned char *__restrict in, unsigned n, const unsigned char *ip = in+PAD8(n*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); BITUNPACKV32(in, n, b, out, sv); - return ip; + return (unsigned char *)ip; } #undef VSTO #undef VSTO0 diff --git a/bitutil.c b/bitutil.c index a0c543f..4b35125 100644 --- a/bitutil.c +++ b/bitutil.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2016 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -33,19 +33,22 @@ _x = (*_p)-__start-__inc; __start = *_p++; __act;\ _x = (*_p)-__start-__inc; __start = *_p++; __act;\ }\ - while(_p < __p+__n) { \ + while(_p != __p+__n) { \ _x = *_p-__start-__inc; __start = *_p++; __act;\ }\ } -#define BITUNDELTA(__p, __n, __start, __inc) { typeof(__p[0]) 
*_p;\ +#define BITUNDELTA(__p, __n, __start, __inc) {\ + typeof(__p[0]) *_p;\ for(_p = __p; _p != __p+(__n&~(4-1)); ) {\ *_p = (__start += (*_p) + __inc); _p++;\ *_p = (__start += (*_p) + __inc); _p++;\ *_p = (__start += (*_p) + __inc); _p++;\ *_p = (__start += (*_p) + __inc); _p++;\ }\ - while(_p < __p+__n) { *_p = (__start += (*_p) + __inc); _p++; }\ + while(_p != __p+__n) {\ + *_p = (__start += (*_p) + __inc); _p++;\ + }\ } #define BITMINMAX(__p,__n, __mi, __mx) {\ @@ -56,7 +59,7 @@ if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \ if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \ }\ - while(_p < __p+__n) { \ + while(_p != __p+__n) { \ if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \ }\ } @@ -65,29 +68,36 @@ unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start, uns #ifdef __SSE2__ unsigned *ip,b,*op = out; __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(inc), dv; - for(ip = in; ip != in+(n&~(4-1)); ip += 4) { + for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); bv = _mm_or_si128(bv, dv = _mm_sub_epi32(DELTA128_32(iv,sv),cv)); sv = iv; _mm_storeu_si128((__m128i *)op, dv); - op += 4; } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); HOR128_32(bv, b); - while(ip < in+n) { unsigned x = *ip-start-inc; start = *ip++; b |= x; *op++ = x; } + while(ip != in+n) { + unsigned x = *ip-start-inc; + start = *ip++; + b |= x; + *op++ = x; + } #else - typeof(in[0]) b = 0,*op = out; BITDELTA(in, n, inc, start, b |= _x;*op++ = _x); + typeof(in[0]) b = 0,*op = out; + BITDELTA(in, n, inc, start, b |= _x;*op++ = _x); #endif return bsr32(b); } unsigned bitdelta64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, unsigned inc) { - typeof(in[0]) b = 0,*op = out; BITDELTA(in, n, inc, start, b |= _x; *op++ = _x); + typeof(in[0]) b = 0,*op = out; + BITDELTA(in, n, inc, start, b |= _x; *op++ = _x); return bsr64(b); } unsigned 
bit32(unsigned *in, unsigned n) { - typeof(in[0]) b; BITSIZE32(in, n, b); + typeof(in[0]) b; + BITSIZE32(in, n, b); return b; } @@ -119,13 +129,14 @@ unsigned bitd32(unsigned *in, unsigned n, unsigned start) { start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); HOR128_32(bv, b); - while(ip < in+n) { + while(ip != in+n) { unsigned x = *ip-start; start = *ip++; b |= x; } #else - typeof(in[0]) b = 0; BITDELTA(in,n, 0, start, b |= _x); + typeof(in[0]) b = 0; + BITDELTA(in,n, 0, start, b |= _x); #endif return bsr32(b); } @@ -141,13 +152,14 @@ unsigned bitd132(unsigned *in, unsigned n, unsigned start) { start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); HOR128_32(bv, b); - while(ip < in+n) { + while(ip != in+n) { unsigned x = *ip-start-1; start = *ip++; b |= x; } #else - typeof(in[0]) b = 0; BITDELTA(in, n, 1, start, b |= _x); + typeof(in[0]) b = 0; + BITDELTA(in, n, 1, start, b |= _x); #endif return bsr32(b); } @@ -159,14 +171,13 @@ void bitund132(unsigned *p, unsigned n, unsigned x) { #ifdef __SSE2__ __m128i sv = _mm_set1_epi32(x), cv = _mm_set_epi32(4,3,2,1); unsigned *ip; - for(ip = p; ip != p+(n&~(4-1)); ) { + for(ip = p; ip != p+(n&~(4-1)); ip += 4) { __m128i v = _mm_loadu_si128((__m128i *)ip); SCANI128_32(v, sv, cv); _mm_storeu_si128((__m128i *)ip, sv); - ip += 4; } x = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); - while(ip < p+n) { + while(ip != p+n) { *ip = (x += (*ip) + 1); ip++; } @@ -188,18 +199,21 @@ void bitundx64(uint64_t *p, unsigned n, uint64_t x, unsigned inc) { BITUNDELTA(p _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\ }\ while(_p != __p+__n) { \ - _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\ + _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\ }\ } -#define BITUNZIGZAG(__p, __n, __start) { typeof(__p[0]) *_p, _z;\ +#define BITUNZIGZAG(__p, __n, __start) {\ + 
typeof(__p[0]) *_p, _z;\ for(_p = __p; _p != __p+(__n&~(4-1)); ) {\ _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ }\ - while(_p != __p+__n) { _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++; }\ + while(_p != __p+__n) {\ + _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ + }\ } unsigned bitz32(unsigned *in, unsigned n, unsigned start) { @@ -216,10 +230,15 @@ unsigned bitz32(unsigned *in, unsigned n, unsigned start) { start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); HOR128_32(bv, b); while(ip != in+n) { - int x = ((int)(*ip)-(int)start); x = (x << 1) ^ (x >> 31); start = *ip++; b |= x; + int x = ((int)(*ip)-(int)start); + x = (x << 1) ^ (x >> 31); + start = *ip++; + b |= x; } #else - typeof(in[0]) b = 0,*op = out; int _x; BITZIGZAG(in, n, start, b |= (unsigned)_x); + typeof(in[0]) b = 0,*op = out; + int _x; + BITZIGZAG(in, n, start, b |= (unsigned)_x); #endif return bsr32(b); } @@ -228,22 +247,27 @@ unsigned bitzigzag32(unsigned *in, unsigned n, unsigned *out, unsigned start) { #ifdef __SSE2__ unsigned *ip,b,*op = out; __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), dv; - for(ip = in; ip != in+(n&~(4-1)); ip += 4) { + for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); dv = DELTA128_32(iv,sv); sv = iv; dv = ZIGZAG128_32(dv); bv = _mm_or_si128(bv, dv); _mm_storeu_si128((__m128i *)op, dv); - op += 4; } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); HOR128_32(bv, b); while(ip != in+n) { - int x = ((int)(*ip)-(int)start); x = (x << 1) ^ (x >> 31); start = *ip++; b |= x; *op++ = x; + int x = ((int)(*ip)-(int)start); + x = (x << 1) ^ (x >> 31); + start = *ip++; + b |= x; + *op++ = x; } #else - typeof(in[0]) b = 0,*op = out; int _x; BITZIGZAG(in, n, start, b |= (unsigned)_x; *op++ = 
_x); + typeof(in[0]) b = 0, *op = out; + int _x; + BITZIGZAG(in, n, start, b |= (unsigned)_x; *op++ = _x); #endif return bsr32(b); } @@ -252,61 +276,81 @@ void bitunzigzag32(unsigned *p, unsigned n, unsigned start) { #ifdef __SSE2__ __m128i sv = _mm_set1_epi32(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128(); unsigned *ip; - for(ip = p; ip != p+(n&~(4-1)); ) { + for(ip = p; ip != p+(n&~(4-1)); ip += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); iv = UNZIGZAG128_32(iv); SCAN128_32(iv, sv); _mm_storeu_si128((__m128i *)ip, sv); - ip += 4; } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); while(ip != p+n) { - unsigned z = *ip; *ip = (start += (z >> 1 ^ -(z & 1))); ip++; + unsigned z = *ip; + *ip++ = (start += (z >> 1 ^ -(z & 1))); } #else BITUNZIGZAG(p, n, start); #endif } -unsigned bitzigzag64(unsigned *in, unsigned n, unsigned *out, unsigned start) { - typeof(in[0]) b = 0,*op = out; long long _x; BITZIGZAG(in, n, start, b |= (unsigned long long)_x; *op++ = _x); +unsigned bitzigzag64(uint64_t *in, unsigned n, uint64_t *out, unsigned start) { + typeof(in[0]) b = 0,*op = out; + long long _x; + BITZIGZAG(in, n, start, b |= (unsigned long long)_x; *op++ = _x); return bsr32(b); } -void bitunzigzag64(unsigned *p, unsigned n, unsigned start) { +void bitunzigzag64(uint64_t *p, unsigned n, unsigned start) { BITUNZIGZAG(p, n, start); } //------------------- De-/Compose Floating Point ----------------------------------------- -void bitdouble(double *in, unsigned n, unsigned *sgn, unsigned *expo, uint64_t *mant) { +void bitdouble(double *in, unsigned n, int *expo, uint64_t *mant) { double *ip; - uint64_t u; for(ip = in; ip < in+n; ip++) { - u = *(uint64_t *)ip; BITFLOAT(u, *sgn++, *expo++, *mant++, DMANT_BITS, 1ull); + uint64_t u = *(uint64_t *)ip; + *expo++ = FLTEXPO(u, DMANT_BITS, 1ull); + *mant++ = FLTMANT(u, DMANT_BITS, 1ull); } } -void bitundouble(unsigned *sgn, unsigned *expo, uint64_t *mant, unsigned n, double *out) { +void bitundouble(int 
*expo, uint64_t *mant, unsigned n, double *out) { double *op; uint64_t u; - for(op = out; op < out+n; op++) { - BITUNFLOAT((uint64_t)(*sgn++), (uint64_t)(*expo++), *mant++, u, DMANT_BITS); *op = *(double *)&u; + for(op = out; op < out+n; ) { + BITUNFLOAT( (int64_t)(*expo++), *mant++, u, DMANT_BITS); *op++ = *(double *)&u; } } -void bitfloat(float *in, unsigned n, unsigned *sgn, unsigned *expo, unsigned *mant) { - float *ip; - unsigned u; +void bitzdouble(double *in, unsigned n, int *expo, uint64_t *mant) { + double *ip; for(ip = in; ip < in+n; ip++) { - u = *(unsigned *)ip; BITFLOAT(u, *sgn++, *expo++, *mant++, FMANT_BITS, 1u); + uint64_t u = *(uint64_t *)ip; + *expo++ = zigzagenc32((int)FLTEXPO(u, DZMANT_BITS, 1ull)-1023); + *mant++ = FLTMANT(u, DZMANT_BITS, 1ull); } } -void bitunfloat(unsigned *sgn, unsigned *expo, unsigned *mant, unsigned n, float *out) { +void bitzundouble(int *expo, uint64_t *mant, unsigned n, double *out) { + double *op; + uint64_t u; + for(op = out; op < out+n; ) { + BITUNFLOAT( (int64_t)zigzagdec32(*expo++)+1023, *mant++, u, DZMANT_BITS); *op++ = *(double *)&u; + } +} + +void bitfloat(float *in, unsigned n, int *expo, unsigned *mant) { + float *ip; + for(ip = in; ip < in+n; ip++) { + unsigned u = *(unsigned *)ip; + *expo++ = FLTEXPO(u, FMANT_BITS, 1u); + *mant++ = FLTMANT(u, FMANT_BITS, 1u); + } +} + +void bitunfloat(int *expo, unsigned *mant, unsigned n, float *out) { float *op; unsigned u; for(op = out; op < out+n; op++) { - BITUNFLOAT((*sgn++), (*expo++), *mant++, u, FMANT_BITS); *op = *(float *)&u; + BITUNFLOAT( (*expo++), *mant++, u, FMANT_BITS); *op = *(float *)&u; } } - diff --git a/bitutil.h b/bitutil.h index 4cccece..ae4e667 100644 --- a/bitutil.h +++ b/bitutil.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2016 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -24,89 +24,103 @@ // bitutil.h - "Integer Compression" #include -#define _BITFORZERO(out, n, 
start, inc) do {\ - for(i = 0; i != (n&~3); ) {\ - out[i] = start+i*inc; i++;\ - out[i] = start+i*inc; i++;\ - out[i] = start+i*inc; i++;\ - out[i] = start+i*inc; i++;\ - }\ - while(i < n) out[i] = start+i*inc,++i;\ +#define _BITFORZERO(_out_, _n_, _start_, _inc_) do { unsigned _i;\ + for(_i = 0; _i != (_n_&~3); ) {\ + _out_[_i] = _start_+_i*_inc_; _i++;\ + _out_[_i] = _start_+_i*_inc_; _i++;\ + _out_[_i] = _start_+_i*_inc_; _i++;\ + _out_[_i] = _start_+_i*_inc_; _i++;\ + }\ + while(_i != _n_)\ + _out_[_i] = _start_+_i*_inc_, ++_i;\ } while(0) -#define BITSIZE(__in, __n, __b, __usize) { typeof(__in[0]) *_ip;\ - for(__b=0,_ip = __in; _ip != __in+(__n&~(4-1)); )\ - __b |= *_ip++ | *_ip++ | *_ip++ | *_ip++;\ - while(_ip != __in+__n) __b |= *_ip++;\ - __b = TEMPLATE(bsr, __usize)(__b);\ +#define BITSIZE(_in_, _n_, _b_, _usize_) { typeof(_in_[0]) *_ip;\ + for(_b_=0,_ip = _in_; _ip != _in_+(_n_&~(4-1)); )\ + _b_ |= *_ip++ | *_ip++ | *_ip++ | *_ip++;\ + while(_ip != _in_+_n_) \ + _b_ |= *_ip++;\ + _b_ = TEMPLATE(bsr, _usize_)(_b_);\ } -static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; } -static inline unsigned zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); } +static inline unsigned zigzagenc31(int x) { x = (x << 2 | ((x>>30)& 2)) ^ x >> 31; return x; } +static inline unsigned zigzagdec31(unsigned x) { return (x >> 2 | (x& 2)<<30 ) ^ -(x & 1); } + +static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; } +static inline unsigned zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); } + +static inline uint64_t zigzagenc64(int64_t x) { return x << 1 ^ x >> 63; } +static inline uint64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); } #ifdef __SSE2__ #include +// SIMD Delta +#define DELTA128_32(_v_, _sv_) _mm_sub_epi32(_v_, _mm_or_si128(_mm_srli_si128(_sv_, 12), _mm_slli_si128(_v_, 4))) -#define DELTA128_32(__v, __sv) _mm_sub_epi32(__v, _mm_or_si128(_mm_srli_si128(__sv, 12), _mm_slli_si128(__v, 4))) +// SIMD Scan ( prefix sum ) +#define 
SCAN128_32( _v_, _sv_) _v_ = _mm_add_epi32(_v_, _mm_slli_si128(_v_, 4)); _sv_ = _mm_add_epi32(_mm_shuffle_epi32(_sv_, _MM_SHUFFLE(3, 3, 3, 3)), _mm_add_epi32(_mm_slli_si128(_v_, 8), _v_) ) +#define SCANI128_32(_v_, _sv_, _vi_) SCAN128_32(_v_, _sv_); _sv_ = _mm_add_epi32(_sv_, _vi_) -#define SCAN128_32( __v, __sv) __v = _mm_add_epi32(__v, _mm_slli_si128(__v, 4)); __sv = _mm_add_epi32(_mm_shuffle_epi32(__sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm_add_epi32(_mm_slli_si128(__v, 8), __v) ) -#define SCANI128_32(__v, __sv, __vi) SCAN128_32(__v, __sv); __sv = _mm_add_epi32(__sv, __vi) +// SIMD ZigZag +#define ZIGZAG128_32(_v_) _mm_xor_si128(_mm_slli_epi32(_v_,1), _mm_srai_epi32(_v_,31)) +#define UNZIGZAG128_32(_v_) _mm_xor_si128(_mm_srli_epi32(_v_,1), _mm_srai_epi32(_mm_slli_epi32(_v_,31),31) ) //_mm_sub_epi32(cz, _mm_and_si128(iv,c1)) -#define ZIGZAG128_32(__v) _mm_xor_si128(_mm_slli_epi32(__v,1), _mm_srai_epi32(__v,31)) -#define UNZIGZAG128_32(__v) _mm_xor_si128(_mm_srli_epi32(__v,1), _mm_srai_epi32(_mm_slli_epi32(__v,31),31) ) //_mm_sub_epi32(cz, _mm_and_si128(iv,c1)) // SIMD Horizontal OR -#define HOR128_32(__v,__b) __v = _mm_or_si128(__v, _mm_srli_si128(__v, 8)); __v = _mm_or_si128(__v, _mm_srli_si128(__v, 4)); __b = (unsigned)_mm_cvtsi128_si32(__v) +#define HOR128_32(_v_,_b_) _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 8)); _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 4)); _b_ = (unsigned)_mm_cvtsi128_si32(_v_) -#define BITSIZE32(__in, __n, __b) { typeof(__in[0]) *_ip; __m128i v = _mm_setzero_si128();\ - for(_ip = __in; _ip != __in+(__n&~(4-1)); _ip+=4) v = _mm_or_si128(v, _mm_loadu_si128((__m128i*)_ip));\ - HOR128_32(v,__b);\ - while(_ip != __in+__n) __b |= *_ip++;\ - __b = bsr32(__b);\ +#define BITSIZE32(_in_, _n_, _b_) { typeof(_in_[0]) *_ip; __m128i _v = _mm_setzero_si128();\ + for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip+=4)\ + _v = _mm_or_si128(_v, _mm_loadu_si128((__m128i*)_ip));\ + HOR128_32(_v,_b_);\ + while(_ip != _in_+_n_)\ + _b_ |= *_ip++;\ + _b_ = bsr32(_b_);\ } 
- -#define BITZERO32(out, n, start) do {\ - __m128i sv = _mm_set1_epi32(start), *ov = (__m128i *)(out), *ove = (__m128i *)(out + n);\ - do { _mm_storeu_si128(ov++, sv); } while(ov < ove); \ +// SIMD set value +#define BITZERO32(_out_, _n_, _start_) do {\ + __m128i _sv_ = _mm_set1_epi32(_start_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\ + do _mm_storeu_si128(_ov++, _sv_); while(_ov < _ove); \ } while(0) -#define BITFORZERO32(out, n, start, inc) do {\ - __m128i sv = _mm_set1_epi32(start), *ov=(__m128i *)(out), *ove = (__m128i *)(out + n), cv = _mm_set_epi32(3*inc,2*inc,1*inc,0); \ - sv = _mm_add_epi32(sv, cv);\ - cv = _mm_set1_epi32(4);\ - do { _mm_storeu_si128(ov++, sv); sv = _mm_add_epi32(sv, cv); } while(ov < ove);\ +#define BITFORZERO32(_out_, _n_, _start_, _inc_) do {\ + __m128i _sv = _mm_set1_epi32(_start_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_), _cv = _mm_set_epi32(3*_inc_,2*_inc_,1*_inc_,0); \ + _sv = _mm_add_epi32(_sv, _cv);\ + _cv = _mm_set1_epi32(4);\ + do { _mm_storeu_si128(_ov++, _sv); _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\ } while(0) -#define BITDIZERO32(out, n, start, inc) do { __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(3+inc,2+inc,1+inc,inc), *ov=(__m128i *)(out), *ove = (__m128i *)(out + n);\ - sv = _mm_add_epi32(sv, cv); cv = _mm_set1_epi32(4*inc); do { _mm_storeu_si128(ov++, sv), sv = _mm_add_epi32(sv, cv); } while(ov < ove);\ +#define BITDIZERO32(_out_, _n_, _start_, _inc_) do { __m128i _sv = _mm_set1_epi32(_start_), _cv = _mm_set_epi32(3+_inc_,2+_inc_,1+_inc_,_inc_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\ + _sv = _mm_add_epi32(_sv, _cv); _cv = _mm_set1_epi32(4*_inc_); do { _mm_storeu_si128(_ov++, _sv), _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\ } while(0) #else -#define BITSIZE32(__in, __n, __b) BITSIZE(__in, __n, __b, 32) -#define BITFORZERO32(out, n, start, inc) _BITFORZERO(out, n, start, inc) -#define BITZERO32(out, n, start) _BITFORZERO(out, n, 
start, 0) +#define BITSIZE32(_in_, _n_, _b_) BITSIZE(_in_, _n_, _b_, 32) +#define BITFORZERO32(_out_, _n_, _start_, _inc_) _BITFORZERO(_out_, _n_, _start_, _inc_) +#define BITZERO32(_out_, _n_, _start_) _BITFORZERO(_out_, _n_, _start_, 0) #endif - -#define DELTR( __in, __n, __mode, __out) { unsigned _v; for( __out[0]=__in[0],_v = 1; _v < __n; _v++) __out[_v] = (__in[_v] - __out[0]) - _v*__mode; } -#define DELTRB(__in, __n, __mode, __b, __out) { unsigned _v; for(__b=0,__out[0]=__in[0],_v = 1; _v < __n; _v++) __out[_v] = (__in[_v] - __out[0]) - _v*__mode, __b |= __out[_v]; __b = bsr32(__b); } +#define DELTR( _in_, _n_, _mode_, _out_) { unsigned _v; for( _out_[0]=_in_[0],_v = 1; _v < _n_; _v++) _out_[_v] = (_in_[_v] - _out_[0]) - _v*_mode_; } +#define DELTRB(_in_, _n_, _mode_, _b_, _out_) { unsigned _v; for(_b_=0,_out_[0]=_in_[0],_v = 1; _v < _n_; _v++) _out_[_v] = (_in_[_v] - _out_[0]) - _v*_mode_, _b_ |= _out_[_v]; _b_ = bsr32(_b_); } #ifdef __cplusplus extern "C" { #endif -// get maximum bit length of the elements in the integer array +//------------- get maximum bit length of the elements in the integer array ----------------------- unsigned bit32( unsigned *in, unsigned n); -// transform sorted integer array to delta array. inc = increment +//------------- Delta for sorted integer array ---------------------------------------------------- +//-- transform sorted integer array to delta array. inc = increment: out[i] = in[i] - in[i-1] - inc unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start, unsigned inc); unsigned bitdelta64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, unsigned inc); -// get delta maximum bit length of the non decreasing integer array +//-- get delta maximum bit length of the non decreasing integer array. 
out[i] = in[i] - in[i-1] unsigned bitd32( unsigned *in, unsigned n, unsigned start); -// get delta maximum bit length of the non strictly decreasing integer array +//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1 unsigned bitd132( unsigned *in, unsigned n, unsigned start); +//-- in-place reverse delta transform void bitund32( unsigned *p, unsigned n, unsigned x); void bitund64( uint64_t *p, unsigned n, uint64_t x); @@ -115,32 +129,47 @@ void bitundx64( uint64_t *p, unsigned n, uint64_t x, unsigned inc); void bitund132( unsigned *p, unsigned n, unsigned x); -// for +//------------- FOR array bit length: out[i] = in[i] - start ------------------------------------- + unsigned bitf32( unsigned *in, unsigned n, unsigned start); // sorted unsigned bitf132( unsigned *in, unsigned n, unsigned start); unsigned bitfm32( unsigned *in, unsigned n, unsigned *pmin); // unsorted unsigned bitf1m32( unsigned *in, unsigned n, unsigned *pmin); -// zigzag encoding for unsorted integer lists +//------------- Zigzag encoding for unsorted integer lists: out[i] = in[i] - in[i-1] ------------- + +//-- get maximum zigzag bit length integer array unsigned bitz32( unsigned *in, unsigned n, unsigned start); + +//-- Zigzag transform unsigned bitzigzag32(unsigned *in, unsigned n, unsigned *out, unsigned start); -unsigned bitzigzag64(unsigned *in, unsigned n, unsigned *out, unsigned start); +unsigned bitzigzag64(uint64_t *in, unsigned n, uint64_t *out, unsigned start); + +//-- Zigzag reverse transform void bitunzigzag32( unsigned *p, unsigned n, unsigned start); -void bitunzigzag64( unsigned *p, unsigned n, unsigned start); +void bitunzigzag64( uint64_t *p, unsigned n, unsigned start); //---- Floating point to Integer de-/composition --------------------------------- +#define FMANT_BITS 16 +#define DMANT_BITS 32 +#define DZMANT_BITS 36 -#define FMANT_BITS 23 -#define DMANT_BITS 52 -#define BITFLOAT(__u, __sgn, __expo, __mant, __mantbits, 
__one) __sgn = __u >> (sizeof(__u)*8-1); __expo = ((__u >> (__mantbits)) & ( (__one<<(sizeof(__u)*8 - 1 - __mantbits)) -1)); __mant = __u & ((__one<<__mantbits)-1); -#define BITUNFLOAT( __sgn, __expo, __mant, __u, __mantbits) __u = (__sgn) << (sizeof(__u)*8-1) | (__expo) << __mantbits | (__mant) +#define FLTEXPO(__u,__mantbits, __one) ( ((__u) >> __mantbits) & ( (__one<<(sizeof(__u)*8 - __mantbits)) - 1 ) ) +#define FLTMANT(__u,__mantbits, __one) ((__u) & ((__one<<__mantbits)-1)) + +#define BITUNFLOAT(__expo, __mant, __u, __mantbits) __u = ((__expo) << __mantbits) | (__mant)//>>1 | (__mant)<<(sizeof(__u)*8 - 1) + +/*#define BITFLOAT(__u, __sgn, __expo, __mant, __mantbits, __one) __sgn = __u >> (sizeof(__u)*8-1); __expo = EXPO(__u,__mantbits; __mant = __u & ((__one<<__mantbits)-1) +#define BITUNFLOAT( __sgn, __expo, __mant, __u, __mantbits) __u = (__sgn) << (sizeof(__u)*8-1) | (__expo) << __mantbits | (__mant) */ // De-/Compose floating point array to/from integer arrays (sign,exponent,mantissa) for using with "Integer Compression" functions ------------ -void bitdouble( double *in, unsigned n, unsigned *sgn, unsigned *expo, uint64_t *mant); -void bitundouble( unsigned *sgn, unsigned *expo, uint64_t *mant, unsigned n, double *out); -void bitfloat( float *in, unsigned n, unsigned *sgn, unsigned *expo, unsigned *mant); -void bitunfloat( unsigned *sgn, unsigned *expo, unsigned *mant, unsigned n, float *out); +void bitdouble( double *in, unsigned n, int *expo, uint64_t *mant); +void bitundouble( int *expo, uint64_t *mant, unsigned n, double *out); +void bitzdouble( double *in, unsigned n, int *expo, uint64_t *mant); +void bitzundouble( int *expo, uint64_t *mant, unsigned n, double *out); +void bitfloat( float *in, unsigned n, int *expo, unsigned *mant); +void bitunfloat( int *expo, unsigned *mant, unsigned n, float *out); #ifdef __cplusplus } diff --git a/conf.h b/conf.h index 73f7cd9..27c7a96 100644 --- a/conf.h +++ b/conf.h @@ -38,13 +38,19 @@ #define popcnt64(_x_) 
__builtin_popcountll(_x_) #if defined(__i386__) || defined(__x86_64__) -static inline int __bsr32(int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; } -static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; } -static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } -#define bsr16(_x_) bsr32(_x_) +static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; } +static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; } +static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } +#define bsr16(_x_) bsr32(_x_) + +static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } +static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } + #else -static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; } -static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } +static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; } +static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } +static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); } +static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); } #endif #define ctz64(_x_) __builtin_ctzll(_x_) @@ -65,6 +71,8 @@ static inline int bsr64(unsigned long long x) { unsigned long z = 0; _BitScanFor static inline int ctz64(unsigned long long x) { unsigned long z = 0; _BitScanForward64(&z, x); return z; } #endif static inline int ctz32(unsigned x) { unsigned z = 0; _BitScanForward(&z, x); return z; } +#define rol32(x,s) _lrotl(x, s) +#define ror32(x,s) _lrotr(x, s) #define fseeko _fseeki64 #define ftello _ftelli64 #define sleep(x) Sleep(x/1000) diff --git a/icbench.c b/icbench.c index a4ac427..3e8e09f 100644 --- a/icbench.c +++ 
b/icbench.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2016 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -1080,8 +1080,8 @@ int main(int argc, char *argv[]) { int r; uint64_t *mantissa = malloc(n*sizeof(mantissa[0])); unsigned *sign = malloc(n*sizeof(sign[0])); unsigned *exp = malloc(n*sizeof(exp[0])); if(!mantissa || !exp || !sign || !dcpy) die("alloc error\n"); - bitdouble( din, n, sign, exp, mantissa); - bitundouble( sign, exp, mantissa, n, dcpy); + bitdouble( din, n, exp, mantissa); + bitundouble( exp, mantissa, n, dcpy); int i; for(i=0;i < n; i++) { printf("%d,%d,%llu,%e,%e\n", sign[i], exp[i],(long long unsigned int)mantissa[i], din[i], dcpy[i]); if(din[i]!=dcpy[i]) die("check error at %d %e %e\n", i, din[i], dcpy[i]); } free(din); free(mantissa); free(exp); free(sign); free(dcpy); exit(0); diff --git a/vint.c b/vint.c index 55698d2..2737426 100644 --- a/vint.c +++ b/vint.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2016 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -22,67 +22,45 @@ - email : powturbo [_AT_] gmail [_DOT_] com **/ // vint.c - "Integer Compression" variable byte -#include +#include #include "conf.h" #include "vint.h" #include "bitutil.h" -#define _vbputu32(__op, __x, __act) {\ - if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\ - else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\ - else if(likely(__x < (1<<21))) { *(unsigned *)__op = __x << 3 | 0x03; __op += 3; __act;}\ - else if(likely(__x < (1<<28))) { *(unsigned *)__op = __x << 4 | 0x07; __op += 4; __act;}\ - else { *(unsigned *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\ -} - -#define _vbgetu32(__ip, __x, __act) do {\ - if(!((__x = *__ip) & (1<<0))) { __ip++; __x >>= 1; __act;}\ - else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; 
__act;}\ - else if(!(__x & (1<<2))) { __x = (*(unsigned *)__ip & 0xffffffu) >> 3; __ip += 3; __act;}\ - else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\ - else { __x = (unsigned long long)(*(unsigned *)__ip) >> 4 | (unsigned long long)(__ip[4]) << 28; __ip += 5; __act;}\ -} while(0) - -#define vbputu32(__op, __x) { unsigned _x_ = __x; _vbputu32(__op, _x_, ;); } - //-------------------------------------- variable byte : 32 bits ---------------------------------------------------------------- - #if defined(__AVX2__) && defined(__AVX2__VINT) -#include -#define M1 0xfeull //7 -#define M2 0xfffcull //14 -#define M3 0xfffff8ull //21 -#define M4 0xfffffff0ull //28 -#define M5 0xfffffffff0ull //36 - - //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 -unsigned long long mtab[] = { M1, M2, M1, M3, M1, M2, M1, M4, M1, M2, M1, M3, M1, M2, M1, M5 }; - #endif -//------------------------------------------------------------------------------------------------------------------------ //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 -unsigned char vtab[] = { 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5 }; +unsigned char vtab[] = { 1, 1, 1, 1, 1, 1, 1, 1, 5, 4, 3, 3, 2, 2, 2, 2 }; // decompress buffer into an array of n unsigned values. 
Return value = end of decompressed buffer in -unsigned char *vbdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out) { unsigned x,*op; - for(op = out; op != out+(n&~(4-1)); op += 4) { - _vbgetu32(in, x, op[0] = x); - _vbgetu32(in, x, op[1] = x); - _vbgetu32(in, x, op[2] = x); - _vbgetu32(in, x, op[3] = x); +unsigned char *vbdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out) { register unsigned x, *op; + for(op = out; op != out+(n&~(8-1)); op += 8) { + _vbget32(in, x, op[0] = x); + _vbget32(in, x, op[1] = x); + _vbget32(in, x, op[2] = x); + _vbget32(in, x, op[3] = x); __builtin_prefetch(in+256, 0); + _vbget32(in, x, op[4] = x); + _vbget32(in, x, op[5] = x); + _vbget32(in, x, op[6] = x); + _vbget32(in, x, op[7] = x); } - while(op != out+n) { _vbgetu32(in, x, ; ); *op++ = x; } + while(op != out+n) _vbget32(in, x, *op++ = x ); return in; } // encode array with n unsigned (32 bits in[n]) values to the buffer out. Return value = end of compressed buffer out -unsigned char *vbenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out) { unsigned *ip; - for(ip = in; ip != in+(n&~(4-1)); ) { - vbputu32(out, *ip++); - vbputu32(out, *ip++); - vbputu32(out, *ip++); - vbputu32(out, *ip++); +unsigned char *vbenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out) { register unsigned x, *ip; + for(ip = in; ip != in+(n&~(8-1)); ip += 8) { __builtin_prefetch(ip+128, 0); + x = ip[0]; _vbput32(out, x, ;); + x = ip[1]; _vbput32(out, x, ;); + x = ip[2]; _vbput32(out, x, ;); + x = ip[3]; _vbput32(out, x, ;); + x = ip[4]; _vbput32(out, x, ;); + x = ip[5]; _vbput32(out, x, ;); + x = ip[6]; _vbput32(out, x, ;); + x = ip[7]; _vbput32(out, x, ;); } - while(ip != in+n) vbputu32(out, *ip++); + while(ip != in+n) { x = *ip++; _vbput32(out, x, ;); } return out; } @@ -113,28 +91,28 @@ unsigned char *vbenc64(uint64_t *__restrict in, unsigned n, unsigned char *__res unsigned char *vbdenc32(unsigned *__restrict in, unsigned n, 
unsigned char *__restrict out, unsigned start) { unsigned *ip,v; for(ip = in; ip != in+(n&~(4-1)); ) { - v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;); - v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;); - v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;); - v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;); + v = (*ip)-start; start=*ip++; _vbput32(out, v, ;); + v = (*ip)-start; start=*ip++; _vbput32(out, v, ;); + v = (*ip)-start; start=*ip++; _vbput32(out, v, ;); + v = (*ip)-start; start=*ip++; _vbput32(out, v, ;); } - while(ip < in+n) { v = (*ip)-start; start = *ip++; _vbputu32(out, v, ;); } + while(ip < in+n) { v = (*ip)-start; start = *ip++; _vbput32(out, v, ;); } return out; } unsigned char *vbddec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) { unsigned x,*op; for(op = out; op != out+(n&~(8-1)); ) { - _vbgetu32(in, x, ;); *op++ = (start += x); - _vbgetu32(in, x, ;); *op++ = (start += x); - _vbgetu32(in, x, ;); *op++ = (start += x); - _vbgetu32(in, x, ;); *op++ = (start += x); - _vbgetu32(in, x, ;); *op++ = (start += x); - _vbgetu32(in, x, ;); *op++ = (start += x); - _vbgetu32(in, x, ;); *op++ = (start += x); - _vbgetu32(in, x, ;); *op++ = (start += x); + _vbget32(in, x, ;); *op++ = (start += x); + _vbget32(in, x, ;); *op++ = (start += x); + _vbget32(in, x, ;); *op++ = (start += x); + _vbget32(in, x, ;); *op++ = (start += x); + _vbget32(in, x, ;); *op++ = (start += x); + _vbget32(in, x, ;); *op++ = (start += x); + _vbget32(in, x, ;); *op++ = (start += x); + _vbget32(in, x, ;); *op++ = (start += x); } - while(op != out+n) _vbgetu32(in, x, *op++ = (start += x)); + while(op != out+n) _vbget32(in, x, *op++ = (start += x)); return in; } @@ -147,21 +125,21 @@ unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__r v = in[0] - start - 1; unsigned long long u = (unsigned long long)v<<1; if(n == 1) u |= 1; - _vbputu32(op, u, ;); + _vbput32(op, u, ;); if(!--n) return op; start = *in++; #endif 
for(ip = in; ip != in + (n&~(4-1)); ) { - v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v; - v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v; - v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v; - v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v; + v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v; + v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v; + v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v; + v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v; } - while(ip != in+n) { v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v; } + while(ip != in+n) { v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v; } #ifdef VINT_Z if(!b) { u = (unsigned long long)in[-1] << 1 | 1; - _vbputu32(out, u, ;); + _vbput32(out, u, ;); return out; } #endif @@ -171,7 +149,7 @@ unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__r unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) { unsigned x,*op; #ifdef VINT_Z - unsigned long long u; _vbgetu32(in, u, ;); x = u>>1; *out = (start += x+1); + unsigned long long u; _vbget32(in, u, ;); x = u>>1; *out = (start += x+1); if(u & 1) { #ifdef __SSE2__ out++; --n; BITDIZERO32(out, n, start, 1); @@ -184,16 +162,16 @@ unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__r #endif for(op = out; op != out+(n&~(8-1)); ) { - _vbgetu32(in, x, ++x); *op++ = (start += x); - _vbgetu32(in, x, ++x); *op++ = (start += x); - _vbgetu32(in, x, ++x); *op++ = (start += x); - _vbgetu32(in, x, ++x); *op++ = (start += x); - _vbgetu32(in, x, ++x); *op++ = (start += x); - _vbgetu32(in, x, ++x); *op++ = (start += x); - _vbgetu32(in, x, ++x); *op++ = (start += x); - _vbgetu32(in, x, ++x); *op++ = (start += x); + _vbget32(in, x, ++x); *op++ = (start += x); + _vbget32(in, x, ++x); *op++ = (start += x); + _vbget32(in, x, ++x); *op++ = (start += x); + 
_vbget32(in, x, ++x); *op++ = (start += x); + _vbget32(in, x, ++x); *op++ = (start += x); + _vbget32(in, x, ++x); *op++ = (start += x); + _vbget32(in, x, ++x); *op++ = (start += x); + _vbget32(in, x, ++x); *op++ = (start += x); } - while(op != out+n) { _vbgetu32(in, x, ++x); *op++ = (start += x); } + while(op != out+n) { _vbget32(in, x, ++x); *op++ = (start += x); } return in; } @@ -208,27 +186,51 @@ unsigned char *vbdec16(unsigned char *__restrict in, unsigned n, unsigned short unsigned char *vbzenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start) { unsigned *ip,v; for(ip = in; ip != in+(n&~(4-1)); ) { - v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;); - v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;); - v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;); - v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;); + v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;); + v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;); + v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;); + v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;); } - while(ip < in+n) { v = zigzagenc32((*ip)-start); start = *ip++; _vbputu32(out, v, ;); } + while(ip < in+n) { v = zigzagenc32((*ip)-start); start = *ip++; _vbput32(out, v, ;); } return out; } unsigned char *vbzdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) { unsigned x,*op; for(op = out; op != out+(n&~(8-1)); ) { - _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x)); - _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x)); - _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x)); - _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x)); - _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x)); - _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x)); - _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x)); - _vbgetu32(in, x, ;); *op++ = (start += 
zigzagdec32(x)); + _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x)); + _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x)); + _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x)); + _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x)); + _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x)); + _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x)); + _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x)); + _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x)); } - while(op != out+n) _vbgetu32(in, x, *op++ = (start += zigzagdec32(x))); + while(op != out+n) _vbget32(in, x, *op++ = (start += zigzagdec32(x))); + return in; +} + +unsigned char *vbzenc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start) { + uint64_t *ip,v; + for(ip = in; ip != in+(n&~(4-1)); ) { + v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;); + v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;); + v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;); + v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;); + } + while(ip < in+n) { v = zigzagenc64((*ip)-start); start = *ip++; _vbput64(out, v, ;); } + return out; +} + +unsigned char *vbzdec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start) { + uint64_t x,*op; + for(op = out; op != out+(n&~(4-1)); ) { + _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x)); + _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x)); + _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x)); + _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x)); + } + while(op != out+n) _vbget64(in, x, *op++ = (start += zigzagdec64(x))); return in; } diff --git a/vint.h b/vint.h index b8f47e6..ec9426c 100644 --- a/vint.h +++ b/vint.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2016 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -31,39 +31,31 @@ extern "C" { #endif -//--------- 32 bits 
------------------ +//--------------------------- 32 bits --------------------------------------------------------------------------------------- extern unsigned char vtab[]; -#define vbvlen32(__x) vtab[(__x)&0xf] +#define vbvlen32(__x) vtab[((unsigned char)(__x))>>4] #define _vbput32(__op, __x, __act) {\ - if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\ - else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\ - else if(likely(__x < (1<<21))) { *(unsigned short *)__op = __x << 3 | 0x03; __op += 2; *__op++ = __x >> 13; __act;}\ - else if(likely(__x < (1<<28))) { *(unsigned *)__op = __x << 4 | 0x07; __op += 4; __act;}\ - else { *(unsigned *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\ + if(likely(__x < (1<< 7))) { *__op++ = __x; __act;}\ + else if(likely(__x < (1<<14))) { ctou16(__op) = __x << 8 | __x >> 8 | 0x80; __op += 2; __act;}\ + else if(likely(__x < (1<<21))) { *__op++ = __x >> 16 | 0xc0; ctou32(__op) = __x; __op += 2; __act;}\ + else if(likely(__x < (1<<28))) { ctou32(__op) = rol32(__x,8) | 0xe0; __op += 4; __act;}\ + else { *__op++ = (unsigned long long)__x >> 32 | 0xf0; ctou32(__op) = __x; __op += 4; __act;}\ } -//#define __AVX2__VINT - #if defined(__AVX2__) && defined(__AVX2__VINT) -#include - -extern unsigned long long mtab[]; - -#define _vbget32(__ip, __x, __act) do { unsigned _vdx=(*__ip)&0xf; __x = _pext_u64(*(unsigned long long *)__ip, mtab[_vdx]); __ip+=vtab[_vdx]; __act; } while(0) - #else -#define _vbget32(__ip, __x, __act) do {\ - if(!((__x = *__ip) & (1<<0))) { __ip++; __x >>= 1; __act;}\ - else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\ - else if(!(__x & (1<<2))) { __x = (*(unsigned short *)__ip) >> 3 | (unsigned)(*(__ip+2)) << 13; __ip += 3; __act;}\ - else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\ - else { __x = (unsigned long long)(*(unsigned *)__ip) >> 4 | (unsigned long long)(__ip[4]) << 28; __ip += 5; 
__act;}\ +#define _vbget32(__ip, __x, __act) do { __x = *__ip++;\ + if(!(__x & 0x80)) { __act;}\ + else if(!(__x & 0x40)) { __x = (__x & 0x3f)<< 8 | *__ip++; __act;}\ + else if(!(__x & 0x20)) { __x = (__x & 0x1f)<<16 | ctou16(__ip); __ip += 2; __act;}\ + else if(!(__x & 0x10)) { __x = ror32(ctou32(__ip-1),8) & 0xfffffff; __ip += 3; __act;}\ + else { __x = (unsigned long long)(__x & 0x07)<<32 | ctou32(__ip); __ip += 4; __act;}\ } while(0) - #endif -//----------------- 16 bits -------------------------- +//----------------- 16 bits ------------------------------------------------------------------------------------------------------- #define _vbput16(__op, __x) _vbput32(__op, __x) #define _vbget16(__ip, __x, __act) _vbget32(__ip, __x, __act) -//----------------- 64 bits -------------------------- + +//----------------- 64 bits ------------------------------------------------------------------------------------------------------- #define _vbput64(__op, __x, __act) {\ if(__x < 1 << 7) { *__op++ = __x << 1; __act;}\ else if(__x < 1 <<14) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\ @@ -96,8 +88,8 @@ extern unsigned long long mtab[]; #define vbput16(__op, __x) vbput32(__op, __x) #define vbget16(__ip) vbget32(__ip) -#define vbput32(__op, __x) { unsigned _x_ = __x; _vbput32(__op, _x_, ;); } -#define vbget32(__ip) ({ unsigned _x_; _vbget32(__ip, _x_, ;); _x_; }) +#define vbput32(__op, __x) { register unsigned _x_ = __x; _vbput32(__op, _x_, ;); } +#define vbget32(__ip) ({ register unsigned _x_; _vbget32(__ip, _x_, ;); _x_; }) #define vbput64(__op, __x) { unsigned long long _x_ = __x; _vbput64(__op, _x_, ;); } #define vbget64(__ip) ({ unsigned long long _x_; _vbget64(__ip, _x_, ;); _x_; }) @@ -122,6 +114,8 @@ unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned //------ zigzag encoding integer lists ------------------------------------------------------------- unsigned char *vbzenc32(unsigned *__restrict in, unsigned n, unsigned char 
*__restrict out, unsigned start); unsigned char *vbzdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *vbzenc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); +unsigned char *vbzdec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); //--- 15 bits integer lists ------------ #define vbput15(__op, __x) do { unsigned _x = __x; if(likely(_x < 0x80)) *__op++ = _x; else { *__op++ = (_x) >> 8 | 0x80; *__op++ = _x; } } while(0)