.

2016-04-17 15:56:51 +02:00
parent 62fb4b0115
commit ab6f60b2d8
9 changed files with 337 additions and 259 deletions
--- a/README.md
+++ b/README.md
@ -26,6 +26,7 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/po
 <p>
 + **Variable byte**
 - :sparkles: Scalar **"Variable Byte"** faster and more efficient than **ANY** other (incl. SIMD MaskedVByte) implementation
+ - :new: **now up to 25% faster**
 <p>
 + **Simple family**
 - :sparkles: **Novel** **"Variable Simple"** (incl. **RLE**) faster and more efficient than simple16, simple-8b
@ -71,7 +72,7 @@ CPU: Sandy bridge i7-2600k at 4.2GHz, gcc 5.1, ubuntu 15.04, single thread.
 | 99.910.930| 24.98| 7.99| 2524.50|1943.41|[SIMDPack FPF](#FastPFor)|
 | 99.910.930| 24.98| 7.99| 1883.21|1898.11|**TurboPack**|
 | 99.910.930| 24.98| 7.99| 1877.25| 935.83|**TurboForDA**|
-|102.074.663| 25.52| 8.17| 1621.64|1694.64|**TurboVbyte**|
+|102.074.663| 25.52| 8.17| 1993.95|1827.04|**TurboVbyte**|
 |102.074.663| 25.52|8.17|1214.12|1688.95|[MaskedVByte](#MaskedVByte)|
 |102.074.663| 25.52| 8.17| 1178.72| 949.59|[Vbyte FPF](#FastPFor)|
 |103.035.930| 25.76| 8.24| 1480.47|1746.51|[libfor](#libfor)|
@ -90,16 +91,16 @@ CPU: Skylake i7-6700 w/ only 3.7GHz
 | 63392801|	15.85|	 5.07|  387.30| 243.62|**TurboPForDA**|
 | 65359916|	16.34|	 5.23|    7.58| 609.12|OptPFD|
 | 73477088|	18.37|	 5.88|  101.68| 621.37|Simple16|
-| 78514276|	19.63|	 6.28|256.83|676.45|**VSimple**|
-| 95915096|	23.98|	 7.67|  211.79|954.62|Simple-8b|
+| 78514276|	19.63|	 6.28|258.31|691.48|**VSimple**|
+| 95915096|	23.98|	 7.67|  211.79|957.62|Simple-8b|
 | 98546814|	24.64|	 7.88|   70.85|**2349.71**|[QMX](#QMX)|
 | 99910930|	24.98|	 7.99|**3537.57**|**3081.79**|**TurboPackV**|
 | 99910930|	24.98|	 7.99| 3099.52|3071.77|SIMDPack FPF|
-| 99910930|	24.98|	 7.99| 2050.47|2402.54|**TurboPack**|
+| 99910930|	24.98|	 7.99| 2095.79|2495.22|**TurboPack**|
 | 99910930|	24.98|	 7.99| 2049.85|2364.52|**TurboFor**|
 | 99910930|	24.98|	 7.99| 2049.70|1124.12|**TurboForDA**|
 |102074663|	25.52|	 8.17| 1354.42|1745.69|MaskedVByte|
-|102074663|	25.52|	 8.17| 1660.76|1626.67|**TurboVbyte**|
+|102074663|	25.52|	 8.17| 1825.64|1844.34|**TurboVbyte**|
 |102074663|	25.52|	 8.17| 1249.77|1051.85|Vbyte FPF|
 |112500000|	28.12|	 9.00|  466.94|3003.70|VarintG8IU|
 |128125000|	32.03|	10.25| 1109.67|1271.32|[StreamVbyte FPF](#FastPFor)|
@ -310,4 +311,4 @@ header files to use with documentation:<br />
   - [On Inverted Index Compression for Search Engine Efficiency](http://www.dcs.gla.ac.uk/~craigm/publications/catena14compression.pdf)
   - [Google's Group Varint Encoding](http://static.googleusercontent.com/media/research.google.com/de//people/jeff/WSDM09-keynote.pdf)

-Last update: 27 MAR 2016
+Last update: 08 APR 2016
--- a/bitunpack.c
+++ b/bitunpack.c
@ -35,17 +35,17 @@
 #define DSTI(__op)
 #define BPI(__w, __x, __parm) __w
 #include __FILE__
-unsigned char *bitunpack32(  const unsigned char *__restrict in, unsigned n, unsigned           *__restrict out				 , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out,    0); return ip; }
-unsigned char *bitunpack16(  const unsigned char *__restrict in, unsigned n, unsigned short     *__restrict out				 , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out,    0); return ip; }
-unsigned char *bitunpack64(  const unsigned char *__restrict in, unsigned n, uint64_t           *__restrict out				 , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK64(in, n, b, out,    0); return ip; }
+unsigned char *bitunpack32(  const unsigned char *__restrict in, unsigned n, unsigned           *__restrict out				 , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out,    0); return (unsigned char *)ip; }
+unsigned char *bitunpack16(  const unsigned char *__restrict in, unsigned n, unsigned short     *__restrict out				 , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out,    0); return (unsigned char *)ip; }
+unsigned char *bitunpack64(  const unsigned char *__restrict in, unsigned n, uint64_t           *__restrict out				 , unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK64(in, n, b, out,    0); return (unsigned char *)ip; }
 #undef BPI
 #undef DSTI
 //-----------------------------------------------------------------------------------------------------------------
 #define DSTI(__op)
 #define BPI(__w, __x, __parm) (__parm += (__w) + 1)
 #include __FILE__
-unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
-unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
+unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
 #undef BPI
 #undef DSTI

@ -53,8 +53,8 @@ unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, uns
 #define DSTI(__op)
 #define BPI(__w, __x, __parm) (__parm += (__w))
 #include __FILE__
-unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
-unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
+unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
 #undef BPI
 #undef DSTI

@ -63,7 +63,7 @@ unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, uns
 #define DSTI(__op)
 #define BPI(__w, __x, __parm) (__parm += zigzagdec32(__w))
 #include __FILE__
-unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
 //unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
 #undef BPI
 #undef DSTI
@ -73,8 +73,8 @@ unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, uns
 #define BPI(__w, __x, __parm) (__parm + (__w))
 #include __FILE__

-unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
-unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
+unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
 #undef BPI
 #undef DSTI

@ -82,8 +82,8 @@ unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, uns
 #define DSTI(__op) start += 32
 #define BPI(__w, __x, __parm) (__parm + (__w)+__x+1)
 #include __FILE__
-unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
-unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
+unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return (unsigned char *)ip; }
 #undef BPI
 #undef DSTI

--- a/bitunpackv.c
+++ b/bitunpackv.c
@ -78,7 +78,7 @@ unsigned char *bitunpackv32( const unsigned char *__restrict in, unsigned n, uns
  const unsigned char *ip = in+PAD8(n*b); 
  __m128i sv; 
  BITUNPACKV32(in, n, b, out, sv); 
-  return ip; 
+  return (unsigned char *)ip; 
 }
 #undef VSTO
 #undef VSTO0
@ -116,7 +116,7 @@ unsigned char *_bitunpackv32( const unsigned char *__restrict in, unsigned n, un
  const unsigned char *ip = in+PAD8(n*b); unsigned m;
  __m128i sv; 
  BITUNPACKV32(in, n, b, out, sv);
-  return ip; 
+  return (unsigned char *)ip; 
 }
 #undef VSTO
 #undef VSTO0
@ -134,7 +134,7 @@ unsigned char *bitzunpackv32( const unsigned char *__restrict in, unsigned n, un
  const unsigned char *ip = in+PAD8(n*b); 
  __m128i sv = _mm_set1_epi32(start); 
  BITUNPACKV32(in, n, b, out, sv); 
-  return ip; 
+  return (unsigned char *)ip; 
 }
 #undef VSTO
 #undef BITUNPACK0
@ -149,7 +149,7 @@ unsigned char *bitdunpackv32( const unsigned char *__restrict in, unsigned n, un
  const unsigned char *ip = in+PAD8(n*b); 
  __m128i sv = _mm_set1_epi32(start);
  BITUNPACKV32(in, n, b, out, sv); 
-  return ip; 
+  return (unsigned char *)ip; 
 }
 #undef VSTO
 #undef VSTO0
@ -171,7 +171,7 @@ unsigned char *_bitdunpackv32( const unsigned char *__restrict in, unsigned n, u
  const unsigned char *ip = in+PAD8(n*b); unsigned m;
  __m128i sv = _mm_set1_epi32(start);
  BITUNPACKV32(in, n, b, out, sv); 
-  return ip; 
+  return (unsigned char *)ip; 
 }
 #undef VSTO
 #undef VSTO0
@ -188,7 +188,7 @@ unsigned char *bitd1unpackv32( const unsigned char *__restrict in, unsigned n, u
  const unsigned char *ip = in+PAD8(n*b);
  __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
  BITUNPACKV32(in, n, b, out, sv); 
-  return ip; 
+  return (unsigned char *)ip; 
 }
 #undef VSTO
 #undef VSTO0
@ -209,7 +209,7 @@ unsigned char *_bitd1unpackv32( const unsigned char *__restrict in, unsigned n,
  const unsigned char *ip = in+PAD8(n*b); unsigned m;
  __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
  BITUNPACKV32(in, n, b, out, sv); 
-  return ip; 
+  return (unsigned char *)ip; 
 }
 #undef VSTO
 #undef VSTO0
--- a/bitutil.c
+++ b/bitutil.c
@ -1,5 +1,5 @@
 /**
-    Copyright (C) powturbo 2013-2015
+    Copyright (C) powturbo 2013-2016
    GPL v2 License
  
    This program is free software; you can redistribute it and/or modify
@ -33,19 +33,22 @@
 	_x = (*_p)-__start-__inc; __start = *_p++; __act;\
 	_x = (*_p)-__start-__inc; __start = *_p++; __act;\
  }\
-  while(_p < __p+__n) { \
+  while(_p != __p+__n) { \
    _x = *_p-__start-__inc; __start = *_p++; __act;\
  }\
 }

-#define BITUNDELTA(__p, __n, __start, __inc) { typeof(__p[0]) *_p;\
+#define BITUNDELTA(__p, __n, __start, __inc) {\
+  typeof(__p[0]) *_p;\
  for(_p = __p; _p != __p+(__n&~(4-1)); ) {\
    *_p = (__start += (*_p) + __inc); _p++;\
    *_p = (__start += (*_p) + __inc); _p++;\
    *_p = (__start += (*_p) + __inc); _p++;\
    *_p = (__start += (*_p) + __inc); _p++;\
  }\
-  while(_p < __p+__n) { *_p = (__start += (*_p) + __inc); _p++; }\
+  while(_p != __p+__n) {\
+    *_p = (__start += (*_p) + __inc); _p++;\
+  }\
 }

 #define BITMINMAX(__p,__n, __mi, __mx) {\
@ -56,7 +59,7 @@
 	if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \
 	if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \
  }\
-  while(_p < __p+__n) { \
+  while(_p != __p+__n) { \
 	if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \
  }\
 }
@ -65,29 +68,36 @@ unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start, uns
    #ifdef __SSE2__
  unsigned *ip,b,*op = out; 
  __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(inc), dv;
-  for(ip = in; ip != in+(n&~(4-1)); ip += 4) { 
+  for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) { 
    __m128i iv = _mm_loadu_si128((__m128i *)ip); 
 	bv = _mm_or_si128(bv, dv = _mm_sub_epi32(DELTA128_32(iv,sv),cv)); 
 	sv = iv; 
 	_mm_storeu_si128((__m128i *)op, dv); 
-	op += 4; 
  }
  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
  HOR128_32(bv, b);
-  while(ip < in+n) { unsigned x = *ip-start-inc; start = *ip++; b |= x; *op++ = x; }
+  while(ip != in+n) { 
+    unsigned x = *ip-start-inc; 
+	start = *ip++;
+	b    |= x; 
+	*op++ = x;
+  }
    #else
-  typeof(in[0]) b = 0,*op = out; BITDELTA(in, n, inc, start, b |= _x;*op++ = _x);
+  typeof(in[0]) b = 0,*op = out;
+  BITDELTA(in, n, inc, start, b |= _x;*op++ = _x);
    #endif
  return bsr32(b);
 }

 unsigned bitdelta64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, unsigned inc) {
-  typeof(in[0]) b = 0,*op = out; BITDELTA(in, n, inc, start, b |= _x; *op++ = _x);
+  typeof(in[0]) b = 0,*op = out;
+  BITDELTA(in, n, inc, start, b |= _x; *op++ = _x);
  return bsr64(b);
 }

 unsigned bit32(unsigned *in, unsigned n) {
-  typeof(in[0]) b; BITSIZE32(in, n, b);
+  typeof(in[0]) b;
+  BITSIZE32(in, n, b);
  return b; 
 }

@ -119,13 +129,14 @@ unsigned bitd32(unsigned *in, unsigned n, unsigned start) {
  
  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
  HOR128_32(bv, b);
-  while(ip < in+n) { 
+  while(ip != in+n) { 
    unsigned x = *ip-start; 
 	start = *ip++; 
 	b |= x; 
  }
    #else
-  typeof(in[0]) b = 0; BITDELTA(in,n, 0, start, b |= _x);
+  typeof(in[0]) b = 0;
+  BITDELTA(in,n, 0, start, b |= _x);
    #endif
  return bsr32(b); 
 }
@ -141,13 +152,14 @@ unsigned bitd132(unsigned *in, unsigned n, unsigned start) {
  
  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
  HOR128_32(bv, b);
-  while(ip < in+n) { 
+  while(ip != in+n) { 
    unsigned x = *ip-start-1; 
 	start = *ip++; 
 	b |= x; 
  }
    #else
-  typeof(in[0]) b = 0; BITDELTA(in, n, 1, start, b |= _x);
+  typeof(in[0]) b = 0; 
+  BITDELTA(in, n, 1, start, b |= _x);
 	#endif
  return bsr32(b); 
 }
@ -159,14 +171,13 @@ void bitund132(unsigned *p, unsigned n, unsigned x) {
    #ifdef __SSE2__
  __m128i sv = _mm_set1_epi32(x), cv = _mm_set_epi32(4,3,2,1);
  unsigned *ip;
-  for(ip = p; ip != p+(n&~(4-1)); ) {
+  for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
    __m128i v =  _mm_loadu_si128((__m128i *)ip); 
 	SCANI128_32(v, sv, cv); 
 	_mm_storeu_si128((__m128i *)ip, sv); 
-	ip += 4;
  }
  x = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
-  while(ip < p+n) { 
+  while(ip != p+n) { 
    *ip = (x += (*ip) + 1); 
 	ip++; 
  }
@ -188,18 +199,21 @@ void bitundx64(uint64_t *p, unsigned n, uint64_t x, unsigned inc) { BITUNDELTA(p
 	_x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\
  }\
  while(_p != __p+__n) { \
-	_x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\
+	_x = ((int)(*_p)-(int)__start);	_x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\
  }\
 }

-#define BITUNZIGZAG(__p, __n, __start) { typeof(__p[0]) *_p, _z;\
+#define BITUNZIGZAG(__p, __n, __start) {\
+  typeof(__p[0]) *_p, _z;\
  for(_p = __p; _p != __p+(__n&~(4-1)); ) {\
    _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
    _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
    _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
    _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\
  }\
-  while(_p != __p+__n) { _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++; }\
+  while(_p != __p+__n) {\
+    _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1)));	_p++;\
+  }\
 }

 unsigned bitz32(unsigned *in, unsigned n, unsigned start) { 
@ -216,10 +230,15 @@ unsigned bitz32(unsigned *in, unsigned n, unsigned start) {
  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
  HOR128_32(bv, b);
  while(ip != in+n) { 
-    int x = ((int)(*ip)-(int)start); x = (x << 1) ^ (x >> 31); start = *ip++; b |= x; 
+    int x = ((int)(*ip)-(int)start); 
+	x = (x << 1) ^ (x >> 31); 
+	start = *ip++; 
+	b |= x; 
  }
    #else
-  typeof(in[0]) b = 0,*op = out; int _x; BITZIGZAG(in, n, start, b |= (unsigned)_x);
+  typeof(in[0]) b = 0,*op = out; 
+  int _x; 
+  BITZIGZAG(in, n, start, b |= (unsigned)_x);
    #endif
  return bsr32(b);
 }
@ -228,22 +247,27 @@ unsigned bitzigzag32(unsigned *in, unsigned n, unsigned *out, unsigned start) {
    #ifdef __SSE2__
  unsigned *ip,b,*op = out; 
  __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), dv;
-  for(ip = in; ip != in+(n&~(4-1)); ip += 4) { 
+  for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) { 
    __m128i iv = _mm_loadu_si128((__m128i *)ip); 
 	dv = DELTA128_32(iv,sv); 
 	sv = iv; 
    dv = ZIGZAG128_32(dv); 
    bv = _mm_or_si128(bv, dv);
 	_mm_storeu_si128((__m128i *)op, dv); 
-	op += 4; 
  }
  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
  HOR128_32(bv, b);
  while(ip != in+n) { 
-    int x = ((int)(*ip)-(int)start); x = (x << 1) ^ (x >> 31); start = *ip++; b |= x; *op++ = x; 
+    int x = ((int)(*ip)-(int)start); 
+	x = (x << 1) ^ (x >> 31); 
+	start = *ip++; 
+	b |= x; 
+	*op++ = x; 
  }
    #else
-  typeof(in[0]) b = 0,*op = out; int _x; BITZIGZAG(in, n, start, b |= (unsigned)_x; *op++ = _x);
+  typeof(in[0]) b = 0, *op = out; 
+  int _x; 
+  BITZIGZAG(in, n, start, b |= (unsigned)_x; *op++ = _x);
    #endif
  return bsr32(b);
 }
@ -252,61 +276,81 @@ void bitunzigzag32(unsigned *p, unsigned n, unsigned start) {
    #ifdef __SSE2__
  __m128i sv = _mm_set1_epi32(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
  unsigned *ip;
-  for(ip = p; ip != p+(n&~(4-1)); ) {
+  for(ip = p; ip != p+(n&~(4-1)); ip += 4) {
    __m128i iv =  _mm_loadu_si128((__m128i *)ip); 
    iv = UNZIGZAG128_32(iv); 
 	SCAN128_32(iv, sv);
 	_mm_storeu_si128((__m128i *)ip, sv); 
-	ip += 4;
  }
  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
  while(ip != p+n) { 
-    unsigned z = *ip; *ip = (start += (z >> 1 ^ -(z & 1))); ip++;
+    unsigned z = *ip; 
+	*ip++ = (start += (z >> 1 ^ -(z & 1))); 
  }
    #else
  BITUNZIGZAG(p, n, start);
    #endif
 }

-unsigned bitzigzag64(unsigned *in, unsigned n, unsigned *out, unsigned start) {
-  typeof(in[0]) b = 0,*op = out; long long _x; BITZIGZAG(in, n, start, b |= (unsigned long long)_x; *op++ = _x);
+unsigned bitzigzag64(uint64_t *in, unsigned n, uint64_t *out, unsigned start) {
+  typeof(in[0]) b = 0,*op = out; 
+  long long _x; 
+  BITZIGZAG(in, n, start, b |= (unsigned long long)_x; *op++ = _x);
  return bsr32(b);
 }

-void bitunzigzag64(unsigned *p, unsigned n, unsigned start) { 
+void bitunzigzag64(uint64_t *p, unsigned n, unsigned start) { 
  BITUNZIGZAG(p, n, start);
 }

 //------------------- De-/Compose Floating Point -----------------------------------------
-void bitdouble(double *in, unsigned n, unsigned *sgn, unsigned *expo, uint64_t *mant) {
+void bitdouble(double *in, unsigned n, int *expo, uint64_t *mant) {
  double *ip;
-  uint64_t u;
  for(ip = in; ip < in+n; ip++) { 
-    u = *(uint64_t *)ip; BITFLOAT(u, *sgn++, *expo++, *mant++, DMANT_BITS, 1ull);
+    uint64_t u = *(uint64_t *)ip; 
+    *expo++ = FLTEXPO(u, DMANT_BITS, 1ull); 
+    *mant++ = FLTMANT(u, DMANT_BITS, 1ull);
  }
 }

-void bitundouble(unsigned *sgn, unsigned *expo, uint64_t *mant, unsigned n, double *out) {
+void bitundouble(int *expo, uint64_t *mant, unsigned n, double *out) {
  double *op; 
  uint64_t u;
-  for(op = out; op < out+n; op++) {
-    BITUNFLOAT((uint64_t)(*sgn++), (uint64_t)(*expo++), *mant++, u, DMANT_BITS); *op = *(double *)&u;
+  for(op = out; op < out+n; ) {
+    BITUNFLOAT( (int64_t)(*expo++), *mant++, u, DMANT_BITS); *op++ = *(double *)&u;
  }
 }

-void bitfloat(float *in, unsigned n, unsigned *sgn, unsigned *expo, unsigned *mant) {
-  float *ip;
-  unsigned u;
+void bitzdouble(double *in, unsigned n, int *expo, uint64_t *mant) {
+  double *ip;
  for(ip = in; ip < in+n; ip++) { 
-    u = *(unsigned *)ip; BITFLOAT(u, *sgn++, *expo++, *mant++, FMANT_BITS, 1u);
+    uint64_t u = *(uint64_t *)ip; 
+    *expo++ = zigzagenc32((int)FLTEXPO(u, DZMANT_BITS, 1ull)-1023);
+    *mant++ = FLTMANT(u, DZMANT_BITS, 1ull);
  }
 }

-void bitunfloat(unsigned *sgn, unsigned *expo, unsigned *mant, unsigned n, float *out) {
+void bitzundouble(int *expo, uint64_t *mant, unsigned n, double *out) {
+  double *op; 
+  uint64_t u;
+  for(op = out; op < out+n; ) {
+    BITUNFLOAT( (int64_t)zigzagdec32(*expo++)+1023, *mant++, u, DZMANT_BITS); *op++ = *(double *)&u;
+  }
+}
+
+void bitfloat(float *in, unsigned n, int *expo, unsigned *mant) {
+  float *ip;
+  for(ip = in; ip < in+n; ip++) { 
+    unsigned u = *(unsigned *)ip; 
+    *expo++ = FLTEXPO(u, FMANT_BITS, 1u);
+    *mant++ = FLTMANT(u, FMANT_BITS, 1u);
+  }
+}
+
+void bitunfloat(int *expo, unsigned *mant, unsigned n, float *out) {
  float *op; 
  unsigned u;
  for(op = out; op < out+n; op++) {
-    BITUNFLOAT((*sgn++), (*expo++), *mant++, u, FMANT_BITS); *op = *(float *)&u;
+    BITUNFLOAT( (*expo++), *mant++, u, FMANT_BITS); *op = *(float *)&u;
  }
 }
-
--- a/bitutil.h
+++ b/bitutil.h
@ -1,5 +1,5 @@
 /**
-    Copyright (C) powturbo 2013-2015
+    Copyright (C) powturbo 2013-2016
    GPL v2 License
  
    This program is free software; you can redistribute it and/or modify
@ -24,89 +24,103 @@
 //     bitutil.h - "Integer Compression" 
 #include <stdint.h>

-#define _BITFORZERO(out, n, start, inc) do {\
- for(i = 0; i != (n&~3); ) {\
-   out[i] = start+i*inc; i++;\
-   out[i] = start+i*inc; i++;\
-   out[i] = start+i*inc; i++;\
-   out[i] = start+i*inc; i++;\
- }\
- while(i < n) out[i] = start+i*inc,++i;\
+#define _BITFORZERO(_out_, _n_, _start_, _inc_) do { unsigned _i;\
+  for(_i = 0; _i != (_n_&~3); ) {\
+    _out_[_i] = _start_+_i*_inc_; _i++;\
+    _out_[_i] = _start_+_i*_inc_; _i++;\
+    _out_[_i] = _start_+_i*_inc_; _i++;\
+    _out_[_i] = _start_+_i*_inc_; _i++;\
+  }\
+  while(_i != _n_)\
+    _out_[_i] = _start_+_i*_inc_, ++_i;\
 } while(0)

-#define BITSIZE(__in, __n, __b, __usize) { typeof(__in[0]) *_ip;\
-  for(__b=0,_ip = __in; _ip != __in+(__n&~(4-1)); )\
-    __b |= *_ip++ | *_ip++ | *_ip++ | *_ip++;\
-  while(_ip != __in+__n) __b |= *_ip++;\
-  __b = TEMPLATE(bsr, __usize)(__b);\
+#define BITSIZE(_in_, _n_, _b_, _usize_) { typeof(_in_[0]) *_ip;\
+  for(_b_=0,_ip = _in_; _ip != _in_+(_n_&~(4-1)); )\
+    _b_ |= *_ip++ | *_ip++ | *_ip++ | *_ip++;\
+  while(_ip != _in_+_n_) \
+    _b_ |= *_ip++;\
+  _b_ = TEMPLATE(bsr, _usize_)(_b_);\
 }

-static inline unsigned zigzagenc32(int      x) { return x << 1 ^ x >> 31; }
-static inline unsigned zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); }
+static inline unsigned zigzagenc31(int      x) { x = (x << 2 | ((x>>30)&  2)) ^   x >> 31; return x; }
+static inline unsigned zigzagdec31(unsigned x) { return (x >> 2 |  (x&  2)<<30 ) ^ -(x &   1); }
+
+static inline unsigned zigzagenc32(int      x) { return x << 1 ^   x >> 31; }
+static inline unsigned zigzagdec32(unsigned x) { return x >> 1 ^ -(x &   1); }
+
+static inline uint64_t zigzagenc64(int64_t  x) { return x << 1 ^ x >> 63; }
+static inline uint64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); }

  #ifdef __SSE2__
 #include <emmintrin.h>
+// SIMD Delta
+#define DELTA128_32(_v_, _sv_) _mm_sub_epi32(_v_, _mm_or_si128(_mm_srli_si128(_sv_, 12), _mm_slli_si128(_v_, 4)))

-#define DELTA128_32(__v, __sv) _mm_sub_epi32(__v, _mm_or_si128(_mm_srli_si128(__sv, 12), _mm_slli_si128(__v, 4)))
+// SIMD Scan ( prefix sum ) 
+#define SCAN128_32( _v_, _sv_) _v_ = _mm_add_epi32(_v_, _mm_slli_si128(_v_, 4)); _sv_ = _mm_add_epi32(_mm_shuffle_epi32(_sv_, _MM_SHUFFLE(3, 3, 3, 3)), _mm_add_epi32(_mm_slli_si128(_v_, 8), _v_) )
+#define SCANI128_32(_v_, _sv_, _vi_) SCAN128_32(_v_, _sv_); _sv_ = _mm_add_epi32(_sv_, _vi_)

-#define SCAN128_32( __v, __sv) __v = _mm_add_epi32(__v, _mm_slli_si128(__v, 4)); __sv = _mm_add_epi32(_mm_shuffle_epi32(__sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm_add_epi32(_mm_slli_si128(__v, 8), __v) )
-#define SCANI128_32(__v, __sv, __vi) SCAN128_32(__v, __sv); __sv = _mm_add_epi32(__sv, __vi)
+// SIMD ZigZag
+#define   ZIGZAG128_32(_v_) _mm_xor_si128(_mm_slli_epi32(_v_,1), _mm_srai_epi32(_v_,31))
+#define UNZIGZAG128_32(_v_) _mm_xor_si128(_mm_srli_epi32(_v_,1), _mm_srai_epi32(_mm_slli_epi32(_v_,31),31) ) //_mm_sub_epi32(cz, _mm_and_si128(iv,c1))

-#define   ZIGZAG128_32(__v) _mm_xor_si128(_mm_slli_epi32(__v,1), _mm_srai_epi32(__v,31))
-#define UNZIGZAG128_32(__v) _mm_xor_si128(_mm_srli_epi32(__v,1), _mm_srai_epi32(_mm_slli_epi32(__v,31),31) ) //_mm_sub_epi32(cz, _mm_and_si128(iv,c1))
 // SIMD Horizontal OR
-#define HOR128_32(__v,__b) __v = _mm_or_si128(__v, _mm_srli_si128(__v, 8)); __v = _mm_or_si128(__v, _mm_srli_si128(__v, 4)); __b = (unsigned)_mm_cvtsi128_si32(__v)
+#define HOR128_32(_v_,_b_) _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 8)); _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 4)); _b_ = (unsigned)_mm_cvtsi128_si32(_v_)

-#define BITSIZE32(__in, __n, __b) { typeof(__in[0]) *_ip; __m128i v = _mm_setzero_si128();\
-  for(_ip = __in; _ip != __in+(__n&~(4-1)); _ip+=4) v = _mm_or_si128(v, _mm_loadu_si128((__m128i*)_ip));\
-  HOR128_32(v,__b);\
-  while(_ip != __in+__n) __b |= *_ip++;\
-  __b = bsr32(__b);\
+#define BITSIZE32(_in_, _n_, _b_) { typeof(_in_[0]) *_ip; __m128i _v = _mm_setzero_si128();\
+  for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip+=4)\
+    _v = _mm_or_si128(_v, _mm_loadu_si128((__m128i*)_ip));\
+  HOR128_32(_v,_b_);\
+  while(_ip != _in_+_n_)\
+    _b_ |= *_ip++;\
+  _b_ = bsr32(_b_);\
 }
-
-#define BITZERO32(out, n, start) do {\
-  __m128i sv = _mm_set1_epi32(start), *ov = (__m128i *)(out), *ove = (__m128i *)(out + n);\
-  do { _mm_storeu_si128(ov++, sv); } while(ov < ove); \
+// SIMD set value
+#define BITZERO32(_out_, _n_, _start_) do {\
+  __m128i _sv_ = _mm_set1_epi32(_start_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
+  do _mm_storeu_si128(_ov++, _sv_); while(_ov < _ove); \
 } while(0)

-#define BITFORZERO32(out, n, start, inc) do {\
-  __m128i sv = _mm_set1_epi32(start), *ov=(__m128i *)(out), *ove = (__m128i *)(out + n), cv = _mm_set_epi32(3*inc,2*inc,1*inc,0); \
-    sv = _mm_add_epi32(sv, cv);\
-    cv = _mm_set1_epi32(4);\
-  do { _mm_storeu_si128(ov++, sv); sv = _mm_add_epi32(sv, cv); } while(ov < ove);\
+#define BITFORZERO32(_out_, _n_, _start_, _inc_) do {\
+  __m128i _sv = _mm_set1_epi32(_start_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_), _cv = _mm_set_epi32(3*_inc_,2*_inc_,1*_inc_,0); \
+    _sv = _mm_add_epi32(_sv, _cv);\
+    _cv = _mm_set1_epi32(4);\
+  do { _mm_storeu_si128(_ov++, _sv); _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
 } while(0)

-#define BITDIZERO32(out, n, start, inc) do { __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(3+inc,2+inc,1+inc,inc), *ov=(__m128i *)(out), *ove = (__m128i *)(out + n);\
-  sv = _mm_add_epi32(sv, cv); cv = _mm_set1_epi32(4*inc); do { _mm_storeu_si128(ov++, sv), sv = _mm_add_epi32(sv, cv); } while(ov < ove);\
+#define BITDIZERO32(_out_, _n_, _start_, _inc_) do { __m128i _sv = _mm_set1_epi32(_start_), _cv = _mm_set_epi32(3+_inc_,2+_inc_,1+_inc_,_inc_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
+  _sv = _mm_add_epi32(_sv, _cv); _cv = _mm_set1_epi32(4*_inc_); do { _mm_storeu_si128(_ov++, _sv), _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
 } while(0)

  #else																					
-#define BITSIZE32(__in, __n, __b)        BITSIZE(__in, __n, __b, 32)
-#define BITFORZERO32(out, n, start, inc) _BITFORZERO(out, n, start, inc)
-#define BITZERO32(out, n, start)         _BITFORZERO(out, n, start, 0)
+#define BITSIZE32(_in_, _n_, _b_)        BITSIZE(_in_, _n_, _b_, 32)
+#define BITFORZERO32(_out_, _n_, _start_, _inc_) _BITFORZERO(_out_, _n_, _start_, _inc_)
+#define BITZERO32(_out_, _n_, _start_)         _BITFORZERO(_out_, _n_, _start_, 0)
  #endif

-
-#define DELTR( __in, __n, __mode,      __out) { unsigned _v; for(      __out[0]=__in[0],_v = 1;     _v < __n; _v++) __out[_v] = (__in[_v] - __out[0]) -   _v*__mode; }
-#define DELTRB(__in, __n, __mode, __b, __out) { unsigned _v; for(__b=0,__out[0]=__in[0],_v = 1;     _v < __n; _v++) __out[_v] = (__in[_v] - __out[0]) -   _v*__mode, __b |= __out[_v]; __b = bsr32(__b); }
+#define DELTR( _in_, _n_, _mode_,      _out_) { unsigned _v; for(      _out_[0]=_in_[0],_v = 1;     _v < _n_; _v++) _out_[_v] = (_in_[_v] - _out_[0]) -   _v*_mode_; }
+#define DELTRB(_in_, _n_, _mode_, _b_, _out_) { unsigned _v; for(_b_=0,_out_[0]=_in_[0],_v = 1;     _v < _n_; _v++) _out_[_v] = (_in_[_v] - _out_[0]) -   _v*_mode_, _b_ |= _out_[_v]; _b_ = bsr32(_b_); }

 #ifdef __cplusplus
 extern "C" {
 #endif

-// get maximum bit length of the elements in the integer array
+//------------- get maximum bit length of the elements in the integer array -----------------------
 unsigned bit32(     unsigned *in, unsigned n);  

-// transform sorted integer array to delta array. inc = increment
+//------------- Delta for sorted integer array ----------------------------------------------------
+//-- transform sorted integer array to delta array. inc = increment: out[i] = in[i] - in[i-1] - inc
 unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start, unsigned inc);
 unsigned bitdelta64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, unsigned inc);

-// get delta maximum bit length of the non decreasing integer array
+//-- get delta maximum bit length of the non decreasing integer array. out[i] = in[i] - in[i-1]
 unsigned bitd32(    unsigned *in, unsigned n, unsigned start);  

-// get delta maximum bit length of the non strictly decreasing integer array
+//-- get delta maximum bit length of the strictly increasing integer array. out[i] = in[i] - in[i-1] - 1
 unsigned bitd132(   unsigned *in, unsigned n, unsigned start);

+//-- in-place reverse delta transform 
 void bitund32(      unsigned *p, unsigned n, unsigned x);
 void bitund64(      uint64_t *p, unsigned n, uint64_t x);

@ -115,32 +129,47 @@ void bitundx64(     uint64_t *p, unsigned n, uint64_t x, unsigned inc);

 void bitund132(     unsigned *p, unsigned n, unsigned x);

-// for
+//------------- FOR array bit length: out[i] = in[i] - start -------------------------------------
+
 unsigned bitf32(    unsigned *in, unsigned n, unsigned start);  // sorted
 unsigned bitf132(   unsigned *in, unsigned n, unsigned start);
 unsigned bitfm32(   unsigned *in, unsigned n, unsigned *pmin);  // unsorted
 unsigned bitf1m32(  unsigned *in, unsigned n, unsigned *pmin);

-// zigzag encoding for unsorted integer lists
+//------------- Zigzag encoding for unsorted integer lists: out[i] = in[i] - in[i-1] -------------
+
+//-- get maximum zigzag bit length of the integer array
 unsigned bitz32(     unsigned *in, unsigned n, unsigned start);
+
+//-- Zigzag transform
 unsigned bitzigzag32(unsigned *in, unsigned n, unsigned *out, unsigned start);
-unsigned bitzigzag64(unsigned *in, unsigned n, unsigned *out, unsigned start);
+unsigned bitzigzag64(uint64_t *in, unsigned n, uint64_t *out, unsigned start);
+
+//-- Zigzag reverse transform
 void bitunzigzag32(  unsigned *p,  unsigned n, unsigned start);
-void bitunzigzag64(  unsigned *p,  unsigned n, unsigned start);
+void bitunzigzag64(  uint64_t *p,  unsigned n, unsigned start);

 //---- Floating point to Integer de-/composition ---------------------------------
+#define FMANT_BITS    16
+#define DMANT_BITS    32
+#define DZMANT_BITS   36

-#define FMANT_BITS    23
-#define DMANT_BITS    52

-#define BITFLOAT(__u, __sgn, __expo, __mant,      __mantbits, __one) __sgn = __u >> (sizeof(__u)*8-1); __expo = ((__u >> (__mantbits)) & ( (__one<<(sizeof(__u)*8 - 1 - __mantbits)) -1)); __mant = __u & ((__one<<__mantbits)-1);
-#define BITUNFLOAT(   __sgn, __expo, __mant, __u, __mantbits)        __u = (__sgn) << (sizeof(__u)*8-1) | (__expo) << __mantbits | (__mant) 
+#define FLTEXPO(__u,__mantbits, __one)  ( ((__u) >> __mantbits) & ( (__one<<(sizeof(__u)*8 - __mantbits)) - 1 ) )
+#define FLTMANT(__u,__mantbits, __one)    ((__u) & ((__one<<__mantbits)-1)) 
+
+#define BITUNFLOAT(__expo, __mant, __u, __mantbits) __u = ((__expo) << __mantbits) | (__mant)//>>1 | (__mant)<<(sizeof(__u)*8 - 1)
+
+/*#define BITFLOAT(__u, __sgn, __expo, __mant,      __mantbits, __one) __sgn = __u >> (sizeof(__u)*8-1); __expo = EXPO(__u,__mantbits; __mant = __u & ((__one<<__mantbits)-1)
+#define BITUNFLOAT(   __sgn, __expo, __mant, __u, __mantbits)        __u = (__sgn) << (sizeof(__u)*8-1) | (__expo) << __mantbits | (__mant) */

 // De-/Compose floating point array to/from integer arrays (sign,exponent,mantissa) for using with "Integer Compression" functions ------------
-void bitdouble(  double *in, unsigned n, unsigned *sgn, unsigned *expo, uint64_t *mant);
-void bitundouble(                        unsigned *sgn, unsigned *expo, uint64_t *mant, unsigned n, double *out);
-void bitfloat(   float *in,  unsigned n, unsigned *sgn, unsigned *expo, unsigned *mant);
-void bitunfloat(                         unsigned *sgn, unsigned *expo, unsigned *mant, unsigned n, float *out);
+void bitdouble(  double *in, unsigned n, int *expo, uint64_t *mant);
+void bitundouble(                        int *expo, uint64_t *mant, unsigned n, double *out);
+void bitzdouble( double *in, unsigned n, int *expo, uint64_t *mant);
+void bitzundouble(                       int *expo, uint64_t *mant, unsigned n, double *out);
+void bitfloat(   float *in,  unsigned n, int *expo, unsigned *mant);
+void bitunfloat(                         int *expo, unsigned *mant, unsigned n, float *out);

 #ifdef __cplusplus
 }
--- a/conf.h
+++ b/conf.h
@ -38,13 +38,19 @@
 #define popcnt64(_x_) 	__builtin_popcountll(_x_)

    #if defined(__i386__) || defined(__x86_64__)
-static inline int __bsr32(int x) {             asm("bsr  %1,%0" : "=r" (x) : "rm" (x) ); return x; }
-static inline int bsr32(  int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
-static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; }
-#define bsr16(_x_) bsr32(_x_)
+static inline int    __bsr32(               int x) {             asm("bsr  %1,%0" : "=r" (x) : "rm" (x) ); return x; }
+static inline int      bsr32(               int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
+static inline int      bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; }
+#define bsr16(_x_)     bsr32(_x_)
+
+static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
+
    #else
-static inline int bsr32(int x               ) { return x?32 - __builtin_clz(  x):0; }
-static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; }
+static inline int      bsr32(int x               ) { return x?32 - __builtin_clz(  x):0; }
+static inline int      bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; }
+static inline unsigned rol32(unsigned x, int s) { return x << (s & 31) | x >> ((32 - s) & 31); } // counts masked: s == 0 must not turn into an undefined shift by 32
+static inline unsigned ror32(unsigned x, int s) { return x >> (s & 31) | x << ((32 - s) & 31); } // same masking as rol32
    #endif

 #define ctz64(_x_) __builtin_ctzll(_x_)
@ -65,6 +71,8 @@ static inline int bsr64(unsigned long long x) { unsigned long z = 0; _BitScanFor
 static inline int ctz64(unsigned long long x) { unsigned long z = 0; _BitScanForward64(&z, x); return z; }
    #endif
 static inline int ctz32(unsigned           x) { unsigned      z = 0; _BitScanForward(&z, x); return z; }
+#define rol32(x,s) _lrotl(x, s)
+#define ror32(x,s) _lrotr(x, s)
 #define fseeko _fseeki64
 #define ftello _ftelli64
 #define sleep(x) Sleep(x/1000)
--- a/icbench.c
+++ b/icbench.c
@ -1,5 +1,5 @@
 /**
-    Copyright (C) powturbo 2013-2015
+    Copyright (C) powturbo 2013-2016
    GPL v2 License
  
    This program is free software; you can redistribute it and/or modify
@ -1080,8 +1080,8 @@ int main(int argc, char *argv[]) { int r;
          uint64_t *mantissa = malloc(n*sizeof(mantissa[0]));
          unsigned *sign     = malloc(n*sizeof(sign[0]));
          unsigned *exp      = malloc(n*sizeof(exp[0]));   if(!mantissa || !exp || !sign || !dcpy) die("alloc error\n");
-          bitdouble(   din, n, sign, exp, mantissa);
-          bitundouble(         sign, exp, mantissa, n, dcpy);
+          bitdouble(   din, n, exp, mantissa);
+          bitundouble(         exp, mantissa, n, dcpy);
          int i; for(i=0;i < n; i++) { printf("%d,%d,%llu,%e,%e\n", sign[i], exp[i],(long long unsigned int)mantissa[i], din[i], dcpy[i]); if(din[i]!=dcpy[i]) die("check error at %d %e %e\n", i, din[i], dcpy[i]); }
          free(din); free(mantissa); free(exp); free(sign); free(dcpy);
          exit(0);
--- a/vint.c
+++ b/vint.c
@ -1,5 +1,5 @@
 /**
-    Copyright (C) powturbo 2013-2015
+    Copyright (C) powturbo 2013-2016
    GPL v2 License
  
    This program is free software; you can redistribute it and/or modify
@ -22,67 +22,45 @@
    - email    : powturbo [_AT_] gmail [_DOT_] com
 **/
 //    vint.c - "Integer Compression" variable byte 
-#include <stdio.h>   
+#include <stdio.h>
     
 #include "conf.h"
 #include "vint.h"
 #include "bitutil.h"

-#define _vbputu32(__op, __x, __act) {\
-       if(likely(__x < (1<< 7))) {		   		   *__op++ = __x << 1; 			 		                      __act;}\
-  else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; 		              __act;}\
-  else if(likely(__x < (1<<21))) { *(unsigned       *)__op = __x << 3 | 0x03; __op += 3;                      __act;}\
-  else if(likely(__x < (1<<28))) { *(unsigned       *)__op = __x << 4 | 0x07; __op += 4; 		              __act;}\
-  else { 		                   *(unsigned       *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\
-}
-
-#define _vbgetu32(__ip, __x, __act) do {\
-  if(!((__x = *__ip) & (1<<0))) {  __ip++; __x	                              >>= 1;                  __act;}\
-  else if(!(__x      & (1<<1))) { __x = (*(unsigned short *)__ip)             >>  2;       __ip += 2; __act;}\
-  else if(!(__x      & (1<<2))) { __x = (*(unsigned       *)__ip & 0xffffffu) >>  3;       __ip += 3; __act;}\
-  else if(!(__x      & (1<<3))) { __x = (*(unsigned       *)__ip)             >>  4; 	   __ip += 4; __act;}\
-  else 			   	            { __x = (unsigned long long)(*(unsigned       *)__ip) >>  4 | (unsigned long long)(__ip[4]) << 28; __ip += 5; __act;}\
-} while(0)
-
-#define vbputu32(__op, __x) { unsigned _x_ = __x; _vbputu32(__op, _x_, ;); }
-
 //-------------------------------------- variable byte : 32 bits ----------------------------------------------------------------
-  #if defined(__AVX2__) && defined(__AVX2__VINT)
-#include <immintrin.h>
-#define M1         0xfeull //7
-#define M2       0xfffcull //14
-#define M3     0xfffff8ull //21
-#define M4   0xfffffff0ull //28
-#define M5 0xfffffffff0ull //36 
-
-			                 //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 					 
-unsigned long long mtab[] = {    M1,  M2,  M1,  M3,  M1,  M2,  M1,  M4,  M1,  M2,  M1,  M3,  M1,  M2,  M1,  M5 };
-  #endif
-//------------------------------------------------------------------------------------------------------------------------
 			                //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 		
-unsigned char vtab[] =      {    1,   2,   1,   3,   1,   2,   1,   4,   1,   2,   1,   3,   1,   2,   1,   5 };
+unsigned char vtab[] =      {    1,   1,   1,   1,   1,   1,   1,   1,   2,   2,   2,   2,   3,   3,   4,   5 }; // encoded length, indexed by the high nibble of the first byte (0x80:2, 0xc0:3, 0xe0:4, 0xf0:5 as written by _vbput32)

 // decompress buffer into an array of n unsigned values. Return value = end of compressed buffer in
-unsigned char *vbdec32(unsigned char  *__restrict in, unsigned n, unsigned *__restrict out) { unsigned x,*op;
-  for(op = out; op != out+(n&~(4-1)); op += 4) { 
-    _vbgetu32(in, x, op[0] = x);             
-    _vbgetu32(in, x, op[1] = x); 
-    _vbgetu32(in, x, op[2] = x); 
-    _vbgetu32(in, x, op[3] = x); 
+unsigned char *vbdec32(unsigned char  *__restrict in, unsigned n, unsigned *__restrict out) { register unsigned x, *op; 
+  for(op = out; op != out+(n&~(8-1)); op += 8) {
+    _vbget32(in, x, op[0] = x);
+    _vbget32(in, x, op[1] = x);
+    _vbget32(in, x, op[2] = x);
+    _vbget32(in, x, op[3] = x); __builtin_prefetch(in+256, 0);
+    _vbget32(in, x, op[4] = x);
+    _vbget32(in, x, op[5] = x);
+    _vbget32(in, x, op[6] = x);
+    _vbget32(in, x, op[7] = x);
  }
-  while(op != out+n) { _vbgetu32(in, x, ; ); *op++ = x; }
+  while(op != out+n) _vbget32(in, x, *op++ = x );
  return in;
 }

 // encode array with n unsigned (32 bits in[n]) values to the buffer out. Return value = end of compressed buffer out
-unsigned char *vbenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out) { unsigned *ip;
-  for(ip = in; ip != in+(n&~(4-1)); ) { 
-    vbputu32(out, *ip++);
-    vbputu32(out, *ip++);
-    vbputu32(out, *ip++);
-    vbputu32(out, *ip++);  
+unsigned char *vbenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out) { register unsigned x, *ip;
+  for(ip = in; ip != in+(n&~(8-1)); ip += 8) {  __builtin_prefetch(ip+128, 0);
+    x = ip[0]; _vbput32(out, x, ;);
+    x = ip[1]; _vbput32(out, x, ;);
+    x = ip[2]; _vbput32(out, x, ;);
+    x = ip[3]; _vbput32(out, x, ;);
+    x = ip[4]; _vbput32(out, x, ;);
+    x = ip[5]; _vbput32(out, x, ;);
+    x = ip[6]; _vbput32(out, x, ;);
+    x = ip[7]; _vbput32(out, x, ;);
  }
-  while(ip !=  in+n) vbputu32(out, *ip++);
+  while(ip !=  in+n) { x = *ip++; _vbput32(out, x, ;); }
  return out;
 }

@ -113,28 +91,28 @@ unsigned char *vbenc64(uint64_t *__restrict in, unsigned n, unsigned char *__res
 unsigned char *vbdenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start) { 
  unsigned *ip,v;
  for(ip = in; ip != in+(n&~(4-1)); ) {
-    v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;);
-    v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;);
-    v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;);
-    v = (*ip)-start; start=*ip++; _vbputu32(out, v, ;);
+    v = (*ip)-start; start=*ip++; _vbput32(out, v, ;);
+    v = (*ip)-start; start=*ip++; _vbput32(out, v, ;);
+    v = (*ip)-start; start=*ip++; _vbput32(out, v, ;);
+    v = (*ip)-start; start=*ip++; _vbput32(out, v, ;);
  }
-  while(ip <  in+n) { v = (*ip)-start; start = *ip++; _vbputu32(out, v, ;); }
+  while(ip <  in+n) { v = (*ip)-start; start = *ip++; _vbput32(out, v, ;); }
  return out;
 }  

 unsigned char *vbddec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) { 
  unsigned x,*op;
  for(op = out; op != out+(n&~(8-1)); ) {
-    _vbgetu32(in, x, ;); *op++ = (start += x);
-    _vbgetu32(in, x, ;); *op++ = (start += x);
-    _vbgetu32(in, x, ;); *op++ = (start += x);
-    _vbgetu32(in, x, ;); *op++ = (start += x);
-    _vbgetu32(in, x, ;); *op++ = (start += x);
-    _vbgetu32(in, x, ;); *op++ = (start += x);
-    _vbgetu32(in, x, ;); *op++ = (start += x);
-    _vbgetu32(in, x, ;); *op++ = (start += x);
+    _vbget32(in, x, ;); *op++ = (start += x);
+    _vbget32(in, x, ;); *op++ = (start += x);
+    _vbget32(in, x, ;); *op++ = (start += x);
+    _vbget32(in, x, ;); *op++ = (start += x);
+    _vbget32(in, x, ;); *op++ = (start += x);
+    _vbget32(in, x, ;); *op++ = (start += x);
+    _vbget32(in, x, ;); *op++ = (start += x);
+    _vbget32(in, x, ;); *op++ = (start += x);
  }
-  while(op != out+n) _vbgetu32(in, x, *op++ = (start += x));
+  while(op != out+n) _vbget32(in, x, *op++ = (start += x));
  return in;
 }

@ -147,21 +125,21 @@ unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__r
  v = in[0] - start - 1; 
  unsigned long long u = (unsigned long long)v<<1; 
  if(n == 1) u |= 1;
-  _vbputu32(op, u, ;);
+  _vbput32(op, u, ;);
  if(!--n) return op;
  start = *in++;
    #endif
  for(ip = in; ip != in + (n&~(4-1)); ) {
-    v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v;
-    v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v;
-    v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v;
-    v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v;
+    v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v;
+    v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v;
+    v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v;
+    v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v;
  }
-  while(ip != in+n) { v = (*ip)-start-1; start = *ip++; _vbputu32(op, v, ;); b |= v; } 
+  while(ip != in+n) { v = (*ip)-start-1; start = *ip++; _vbput32(op, v, ;); b |= v; } 
    #ifdef VINT_Z
  if(!b) { 
    u = (unsigned long long)in[-1] << 1 | 1;
-	_vbputu32(out, u, ;); 
+	_vbput32(out, u, ;); 
 	return out; 
  }
    #endif
@ -171,7 +149,7 @@ unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__r
 unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) { 
  unsigned x,*op;
    #ifdef VINT_Z
-  unsigned long long u; _vbgetu32(in, u, ;); x = u>>1; *out = (start += x+1);
+  unsigned long long u; _vbget32(in, u, ;); x = u>>1; *out = (start += x+1);
  if(u & 1) { 
      #ifdef __SSE2__
 	out++; --n; BITDIZERO32(out, n, start, 1);
@ -184,16 +162,16 @@ unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__r
    #endif

  for(op = out; op != out+(n&~(8-1)); ) {
-    _vbgetu32(in, x, ++x); *op++ = (start += x);
-    _vbgetu32(in, x, ++x); *op++ = (start += x);
-    _vbgetu32(in, x, ++x); *op++ = (start += x);
-    _vbgetu32(in, x, ++x); *op++ = (start += x);
-    _vbgetu32(in, x, ++x); *op++ = (start += x);
-    _vbgetu32(in, x, ++x); *op++ = (start += x);
-    _vbgetu32(in, x, ++x); *op++ = (start += x);
-    _vbgetu32(in, x, ++x); *op++ = (start += x);
+    _vbget32(in, x, ++x); *op++ = (start += x);
+    _vbget32(in, x, ++x); *op++ = (start += x);
+    _vbget32(in, x, ++x); *op++ = (start += x);
+    _vbget32(in, x, ++x); *op++ = (start += x);
+    _vbget32(in, x, ++x); *op++ = (start += x);
+    _vbget32(in, x, ++x); *op++ = (start += x);
+    _vbget32(in, x, ++x); *op++ = (start += x);
+    _vbget32(in, x, ++x); *op++ = (start += x);
  }
-  while(op != out+n) { _vbgetu32(in, x, ++x); *op++ = (start += x); }
+  while(op != out+n) { _vbget32(in, x, ++x); *op++ = (start += x); }
  return in;
 }

@ -208,27 +186,51 @@ unsigned char *vbdec16(unsigned char  *__restrict in, unsigned n, unsigned short
 unsigned char *vbzenc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start) { 
  unsigned *ip,v;
  for(ip = in; ip != in+(n&~(4-1)); ) {
-    v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;);
-    v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;);
-    v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;);
-    v = zigzagenc32((*ip)-start); start=*ip++; _vbputu32(out, v, ;);
+    v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;);
+    v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;);
+    v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;);
+    v = zigzagenc32((*ip)-start); start=*ip++; _vbput32(out, v, ;);
  }
-  while(ip <  in+n) { v = zigzagenc32((*ip)-start); start = *ip++; _vbputu32(out, v, ;); }
+  while(ip <  in+n) { v = zigzagenc32((*ip)-start); start = *ip++; _vbput32(out, v, ;); }
  return out;
 }  

 unsigned char *vbzdec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start) { 
  unsigned x,*op;
  for(op = out; op != out+(n&~(8-1)); ) {
-    _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
-    _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
-    _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
-    _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
-    _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
-    _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
-    _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
-    _vbgetu32(in, x, ;); *op++ = (start += zigzagdec32(x));
+    _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+    _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+    _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+    _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+    _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+    _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+    _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
+    _vbget32(in, x, ;); *op++ = (start += zigzagdec32(x));
  }
-  while(op != out+n) _vbgetu32(in, x, *op++ = (start += zigzagdec32(x)));
+  while(op != out+n) _vbget32(in, x, *op++ = (start += zigzagdec32(x)));
+  return in;
+}
+
+unsigned char *vbzenc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start) { 
+  uint64_t *ip,v;
+  for(ip = in; ip != in+(n&~(4-1)); ) {
+    v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;);
+    v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;);
+    v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;);
+    v = zigzagenc64((*ip)-start); start=*ip++; _vbput64(out, v, ;);
+  }
+  while(ip <  in+n) { v = zigzagenc64((*ip)-start); start = *ip++; _vbput64(out, v, ;); }
+  return out;
+}  
+
+unsigned char *vbzdec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start) {
+  uint64_t x,*op;
+  for(op = out; op != out+(n&~(4-1)); ) {
+    _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x));
+    _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x));
+    _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x));
+    _vbget64(in, x, ;); *op++ = (start += zigzagdec64(x));
+  }
+  while(op != out+n) _vbget64(in, x, *op++ = (start += zigzagdec64(x)));
  return in;
 }
--- a/vint.h
+++ b/vint.h
@ -1,5 +1,5 @@
 /**
-    Copyright (C) powturbo 2013-2015
+    Copyright (C) powturbo 2013-2016
    GPL v2 License
  
    This program is free software; you can redistribute it and/or modify
@ -31,39 +31,31 @@
 extern "C" {
 #endif

-//--------- 32 bits ------------------
+//--------------------------- 32 bits ---------------------------------------------------------------------------------------
 extern unsigned char vtab[];
-#define vbvlen32(__x) vtab[(__x)&0xf]
+#define vbvlen32(__x) vtab[((unsigned char)(__x))>>4]

 #define _vbput32(__op, __x, __act) {\
-       if(likely(__x < (1<< 7))) {		   		   *__op++ = __x << 1; 			 		                      __act;}\
-  else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; 		              __act;}\
-  else if(likely(__x < (1<<21))) { *(unsigned short *)__op = __x << 3 | 0x03; __op += 2; *__op++ = __x >> 13; __act;}\
-  else if(likely(__x < (1<<28))) { *(unsigned       *)__op = __x << 4 | 0x07; __op += 4; 		              __act;}\
-  else { 		                   *(unsigned       *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\
+       if(likely(__x < (1<< 7))) {		  *__op++ = __x; 			 		                                             __act;}\
+  else if(likely(__x < (1<<14))) { ctou16(__op)   = __x << 8 |  __x >>  8         | 0x80;                     __op += 2; __act;}\
+  else if(likely(__x < (1<<21))) {        *__op++ = __x >> 16                     | 0xc0; ctou32(__op) = __x; __op += 2; __act;}\
+  else if(likely(__x < (1<<28))) { ctou32(__op)   = rol32(__x,8)                  | 0xe0;                     __op += 4; __act;}\
+  else { 		                          *__op++ = (unsigned long long)__x >> 32 | 0xf0; ctou32(__op) = __x; __op += 4; __act;}\
 }

-//#define __AVX2__VINT
-  #if defined(__AVX2__) && defined(__AVX2__VINT)
-#include <immintrin.h>
-
-extern unsigned long long mtab[];
-
-#define _vbget32(__ip, __x, __act) do { unsigned _vdx=(*__ip)&0xf; __x = _pext_u64(*(unsigned long long *)__ip, mtab[_vdx]); __ip+=vtab[_vdx]; __act; } while(0)
-  #else
-#define _vbget32(__ip, __x, __act) do {\
-  if(!((__x = *__ip) & (1<<0))) {  __ip++; __x	                  >>= 1; 		                                  __act;}\
-  else if(!(__x      & (1<<1))) { __x = (*(unsigned short *)__ip) >>  2;		           __ip += 2;             __act;}\
-  else if(!(__x      & (1<<2))) { __x = (*(unsigned short *)__ip) >>  3 | (unsigned)(*(__ip+2)) << 13; __ip += 3; __act;}\
-  else if(!(__x      & (1<<3))) { __x = (*(unsigned       *)__ip) >>  4; 		      	   __ip += 4;             __act;}\
-  else 			   	            { __x = (unsigned long long)(*(unsigned       *)__ip) >>  4 | (unsigned long long)(__ip[4]) << 28; __ip += 5;             __act;}\
+#define _vbget32(__ip, __x, __act) do { __x = *__ip++;\
+       if(!(__x & 0x80)) {   								        							 __act;}\
+  else if(!(__x & 0x40)) { __x = (__x & 0x3f)<< 8 | *__ip++; 								     __act;}\
+  else if(!(__x & 0x20)) { __x = (__x & 0x1f)<<16 | ctou16(__ip); 	    		      __ip += 2; __act;}\
+  else if(!(__x & 0x10)) { __x = ror32(ctou32(__ip-1),8) & 0xfffffff;				  __ip += 3; __act;}\
+  else 			   	     { __x = (unsigned long long)(__x & 0x07)<<32 | ctou32(__ip); __ip += 4; __act;}\
 } while(0)
-  #endif

-//----------------- 16 bits --------------------------
+//----------------- 16 bits -------------------------------------------------------------------------------------------------------
 #define _vbput16(__op, __x)        _vbput32(__op, __x)
 #define _vbget16(__ip, __x, __act) _vbget32(__ip, __x, __act)
-//----------------- 64 bits --------------------------
+
+//----------------- 64 bits -------------------------------------------------------------------------------------------------------
 #define _vbput64(__op, __x, __act) {\
       if(__x < 1   << 7) {		   		        *__op++ = __x << 1; 			 		                                                                   __act;}\
  else if(__x < 1   <<14) { *(unsigned short     *)__op = __x << 2 | 0x01; __op += 2; 		             	                                               __act;}\
@ -96,8 +88,8 @@ extern unsigned long long mtab[];
 #define  vbput16(__op, __x)  vbput32(__op, __x)
 #define  vbget16(__ip)       vbget32(__ip)

-#define vbput32(__op, __x) { unsigned _x_ = __x; _vbput32(__op, _x_, ;); }
-#define vbget32(__ip)     ({ unsigned _x_;       _vbget32(__ip, _x_, ;); _x_; })
+#define vbput32(__op, __x) { register unsigned _x_ = __x; _vbput32(__op, _x_, ;); }
+#define vbget32(__ip)     ({ register unsigned _x_;       _vbget32(__ip, _x_, ;); _x_; })

 #define vbput64(__op, __x) { unsigned long long _x_ = __x; _vbput64(__op, _x_, ;); }
 #define vbget64(__ip)     ({ unsigned long long _x_;       _vbget64(__ip, _x_, ;); _x_; })
@ -122,6 +114,8 @@ unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned
 //------ zigzag encoding integer lists -------------------------------------------------------------
 unsigned char *vbzenc32(unsigned       *__restrict in, unsigned n, unsigned char  *__restrict out, unsigned start);
 unsigned char *vbzdec32(unsigned char  *__restrict in, unsigned n, unsigned       *__restrict out, unsigned start);
+unsigned char *vbzenc64(uint64_t       *__restrict in, unsigned n, unsigned char  *__restrict out, uint64_t start);
+unsigned char *vbzdec64(unsigned char  *__restrict in, unsigned n, uint64_t       *__restrict out, uint64_t start);

 //--- 15 bits integer lists ------------
 #define vbput15(__op, __x) do { unsigned _x = __x; if(likely(_x < 0x80)) *__op++ = _x; else { *__op++ = (_x) >> 8 | 0x80; *__op++ = _x; } } while(0)