.
This commit is contained in:
310
bitpack.h
310
bitpack.h
@ -1,310 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// bitpack.h - "Integer Compression" Binary Packing header file
|
||||
#ifndef BITPACK_H_
|
||||
#define BITPACK_H_
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//******************** Bit Packing High Level API - n unlimited ***************************************************
|
||||
size_t bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitndpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitnd1pack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitnzpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitnfpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnunpack128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitndunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitndunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitndunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitnd1unpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnd1unpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnd1unpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnd1unpack128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnd1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnd1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t bitnzunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitnzunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnzunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnzunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t bitnfunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitnfunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnfunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
//******** Bit Packing Low level API ****************************************************************
|
||||
// bipackNN: Pack array with n unsigned (NN bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
|
||||
unsigned char *bitpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
|
||||
|
||||
// delta bit packing
|
||||
unsigned char *bitdpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitdpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitdpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitdpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
unsigned char *bitd1pack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitd1pack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitd1pack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitd1pack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// FOR bit packing : sorted integer array
|
||||
unsigned char *bitfpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitfpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitfpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitfpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
unsigned char *bitf1pack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitf1pack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitf1pack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitf1pack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// zigzag : unsorted integer array
|
||||
unsigned char *bitzpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitzpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitzpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitzpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
//-------------------------------------- SIMD ------------------------------------------------------------------------------------------
|
||||
// Pack array of n unsigned (16/32/64 bits in[n]) values to the buffer out using nbits per value (128/256-bit SIMD lanes). Return value = end of compressed buffer out
|
||||
unsigned char *bitpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitdpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitfpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitzpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
|
||||
unsigned char *bitpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitdpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitzpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
//unsigned char *bitpack256w32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitpack128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
|
||||
unsigned char *bitpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitdpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitzpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
//********************************** Bit Packing : Unpack ****************************************************************
|
||||
|
||||
// ---------------- Unpack a b-bits packed integer array -------------------------------------------------------------------------------
|
||||
// unpack a bitpacked integer array. Return value = end of packed buffer in
|
||||
unsigned char *bitunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b);
|
||||
|
||||
// ---------------- Direct Access to a single packed integer array entry --------------------------------------------------------------
|
||||
#ifdef TURBOPFOR_DAC
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
|
||||
#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
|
||||
#else
|
||||
#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||
#define bzhi32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
|
||||
#endif
|
||||
|
||||
#include "conf.h"
|
||||
|
||||
// Direct access: fetch the b-bit value at position idx from a 32-bit bitpacked buffer.
// bidx is the absolute bit offset; a 64-bit window is read at the containing 32-bit word,
// shifted down to the in-word bit offset, then masked to b bits via bzhi64 (b <= 32).
// Assumes the buffer is padded so the unaligned 64-bit read is safe — TODO confirm callers.
// NOTE(review): bidx = b*idx is computed in 32 bits and overflows for very large b*idx.
static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
|
||||
//static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx;
|
||||
//return (ctou64((uint32_t *)in+(bidx>>5)) << 32+(bidx&0x1f)) >> (64-b);
|
||||
// return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
|
||||
// Same as bitgetx32 but takes the precomputed bit offset bidx directly as 64 bits
// (avoids the 32-bit b*idx multiply and its overflow limit).
static ALWAYS_INLINE unsigned _bitgetx32(const unsigned char *__restrict in, uint64_t bidx, unsigned b) { return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
|
||||
|
||||
// like bitgetx32 but for 8 and 16 bits integer arrays
|
||||
// Direct access for an 8-bit packed array: read a 16-bit window at the containing
// 16-bit word, shift to the in-word bit offset, mask to b bits.
// NOTE(review): when (bidx & 0xf) + b > 16 the value spans past the 16-bit window
// returned by ctou16 (e.g. b=7, bidx=15) — verify such combinations cannot occur,
// or the read window needs widening.
static ALWAYS_INLINE unsigned bitgetx8( const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou16((uint16_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
|
||||
// bitgetx8 variant taking the precomputed bit offset bidx directly
// (same 16-bit read-window caveat as bitgetx8 — see NOTE there).
static ALWAYS_INLINE unsigned _bitgetx8( const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou16((uint16_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
|
||||
static ALWAYS_INLINE unsigned bitgetx16(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou32((uint32_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
|
||||
static ALWAYS_INLINE unsigned _bitgetx16(const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou32((uint32_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
|
||||
|
||||
// Set a single value with index "idx"
|
||||
// Direct-access write: overwrite the b-bit slot at index idx of a 16-bit packed
// buffer with v. Caller must guarantee v < 2^b (v is OR-ed in unmasked).
// NOTE(review): the (unsigned *) cast strides 4 bytes per 16-bit word index and a
// full 32-bit word is read-modified-written — verify this matches the packer layout.
// NOTE(review): writes through a pointer declared const (const is cast away) and the
// store may be unaligned — UB on strict-alignment targets without ctou-style access.
static ALWAYS_INLINE void bitsetx16(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) { unsigned bidx = b*idx; unsigned *p = (unsigned *) in+(bidx>>4) ; *p = ( *p & ~(((1u <<b)-1) << (bidx& 0xf)) ) | v<<(bidx& 0xf);}
|
||||
// Direct-access write for a 32-bit packed buffer: read-modify-write the 64-bit word
// containing the b-bit slot at index idx — clear the slot, then OR in v.
// Caller must guarantee v < 2^b and b <= 32; the buffer needs tail padding for the
// 64-bit window.
// NOTE(review): writes through a pointer declared const (const is cast away) and the
// unaligned 64-bit store is UB on strict-alignment targets without ctou-style access.
static ALWAYS_INLINE void bitsetx32(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) { unsigned bidx = b*idx; unsigned long long *p = (unsigned long long *)((unsigned *)in+(bidx>>5)); *p = ( *p & ~(((1ull<<b)-1) << (bidx&0x1f)) ) | (unsigned long long)v<<(bidx&0x1f);}
|
||||
#endif
|
||||
// ---------------- DFOR : integrated bitpacking, for delta packed SORTED array (Ex. DocId in inverted index) -------------------------------
|
||||
// start <= out[0] <= out[1] <= ... <= out[n-2] <= out[n-1] <= (1<<N)-1 N=8,16,32 or 64
|
||||
// out[0] = start + in[0]; out[1] = out[0] + in[1]; ... ; out[i] = out[i-1] + in[i]
|
||||
unsigned char *bitdunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitdunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// start < out[0] < out[1] < ... < out[n-2] < out[n-1] < (1<<N)-1, N=8,16,32 or 64
|
||||
// out[0] = start + in[0] + 1; out[1] = out[0] + in[1] + 1; ... ; out[i] = out[i-1] + in[i] + 1
|
||||
unsigned char *bitd1unpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitd1unpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// ---------------- ZigZag : integrated bitpacking, for zigzag packed unsorted
|
||||
unsigned char *bitzunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitzunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// ---------------- For : Direct Access for packed SORTED array --------------------------------------------
|
||||
// out[i] = start + in[i] + i
|
||||
unsigned char *bitfunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitfunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// out[i] = start + in[i] + i + 1
|
||||
unsigned char *bitf1unpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitf1unpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// ---------------- SIMD : unpack a SIMD bit packed integer array -------------------------------------------------------------------------------
|
||||
// SIMD unpack a 128/256 bitpacked integer array. Return value = end of packed buffer in
|
||||
unsigned char *bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitd1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitf1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
|
||||
unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
unsigned char *bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b);
|
||||
|
||||
unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
unsigned char *bitunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack128h32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
// internal TurboPFor functions: masked unpack
|
||||
unsigned char *_bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
|
||||
|
||||
unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
|
||||
unsigned char *_bitunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack128h32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
|
||||
//unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned char *bb);
|
||||
|
||||
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
547
bitutil.h
547
bitutil.h
@ -1,547 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression: max.bits, delta, zigzag, xor"
|
||||
|
||||
#ifdef BITUTIL_IN
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
#elif defined(__AVX__)
|
||||
#include <immintrin.h>
|
||||
#elif defined(__SSE4_1__)
|
||||
#include <smmintrin.h>
|
||||
#elif defined(__SSSE3__)
|
||||
#ifdef __powerpc64__
|
||||
#define __SSE__ 1
|
||||
#define __SSE2__ 1
|
||||
#define __SSE3__ 1
|
||||
#define NO_WARN_X86_INTRINSICS 1
|
||||
#endif
|
||||
#include <tmmintrin.h>
|
||||
#elif defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#elif defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#include "sse_neon.h"
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
#define PREFETCH(_ip_,_rw_)
|
||||
#else
|
||||
#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
|
||||
#endif
|
||||
//------------------------ zigzag encoding -------------------------------------------------------------
|
||||
// Zigzag-encode an 8-bit signed value: maps 0,-1,1,-2,... to 0,1,2,3,... so small
// magnitudes become small unsigned codes (good for bit packing).
// Fix: compute on the unsigned type — left-shifting a negative signed value is
// undefined behavior in C (C11 6.5.7). Result is bit-identical to the old form.
static inline unsigned char zigzagenc8( signed char x) { unsigned char u = (unsigned char)x; return (unsigned char)((u << 1) ^ (x < 0 ? 0xFFu : 0u)); }
|
||||
// Zigzag-decode an 8-bit code back to signed: 0,1,2,3,... -> 0,-1,1,-2,...
// The low bit selects the sign; XOR with an all-ones mask when it is set.
static inline char zigzagdec8( unsigned char x) { unsigned char magnitude = x >> 1; unsigned char sign = (unsigned char)(0u - (x & 1u)); return (char)(magnitude ^ sign); }
|
||||
|
||||
// Zigzag-encode a 16-bit signed value: 0,-1,1,-2,... -> 0,1,2,3,...
// Fix: compute on the unsigned type — left-shifting a negative signed value is
// undefined behavior in C (C11 6.5.7). Result is bit-identical to the old form.
static inline unsigned short zigzagenc16(short x) { unsigned short u = (unsigned short)x; return (unsigned short)((u << 1) ^ (x < 0 ? 0xFFFFu : 0u)); }
|
||||
// Zigzag-decode a 16-bit code back to signed: 0,1,2,3,... -> 0,-1,1,-2,...
// The low bit selects the sign; XOR with an all-ones mask when it is set.
static inline short zigzagdec16(unsigned short x) { unsigned short magnitude = x >> 1; unsigned short sign = (unsigned short)(0u - (x & 1u)); return (short)(magnitude ^ sign); }
|
||||
|
||||
static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; }
|
||||
static inline int zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); }
|
||||
|
||||
static inline uint64_t zigzagenc64(int64_t x) { return x << 1 ^ x >> 63; }
|
||||
static inline int64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); }
|
||||
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16(v,15)); }
|
||||
static ALWAYS_INLINE __m128i mm_zzage_epi32(__m128i v) { return _mm_xor_si128( mm_slli_epi32(v,1), mm_srai_epi32(v,31)); }
|
||||
//static ALWAYS_INLINE __m128i mm_zzage_epi64(__m128i v) { return _mm_xor_si128( mm_slli_epi64(v,1), _mm_srai_epi64(v,63)); }
|
||||
|
||||
static ALWAYS_INLINE __m128i mm_zzagd_epi16(__m128i v) { return _mm_xor_si128( mm_srli_epi16(v,1), mm_srai_epi16( mm_slli_epi16(v,15),15) ); }
|
||||
static ALWAYS_INLINE __m128i mm_zzagd_epi32(__m128i v) { return _mm_xor_si128( mm_srli_epi32(v,1), mm_srai_epi32( mm_slli_epi32(v,31),31) ); }
|
||||
//static ALWAYS_INLINE __m128i mm_zzagd_epi64(__m128i v) { return _mm_xor_si128(mm_srli_epi64(v,1), _mm_srai_epi64( m_slli_epi64(v,63),63) ); }
|
||||
|
||||
#endif
|
||||
#ifdef __AVX2__
|
||||
static ALWAYS_INLINE __m256i mm256_zzage_epi32(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi32(v,1), _mm256_srai_epi32(v,31)); }
|
||||
static ALWAYS_INLINE __m256i mm256_zzagd_epi32(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi32(v,1), _mm256_srai_epi32(_mm256_slli_epi32(v,31),31) ); }
|
||||
#endif
|
||||
|
||||
//-------------- AVX2 delta + prefix sum (scan) / xor encode/decode ---------------------------------------------------------------------------------------
|
||||
#ifdef __AVX2__
|
||||
static ALWAYS_INLINE __m256i mm256_delta_epi32(__m256i v, __m256i sv) { return _mm256_sub_epi32(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
|
||||
static ALWAYS_INLINE __m256i mm256_delta_epi64(__m256i v, __m256i sv) { return _mm256_sub_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); }
|
||||
static ALWAYS_INLINE __m256i mm256_xore_epi32( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
|
||||
static ALWAYS_INLINE __m256i mm256_xore_epi64( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); }
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_scan_epi32(__m256i v, __m256i sv) {
|
||||
v = _mm256_add_epi32(v, _mm256_slli_si256(v, 4));
|
||||
v = _mm256_add_epi32(v, _mm256_slli_si256(v, 8));
|
||||
return _mm256_add_epi32( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
|
||||
_mm256_add_epi32(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));
|
||||
}
|
||||
static ALWAYS_INLINE __m256i mm256_xord_epi32(__m256i v, __m256i sv) {
|
||||
v = _mm256_xor_si256(v, _mm256_slli_si256(v, 4));
|
||||
v = _mm256_xor_si256(v, _mm256_slli_si256(v, 8));
|
||||
return _mm256_xor_si256( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
|
||||
_mm256_xor_si256(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_scan_epi64(__m256i v, __m256i sv) {
|
||||
v = _mm256_add_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
|
||||
return _mm256_add_epi64(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_add_epi64(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
|
||||
}
|
||||
static ALWAYS_INLINE __m256i mm256_xord_epi64(__m256i v, __m256i sv) {
|
||||
v = _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
|
||||
return _mm256_xor_si256(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_xor_si256(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_scani_epi32(__m256i v, __m256i sv, __m256i vi) { return _mm256_add_epi32(mm256_scan_epi32(v, sv), vi); }
|
||||
#endif
|
||||
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_alignr_epi8(v, sv, 14)); }
|
||||
static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_alignr_epi8(v, sv, 12)); }
|
||||
static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 14)); }
|
||||
static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 12)); }
|
||||
|
||||
#define MM_HDEC_EPI32(_v_,_sv_,_hop_) { _v_ = _hop_(_v_, _mm_slli_si128(_v_, 4)); _v_ = _hop_(mm_shuffle_nnnn_epi32(_sv_, 3), _hop_(_mm_slli_si128(_v_, 8), _v_)); }
|
||||
static ALWAYS_INLINE __m128i mm_scan_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_add_epi32); return v; }
|
||||
static ALWAYS_INLINE __m128i mm_xord_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_xor_si128); return v; }
|
||||
|
||||
#define MM_HDEC_EPI16(_v_,_sv_,_hop_) {\
|
||||
_v_ = _hop_( _v_, _mm_slli_si128(_v_, 2));\
|
||||
_v_ = _hop_( _v_, _mm_slli_si128(_v_, 4));\
|
||||
_v_ = _hop_(_hop_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi16(0x0f0e)));\
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m128i mm_scan_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_add_epi16); return v; }
|
||||
static ALWAYS_INLINE __m128i mm_xord_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_xor_si128); return v; }
|
||||
//-------- scan with vi delta > 0 -----------------------------
|
||||
static ALWAYS_INLINE __m128i mm_scani_epi16(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi16(mm_scan_epi16(v, sv), vi); }
|
||||
static ALWAYS_INLINE __m128i mm_scani_epi32(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi32(mm_scan_epi32(v, sv), vi); }
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
|
||||
static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
|
||||
static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
|
||||
static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
|
||||
#endif
|
||||
|
||||
#if !defined(_M_X64) && !defined(__x86_64__) && defined(__AVX__)
|
||||
#define _mm256_extract_epi64(v, index) ((__int64)((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2) | (((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2 + 1)) << 32)))
|
||||
#endif
|
||||
|
||||
//------------------ Horizontal OR -----------------------------------------------
|
||||
#ifdef __AVX2__
|
||||
static ALWAYS_INLINE unsigned mm256_hor_epi32(__m256i v) {
|
||||
v = _mm256_or_si256(v, _mm256_srli_si256(v, 8));
|
||||
v = _mm256_or_si256(v, _mm256_srli_si256(v, 4));
|
||||
return _mm256_extract_epi32(v,0) | _mm256_extract_epi32(v, 4);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
|
||||
v = _mm256_or_si256(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(2, 0, 0, 1)));
|
||||
return _mm256_extract_epi64(v, 1) | _mm256_extract_epi64(v,0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define MM_HOZ_EPI16(v,_hop_) {\
|
||||
v = _hop_(v, _mm_srli_si128(v, 8));\
|
||||
v = _hop_(v, _mm_srli_si128(v, 6));\
|
||||
v = _hop_(v, _mm_srli_si128(v, 4));\
|
||||
v = _hop_(v, _mm_srli_si128(v, 2));\
|
||||
}
|
||||
|
||||
#define MM_HOZ_EPI32(v,_hop_) {\
|
||||
v = _hop_(v, _mm_srli_si128(v, 8));\
|
||||
v = _hop_(v, _mm_srli_si128(v, 4));\
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint16_t mm_hor_epi16( __m128i v) { MM_HOZ_EPI16(v,_mm_or_si128); return (unsigned short)_mm_cvtsi128_si32(v); }
|
||||
static ALWAYS_INLINE uint32_t mm_hor_epi32( __m128i v) { MM_HOZ_EPI32(v,_mm_or_si128); return (unsigned )_mm_cvtsi128_si32(v); }
|
||||
static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _mm_srli_si128(v, 8)); return (uint64_t )_mm_cvtsi128_si64(v); }
|
||||
#endif
|
||||
|
||||
//----------------- sub / add ----------------------------------------------------------
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define SUBI16x8(_v_, _sv_) _mm_sub_epi16(_v_, _sv_)
|
||||
#define SUBI32x4(_v_, _sv_) _mm_sub_epi32(_v_, _sv_)
|
||||
#define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
|
||||
#define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
|
||||
|
||||
//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
|
||||
static ALWAYS_INLINE uint8_t mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
|
||||
static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
|
||||
#endif
|
||||
|
||||
//--------- memset -----------------------------------------
|
||||
#define BITFORSET_(_out_, _n_, _start_, _mindelta_) do { unsigned _i;\
|
||||
for(_i = 0; _i != (_n_&~3); _i+=4) { \
|
||||
_out_[_i+0] = _start_+(_i )*_mindelta_; \
|
||||
_out_[_i+1] = _start_+(_i+1)*_mindelta_; \
|
||||
_out_[_i+2] = _start_+(_i+2)*_mindelta_; \
|
||||
_out_[_i+3] = _start_+(_i+3)*_mindelta_; \
|
||||
} \
|
||||
while(_i != _n_) \
|
||||
_out_[_i] = _start_+_i*_mindelta_, ++_i; \
|
||||
} while(0)
|
||||
|
||||
//--------- SIMD zero -----------------------------------------
|
||||
#ifdef __AVX2__
|
||||
#define BITZERO32(_out_, _n_, _start_) do {\
|
||||
__m256i _sv_ = _mm256_set1_epi32(_start_), *_ov = (__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\
|
||||
do _mm256_storeu_si256(_ov++, _sv_); while(_ov < _ove);\
|
||||
} while(0)
|
||||
|
||||
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) do {\
|
||||
__m256i _sv = _mm256_set1_epi32(_start_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_), _cv = _mm256_set_epi32(7+_mindelta_,6+_mindelta_,5+_mindelta_,4+_mindelta_,3*_mindelta_,2*_mindelta_,1*_mindelta_,0); \
|
||||
_sv = _mm256_add_epi32(_sv, _cv);\
|
||||
_cv = _mm256_set1_epi32(4);\
|
||||
do { _mm256_storeu_si256(_ov++, _sv); _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\
|
||||
} while(0)
|
||||
|
||||
#define BITDIZERO32(_out_, _n_, _start_, _mindelta_) do { __m256i _sv = _mm256_set1_epi32(_start_), _cv = _mm256_set_epi32(7+_mindelta_,6+_mindelta_,5+_mindelta_,4+_mindelta_,3+_mindelta_,2+_mindelta_,1+_mindelta_,_mindelta_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\
|
||||
_sv = _mm256_add_epi32(_sv, _cv); _cv = _mm256_set1_epi32(4*_mindelta_); do { _mm256_storeu_si256(_ov++, _sv), _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\
|
||||
} while(0)
|
||||
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON) // -------------
|
||||
// SIMD set value (memset)
|
||||
#define BITZERO32(_out_, _n_, _v_) do {\
|
||||
__m128i _sv_ = _mm_set1_epi32(_v_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
|
||||
do _mm_storeu_si128(_ov++, _sv_); while(_ov < _ove); \
|
||||
} while(0)
|
||||
|
||||
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) do {\
|
||||
__m128i _sv = _mm_set1_epi32(_start_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_), _cv = _mm_set_epi32(3*_mindelta_,2*_mindelta_,1*_mindelta_,0); \
|
||||
_sv = _mm_add_epi32(_sv, _cv);\
|
||||
_cv = _mm_set1_epi32(4);\
|
||||
do { _mm_storeu_si128(_ov++, _sv); _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
|
||||
} while(0)
|
||||
|
||||
#define BITDIZERO32(_out_, _n_, _start_, _mindelta_) do { __m128i _sv = _mm_set1_epi32(_start_), _cv = _mm_set_epi32(3+_mindelta_,2+_mindelta_,1+_mindelta_,_mindelta_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
|
||||
_sv = _mm_add_epi32(_sv, _cv); _cv = _mm_set1_epi32(4*_mindelta_); do { _mm_storeu_si128(_ov++, _sv), _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
|
||||
} while(0)
|
||||
#else
|
||||
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) BITFORSET_(_out_, _n_, _start_, _mindelta_)
|
||||
#define BITZERO32( _out_, _n_, _start_) BITFORSET_(_out_, _n_, _start_, 0)
|
||||
#endif
|
||||
|
||||
#define DELTR( _in_, _n_, _start_, _mindelta_, _out_) { unsigned _v; for( _v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_); }
|
||||
#define DELTRB(_in_, _n_, _start_, _mindelta_, _b_, _out_) { unsigned _v; for(_b_=0,_v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_), _b_ |= _out_[_v]; _b_ = bsr32(_b_); }
|
||||
|
||||
//----------------------------------------- bitreverse scalar + SIMD -------------------------------------------
|
||||
#if __clang__ && defined __has_builtin
|
||||
#if __has_builtin(__builtin_bitreverse64)
|
||||
#define BUILTIN_BITREVERSE
|
||||
#else
|
||||
#define BUILTIN_BITREVERSE
|
||||
#endif
|
||||
#endif
|
||||
#ifdef BUILTIN_BITREVERSE
|
||||
#define rbit8(x) __builtin_bitreverse8( x)
|
||||
#define rbit16(x) __builtin_bitreverse16(x)
|
||||
#define rbit32(x) __builtin_bitreverse32(x)
|
||||
#define rbit64(x) __builtin_bitreverse64(x)
|
||||
#else
|
||||
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
static ALWAYS_INLINE uint32_t _rbit_(uint32_t x) { uint32_t rc; __asm volatile ("rbit %0, %1" : "=r" (rc) : "r" (x) ); }
|
||||
#endif
|
||||
static ALWAYS_INLINE uint8_t rbit8(uint8_t x) {
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
return _rbit_(x) >> 24;
|
||||
#elif 0
|
||||
x = (x & 0xaa) >> 1 | (x & 0x55) << 1;
|
||||
x = (x & 0xcc) >> 2 | (x & 0x33) << 2;
|
||||
return x << 4 | x >> 4;
|
||||
#else
|
||||
return (x * 0x0202020202ull & 0x010884422010ull) % 1023;
|
||||
#endif
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint16_t rbit16(uint16_t x) {
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
return _rbit_(x) >> 16;
|
||||
#else
|
||||
x = (x & 0xaaaa) >> 1 | (x & 0x5555) << 1;
|
||||
x = (x & 0xcccc) >> 2 | (x & 0x3333) << 2;
|
||||
x = (x & 0xf0f0) >> 4 | (x & 0x0f0f) << 4;
|
||||
return x << 8 | x >> 8;
|
||||
#endif
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
return _rbit_(x);
|
||||
#else
|
||||
x = ((x & 0xaaaaaaaa) >> 1 | (x & 0x55555555) << 1);
|
||||
x = ((x & 0xcccccccc) >> 2 | (x & 0x33333333) << 2);
|
||||
x = ((x & 0xf0f0f0f0) >> 4 | (x & 0x0f0f0f0f) << 4);
|
||||
x = ((x & 0xff00ff00) >> 8 | (x & 0x00ff00ff) << 8);
|
||||
return x << 16 | x >> 16;
|
||||
#endif
|
||||
}
|
||||
static ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
return (uint64_t)_rbit_(x) << 32 | _rbit_(x >> 32);
|
||||
#else
|
||||
x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
|
||||
x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
|
||||
x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
|
||||
x = (x & 0xff00ff00ff00ff00) >> 8 | (x & 0x00ff00ff00ff00ff) << 8;
|
||||
x = (x & 0xffff0000ffff0000) >> 16 | (x & 0x0000ffff0000ffff) << 16;
|
||||
return x << 32 | x >> 32;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
static ALWAYS_INLINE __m128i mm_rbit_epi16(__m128i v) { return mm_rbit_epi8(mm_rev_epi16(v)); }
|
||||
static ALWAYS_INLINE __m128i mm_rbit_epi32(__m128i v) { return mm_rbit_epi8(mm_rev_epi32(v)); }
|
||||
static ALWAYS_INLINE __m128i mm_rbit_epi64(__m128i v) { return mm_rbit_epi8(mm_rev_epi64(v)); }
|
||||
//static ALWAYS_INLINE __m128i mm_rbit_si128(__m128i v) { return mm_rbit_epi8(mm_rev_si128(v)); }
|
||||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_epi8(__m256i v) {
|
||||
__m256i fv = _mm256_setr_epi8(0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15, 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15), cv0f_8 = _mm256_set1_epi8(0xf);
|
||||
__m256i lv = _mm256_shuffle_epi8(fv,_mm256_and_si256( v, cv0f_8));
|
||||
__m256i hv = _mm256_shuffle_epi8(fv,_mm256_and_si256(_mm256_srli_epi64(v, 4), cv0f_8));
|
||||
return _mm256_or_si256(_mm256_slli_epi64(lv,4), hv);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_rev_epi16(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rev_epi32(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, 11,10, 9, 8,15,14,13,12, 3, 2, 1, 0, 7, 6, 5, 4, 11,10, 9, 8,15,14,13,12)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rev_epi64(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rev_si128(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_epi16(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi16(v)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_epi32(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi32(v)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_epi64(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi64(v)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi8(mm256_rev_si128(v)); }
|
||||
#endif
|
||||
|
||||
// ------------------ bitio genaral macros ---------------------------
|
||||
#ifdef __AVX2__
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#define bzhi_u32(_u_, _b_) _bzhi_u32(_u_, _b_)
|
||||
|
||||
#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
|
||||
#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||
#else
|
||||
#define bzhi_u64(_u_, _b_) _bzhi_u64(_u_, _b_)
|
||||
#endif
|
||||
#else
|
||||
#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||
#define bzhi_u32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
|
||||
#endif
|
||||
|
||||
#define BZHI64(_u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1)))
|
||||
#define BZHI32(_u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1)))
|
||||
|
||||
#define bitdef( _bw_,_br_) uint64_t _bw_=0; unsigned _br_=0
|
||||
#define bitini( _bw_,_br_) _bw_=_br_=0
|
||||
//-- bitput ---------
|
||||
#define bitput( _bw_,_br_,_nb_,_x_) (_bw_) += (uint64_t)(_x_) << (_br_), (_br_) += (_nb_)
|
||||
#define bitenorm( _bw_,_br_,_op_) ctou64(_op_) = _bw_; _op_ += ((_br_)>>3), (_bw_) >>=((_br_)&~7), (_br_) &= 7
|
||||
#define bitflush( _bw_,_br_,_op_) ctou64(_op_) = _bw_, _op_ += ((_br_)+7)>>3, _bw_=_br_=0
|
||||
//-- bitget ---------
|
||||
#define bitbw( _bw_,_br_) ((_bw_)>>(_br_))
|
||||
#define bitrmv( _bw_,_br_,_nb_) (_br_) += _nb_
|
||||
|
||||
#define bitdnorm( _bw_,_br_,_ip_) _bw_ = ctou64((_ip_) += ((_br_)>>3)), (_br_) &= 7
|
||||
#define bitalign( _bw_,_br_,_ip_) ((_ip_) += ((_br_)+7)>>3)
|
||||
|
||||
#define BITPEEK32( _bw_,_br_,_nb_) BZHI32(bitbw(_bw_,_br_), _nb_)
|
||||
#define BITGET32( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK32(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
|
||||
#define BITPEEK64( _bw_,_br_,_nb_) BZHI64(bitbw(_bw_,_br_), _nb_)
|
||||
#define BITGET64( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK64(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
|
||||
|
||||
#define bitpeek57( _bw_,_br_,_nb_) bzhi_u64(bitbw(_bw_,_br_), _nb_)
|
||||
#define bitget57( _bw_,_br_,_nb_,_x_) _x_ = bitpeek57(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
|
||||
#define bitpeek31( _bw_,_br_,_nb_) bzhi_u32(bitbw(_bw_,_br_), _nb_)
|
||||
#define bitget31( _bw_,_br_,_nb_,_x_) _x_ = bitpeek31(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
|
||||
//------------------ templates -----------------------------------
|
||||
#define bitput8( _bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
|
||||
#define bitput16(_bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
|
||||
#define bitput32(_bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
|
||||
#define bitput64(_bw_,_br_,_b_,_x_,_op_) if((_b_)>45) { bitput(_bw_,_br_,(_b_)-32, (_x_)>>32); bitenorm(_bw_,_br_,_op_); bitput(_bw_,_br_,32,(unsigned)(_x_)); } else bitput(_bw_,_br_,_b_,_x_)
|
||||
|
||||
#define bitget8( _bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_)
|
||||
#define bitget16(_bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_)
|
||||
#define bitget32(_bw_,_br_,_b_,_x_,_ip_) bitget57(_bw_,_br_,_b_,_x_)
|
||||
#define bitget64(_bw_,_br_,_b_,_x_,_ip_) if((_b_)>45) { unsigned _v; bitget57(_bw_,_br_,(_b_)-32,_x_); bitdnorm(_bw_,_br_,_ip_); BITGET64(_bw_,_br_,32,_v); _x_ = _x_<<32|_v; } else bitget57(_bw_,_br_,_b_,_x_)
|
||||
#endif
|
||||
|
||||
//---------- max. bit length + transform for sorted/unsorted arrays, delta,delta 1, delta > 1, zigzag, zigzag of delta, xor, FOR,----------------
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
//------ ORed array, used to determine the maximum bit length of the elements in an unsorted integer array ---------------------
|
||||
uint8_t bit8( uint8_t *in, unsigned n, uint8_t *px);
|
||||
uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px);
|
||||
uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px);
|
||||
uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px);
|
||||
|
||||
//-------------- delta = 0: Sorted integer array w/ mindelta = 0 ----------------------------------------------
|
||||
//-- ORed array, maximum bit length of the non decreasing integer array. out[i] = in[i] - in[i-1]
|
||||
uint8_t bitd8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//-- in-place reverse delta 0
|
||||
void bitddec8( uint8_t *p, unsigned n, uint8_t start); // non decreasing (out[i] = in[i] - in[i-1])
|
||||
void bitddec16( uint16_t *p, unsigned n, uint16_t start);
|
||||
void bitddec32( uint32_t *p, unsigned n, uint32_t start);
|
||||
void bitddec64( uint64_t *p, unsigned n, uint64_t start);
|
||||
|
||||
//-- vectorized fast delta4 one: out[0] = in[4]-in[0], out[1]=in[5]-in[1], out[2]=in[6]-in[2], out[3]=in[7]-in[3],...
|
||||
uint16_t bits128v16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bits128v32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
|
||||
//------------- delta = 1: Sorted integer array w/ mindelta = 1 ---------------------------------------------
|
||||
//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1
|
||||
uint8_t bitd18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//-- in-place reverse delta one
|
||||
void bitd1dec8( uint8_t *p, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
|
||||
void bitd1dec16( uint16_t *p, unsigned n, uint16_t start);
|
||||
void bitd1dec32( uint32_t *p, unsigned n, uint32_t start);
|
||||
void bitd1dec64( uint64_t *p, unsigned n, uint64_t start);
|
||||
|
||||
//------------- delta > 1: Sorted integer array w/ mindelta > 1 ---------------------------------------------
|
||||
//-- ORed array, for max. bit length get min. delta ()
|
||||
uint8_t bitdi8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitdi16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitdi32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitdi64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
//-- transform sorted integer array to delta array: out[i] = in[i] - in[i-1] - mindelta
|
||||
uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
|
||||
uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
|
||||
uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
|
||||
uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
|
||||
//-- in-place reverse delta
|
||||
void bitdidec8( uint8_t *in, unsigned n, uint8_t start, uint8_t mindelta);
|
||||
void bitdidec16(uint16_t *in, unsigned n, uint16_t start, uint16_t mindelta);
|
||||
void bitdidec32(uint32_t *in, unsigned n, uint32_t start, uint32_t mindelta);
|
||||
void bitdidec64(uint64_t *in, unsigned n, uint64_t start, uint64_t mindelta);
|
||||
|
||||
//------------- FOR : array bit length: ---------------------------------------------------------------------
|
||||
//------ ORed array, for max. bit length of the non decreasing integer array. out[i] = in[i] - start
|
||||
uint8_t bitf8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitf16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitf32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitf64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//------ ORed array, for max. bit length of the non strictly decreasing integer array out[i] = in[i] - 1 - start
|
||||
uint8_t bitf18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitf116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitf132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitf164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//------ ORed array, for max. bit length for usorted array
|
||||
uint8_t bitfm8( uint8_t *in, unsigned n, uint8_t *px, uint8_t *pmin); // unsorted
|
||||
uint16_t bitfm16(uint16_t *in, unsigned n, uint16_t *px, uint16_t *pmin);
|
||||
uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t *px, uint32_t *pmin);
|
||||
uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t *px, uint64_t *pmin);
|
||||
|
||||
//------------- Zigzag encoding for unsorted integer lists: out[i] = in[i] - in[i-1] ------------------------
|
||||
//-- ORed array, to get maximum zigzag bit length integer array
|
||||
uint8_t bitz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
//-- Zigzag transform
|
||||
uint8_t bitzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
|
||||
uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
|
||||
uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
|
||||
uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
|
||||
//-- in-place zigzag reverse transform
|
||||
void bitzdec8( uint8_t *in, unsigned n, uint8_t start);
|
||||
void bitzdec16( uint16_t *in, unsigned n, uint16_t start);
|
||||
void bitzdec32( uint32_t *in, unsigned n, uint32_t start);
|
||||
void bitzdec64( uint64_t *in, unsigned n, uint64_t start);
|
||||
|
||||
//------------- Zigzag of zigzag/delta : unsorted/sorted integer array ----------------------------------------------------
|
||||
//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1
|
||||
uint8_t bitzz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitzz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitzz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitzz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
uint8_t bitzzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
|
||||
uint16_t bitzzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
|
||||
uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
|
||||
uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
|
||||
|
||||
//-- in-place reverse zigzag of delta (encoded w/ bitdiencNN and parameter mindelta = 1)
|
||||
void bitzzdec8( uint8_t *in, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
|
||||
void bitzzdec16( uint16_t *in, unsigned n, uint16_t start);
|
||||
void bitzzdec32( uint32_t *in, unsigned n, uint32_t start);
|
||||
void bitzzdec64( uint64_t *in, unsigned n, uint64_t start);
|
||||
|
||||
//------------- XOR encoding for unsorted integer lists: out[i] = in[i] - in[i-1] -------------
|
||||
//-- ORed array, to get maximum zigzag bit length integer array
|
||||
uint8_t bitx8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitx16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitx32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitx64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//-- XOR transform
|
||||
uint8_t bitxenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start);
|
||||
uint16_t bitxenc16( uint16_t *in, unsigned n, uint16_t *out, uint16_t start);
|
||||
uint32_t bitxenc32( uint32_t *in, unsigned n, uint32_t *out, uint32_t start);
|
||||
uint64_t bitxenc64( uint64_t *in, unsigned n, uint64_t *out, uint64_t start);
|
||||
|
||||
//-- XOR in-place reverse transform
|
||||
void bitxdec8( uint8_t *p, unsigned n, uint8_t start);
|
||||
void bitxdec16( uint16_t *p, unsigned n, uint16_t start);
|
||||
void bitxdec32( uint32_t *p, unsigned n, uint32_t start);
|
||||
void bitxdec64( uint64_t *p, unsigned n, uint64_t start);
|
||||
|
||||
//------- Lossy floating point transform: pad the trailing mantissa bits with zeros according to the error e (ex. e=0.00001)
|
||||
#ifdef USE_FLOAT16
|
||||
void fppad16(_Float16 *in, size_t n, _Float16 *out, float e);
|
||||
#endif
|
||||
void fppad32(float *in, size_t n, float *out, float e);
|
||||
void fppad64(double *in, size_t n, double *out, double e);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//---- Floating point to Integer decomposition ---------------------------------
|
||||
// seeeeeeee21098765432109876543210 (s:sign, e:exponent, 0-9:mantissa)
|
||||
#ifdef BITUTIL_IN
|
||||
#define MANTF32 23
|
||||
#define MANTF64 52
|
||||
|
||||
#define BITFENC(_u_, _sgn_, _expo_, _mant_, _mantbits_, _one_) _sgn_ = _u_ >> (sizeof(_u_)*8-1); _expo_ = ((_u_ >> (_mantbits_)) & ( (_one_<<(sizeof(_u_)*8 - 1 - _mantbits_)) -1)); _mant_ = _u_ & ((_one_<<_mantbits_)-1);
|
||||
#define BITFDEC( _sgn_, _expo_, _mant_, _u_, _mantbits_) _u_ = (_sgn_) << (sizeof(_u_)*8-1) | (_expo_) << _mantbits_ | (_mant_)
|
||||
#endif
|
||||
282
conf.h
282
conf.h
@ -1,282 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
|
||||
// conf.h - config & common
|
||||
#ifndef CONF_H
|
||||
#define CONF_H
|
||||
//------------------------- Compiler ------------------------------------------
|
||||
#if defined(__GNUC__)
|
||||
#include <stdint.h>
|
||||
#define ALIGNED(t,v,n) t v __attribute__ ((aligned (n)))
|
||||
#define ALWAYS_INLINE inline __attribute__((always_inline))
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#define _PACKED __attribute__ ((packed))
|
||||
#define likely(x) __builtin_expect((x),1)
|
||||
#define unlikely(x) __builtin_expect((x),0)
|
||||
|
||||
#define popcnt32(_x_) __builtin_popcount(_x_)
|
||||
#define popcnt64(_x_) __builtin_popcountll(_x_)
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
//x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
|
||||
// x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
|
||||
static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
|
||||
static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
|
||||
static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; }
|
||||
static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
|
||||
|
||||
static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
|
||||
static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
|
||||
static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
|
||||
static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
|
||||
#else
|
||||
static inline int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); }
|
||||
static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; }
|
||||
static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
|
||||
|
||||
static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); }
|
||||
static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); }
|
||||
static inline unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); }
|
||||
static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); }
|
||||
#endif
|
||||
|
||||
#define ctz64(_x_) __builtin_ctzll(_x_)
|
||||
#define ctz32(_x_) __builtin_ctz(_x_) // 0:32 ctz32(1<<a) = a (a=1..31)
|
||||
#define clz64(_x_) __builtin_clzll(_x_)
|
||||
#define clz32(_x_) __builtin_clz(_x_) // 00000000 00000000 00000000 01000000 = 25
|
||||
|
||||
//#define bswap8(x) (x)
|
||||
#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
|
||||
#define bswap16(x) __builtin_bswap16(x)
|
||||
#else
|
||||
static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); }
|
||||
#endif
|
||||
#define bswap32(x) __builtin_bswap32(x)
|
||||
#define bswap64(x) __builtin_bswap64(x)
|
||||
|
||||
#elif _MSC_VER //----------------------------------------------------
|
||||
#include <windows.h>
|
||||
#include <intrin.h>
|
||||
#if _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#define __builtin_prefetch(x,a)
|
||||
#define inline __inline
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA)
|
||||
#endif
|
||||
|
||||
#define ALIGNED(t,v,n) __declspec(align(n)) t v
|
||||
#define ALWAYS_INLINE __forceinline
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#define THREADLOCAL __declspec(thread)
|
||||
#define likely(x) (x)
|
||||
#define unlikely(x) (x)
|
||||
|
||||
static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; }
|
||||
static inline int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; }
|
||||
static inline int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; }
|
||||
static inline int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; }
|
||||
#if !defined(_M_ARM64) && !defined(_M_X64)
|
||||
static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
|
||||
unsigned long x0 = (unsigned long)x, top, bottom; _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0);
|
||||
*ret = x0 ? bottom : 32 + top; return x != 0;
|
||||
}
|
||||
static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
|
||||
unsigned long x1 = (unsigned long)(x >> 32), top, bottom; _BitScanReverse(&top, x1); _BitScanReverse(&bottom, (unsigned long)x);
|
||||
*ret = x1 ? top + 32 : bottom; return x != 0;
|
||||
}
|
||||
#endif
|
||||
static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; }
|
||||
static inline int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; }
|
||||
static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; }
|
||||
|
||||
#define rol32(x,s) _lrotl(x, s)
|
||||
#define ror32(x,s) _lrotr(x, s)
|
||||
|
||||
#define bswap16(x) _byteswap_ushort(x)
|
||||
#define bswap32(x) _byteswap_ulong(x)
|
||||
#define bswap64(x) _byteswap_uint64(x)
|
||||
|
||||
#define popcnt32(x) __popcnt(x)
|
||||
#ifdef _WIN64
|
||||
#define popcnt64(x) __popcnt64(x)
|
||||
#else
|
||||
#define popcnt64(x) (popcnt32(x) + popcnt32(x>>32))
|
||||
#endif
|
||||
|
||||
#define sleep(x) Sleep(x/1000)
|
||||
#define fseeko _fseeki64
|
||||
#define ftello _ftelli64
|
||||
#define strcasecmp _stricmp
|
||||
#define strncasecmp _strnicmp
|
||||
#define strtoull _strtoui64
|
||||
static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
|
||||
#endif
|
||||
|
||||
#define __bsr8(_x_) __bsr32(_x_)
|
||||
#define __bsr16(_x_) __bsr32(_x_)
|
||||
#define bsr8(_x_) bsr32(_x_)
|
||||
#define bsr16(_x_) bsr32(_x_)
|
||||
#define ctz8(_x_) ctz32(_x_)
|
||||
#define ctz16(_x_) ctz32(_x_)
|
||||
#define clz8(_x_) (clz32(_x_)-24)
|
||||
#define clz16(_x_) (clz32(_x_)-16)
|
||||
|
||||
#define popcnt8(x) popcnt32(x)
|
||||
#define popcnt16(x) popcnt32(x)
|
||||
|
||||
//--------------- Unaligned memory access -------------------------------------
|
||||
#ifdef UA_MEMCPY
|
||||
#include <string.h>
|
||||
static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
|
||||
static inline void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); }
|
||||
#elif defined(__i386__) || defined(__x86_64__) || \
|
||||
defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\
|
||||
defined(__powerpc__) || defined(__s390__) ||\
|
||||
defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\
|
||||
defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \
|
||||
defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \
|
||||
defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
|
||||
#define ctou16(_cp_) (*(unsigned short *)(_cp_))
|
||||
#define ctou32(_cp_) (*(unsigned *)(_cp_))
|
||||
#define ctof32(_cp_) (*(float *)(_cp_))
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER)
|
||||
#define ctou64(_cp_) (*(uint64_t *)(_cp_))
|
||||
#define ctof64(_cp_) (*(double *)(_cp_))
|
||||
#elif defined(__ARM_FEATURE_UNALIGNED)
|
||||
struct _PACKED longu { uint64_t l; };
|
||||
struct _PACKED doubleu { double d; };
|
||||
#define ctou64(_cp_) ((struct longu *)(_cp_))->l
|
||||
#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
|
||||
#endif
|
||||
|
||||
#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__)
|
||||
struct _PACKED shortu { unsigned short s; };
|
||||
struct _PACKED unsignedu { unsigned u; };
|
||||
struct _PACKED longu { uint64_t l; };
|
||||
struct _PACKED floatu { float f; };
|
||||
struct _PACKED doubleu { double d; };
|
||||
|
||||
#define ctou16(_cp_) ((struct shortu *)(_cp_))->s
|
||||
#define ctou32(_cp_) ((struct unsignedu *)(_cp_))->u
|
||||
#define ctou64(_cp_) ((struct longu *)(_cp_))->l
|
||||
#define ctof32(_cp_) ((struct floatu *)(_cp_))->f
|
||||
#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
|
||||
#else
|
||||
#error "unknown cpu"
|
||||
#endif
|
||||
|
||||
#define ctou24(_cp_) (ctou32(_cp_) & 0xffffff)
|
||||
#define ctou48(_cp_) (ctou64(_cp_) & 0xffffffffffffull)
|
||||
#define ctou8(_cp_) (*(_cp_))
|
||||
//--------------------- wordsize ----------------------------------------------
|
||||
#if defined(__64BIT__) || defined(_LP64) || defined(__LP64__) || defined(_WIN64) ||\
|
||||
defined(__x86_64__) || defined(_M_X64) ||\
|
||||
defined(__ia64) || defined(_M_IA64) ||\
|
||||
defined(__aarch64__) ||\
|
||||
defined(__mips64) ||\
|
||||
defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\
|
||||
defined(__s390x__)
|
||||
#define __WORDSIZE 64
|
||||
#else
|
||||
#define __WORDSIZE 32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//---------------------misc ---------------------------------------------------
|
||||
#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // _b_ < 64
|
||||
#define BZHI32F(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) // _b_ < 32
|
||||
#define BZHI64( _u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1))) // Constant
|
||||
#define BZHI32( _u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1)))
|
||||
#define BZHI16( _u_, _b_) BZHI32(_u_, _b_)
|
||||
#define BZHI8( _u_, _b_) BZHI32(_u_, _b_)
|
||||
|
||||
#ifdef __AVX2__
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
|
||||
|
||||
#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
|
||||
#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||
#else
|
||||
#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
|
||||
#endif
|
||||
#else
|
||||
#define bzhi_u64(_u_, _b_) BZHI64(_u_, _b_)
|
||||
#define bzhi_u32(_u_, _b_) BZHI32(_u_, _b_)
|
||||
#endif
|
||||
|
||||
#define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1))
|
||||
#define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1)))
|
||||
|
||||
#define TEMPLATE2_(_x_, _y_) _x_##_y_
|
||||
#define TEMPLATE2(_x_, _y_) TEMPLATE2_(_x_,_y_)
|
||||
|
||||
#define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_
|
||||
#define TEMPLATE3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_)
|
||||
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
|
||||
|
||||
#define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_)))
|
||||
|
||||
//--- NDEBUG -------
|
||||
#include <stdio.h>
|
||||
#ifdef _MSC_VER
|
||||
#ifdef NDEBUG
|
||||
#define AS(expr, fmt, ...)
|
||||
#define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
|
||||
#define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
|
||||
#else
|
||||
#define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
|
||||
#define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
|
||||
#define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
|
||||
#endif
|
||||
#else
|
||||
#ifdef NDEBUG
|
||||
#define AS(expr, fmt,args...)
|
||||
#define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
|
||||
#define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
|
||||
#else
|
||||
#define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
|
||||
#define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
|
||||
#define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
|
||||
#endif
|
||||
#endif
|
||||
61
eliasfano.h
61
eliasfano.h
@ -1,61 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// eliasfano.h - "Integer Compression" Elias Fano c/c++ header
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
// compress/decompress integer array with n values to the buffer out. Return value = end of output/input buffer
|
||||
unsigned char *efanoenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efanoenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *efanodec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *efanodec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *efano1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efano1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *efano1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *efano1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *efanoenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efanodec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *efano1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efano1dec128v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *efanoenc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efanodec256v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *efano1enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efano1dec256v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
125
fp.h
125
fp.h
@ -1,125 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Floating Point + Integer Compression"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
// ---------- TurboPFor Zigzag of delta (=delta of delta + zigzag encoding) (TurboPFor)
|
||||
size_t p4nzzenc128v8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t p4nzzdec128v8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t p4nzzenc128v16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t p4nzzdec128v16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t p4nzzenc128v32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t p4nzzdec128v32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t p4nzzenc128v64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t p4nzzdec128v64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- Zigzag (bit/io) -------------------------------------------------------
|
||||
size_t bvzenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t bvzdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t bvzenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t bvzdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t bvzenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t bvzdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t bvzenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t bvzdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
//----------- Zigzag of delta (bit/io) ---------------------------------------------
|
||||
size_t bvzzenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t bvzzdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t bvzzenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t bvzzdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t bvzzenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t bvzzdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t bvzzenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t bvzzdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboGorilla : Improved gorilla style + RLE (bit/io) ------------------
|
||||
size_t fpgenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fpgdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fpgenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fpgdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fpgenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fpgdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fpgenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fpgdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboFloat XOR : Last value predictor (TurboPFor) ---------------------
|
||||
size_t fpxenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fpxdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fpxenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fpxdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fpxenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fpxdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fpxenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fpxdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboFloat FCM: Finite Context Method Predictor (TurboPFor) -----------
|
||||
size_t fpfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fpfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fpfcmenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fpfcmdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fpfcmenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fpfcmdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fpfcmenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fpfcmdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboFloat DFCM: Differential Finite Context Method Predictor (TurboPFor)
|
||||
size_t fpdfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fpdfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fpdfcmenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fpdfcmdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fpdfcmenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fpdfcmdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fpdfcmenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fpdfcmdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboFloat 2D DFCM: Differential Finite Context Method Predictor -----
|
||||
size_t fp2dfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fp2dfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fp2dfcmenc16(uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fp2dfcmdec16(unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fp2dfcmenc32(uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fp2dfcmdec32(unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fp2dfcmenc64(uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fp2dfcmdec64(unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
/*/-------------- delta (=zigzag). Same as p4zenc ------------------------------------
|
||||
size_t fppenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fppdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fppenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fppdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fppenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fppdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fppenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fppdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
355
sse_neon.h
355
sse_neon.h
@ -1,355 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2021
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// Intel SSE to ARM NEON optimized for maximum speed (and compatibility gcc/clang) with possible minor changes to the source code
|
||||
|
||||
#ifndef _SSE_NEON_H_
|
||||
#define _SSE_NEON_H_
|
||||
#include "conf.h"
|
||||
|
||||
#ifdef __ARM_NEON //------------------------------------------------------------------------------------------------------------------
|
||||
#include <arm_neon.h>
|
||||
#define __m128i uint32x4_t // int32x4_t can also be used
|
||||
#define __m128 float32x4_t
|
||||
|
||||
//#define USE_MACROS
|
||||
#define uint8x16_to_8x8x2(_u_) ((uint8x8x2_t) { vget_low_u8(_u_), vget_high_u8(_u_) })
|
||||
|
||||
#ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ----------------------------------------------------------
|
||||
#define _mm_set_epi8(u15,u14,u13,u12,\
|
||||
u11,u10, u9, u8,\
|
||||
u7,u6,u5,u4,\
|
||||
u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (uint32x4_t)vld1q_u8( _u);})
|
||||
#define _mm_set_epi16( u7,u6,u5,u4,\
|
||||
u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (uint32x4_t)vld1q_u16(_u);})
|
||||
//#define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; vld1q_u32(_u);})
|
||||
//#define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (uint32x4_t)vld1q_u64(_u);})
|
||||
#define _mm_set_epi32(u3, u2, u1, u0) vcombine_u32(vcreate_u32((uint64_t)u1 << 32 | u0), vcreate_u32((uint64_t)u3 << 32 | u2))
|
||||
#define _mm_set_epi64x(u1, u0) (__m128i)vcombine_u64(vcreate_u64(u0), vcreate_u64(u1))
|
||||
|
||||
#else
|
||||
static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8,
|
||||
uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4,
|
||||
uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) {
|
||||
uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (uint32x4_t)vld1q_u8( u); }
|
||||
static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4,
|
||||
uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (uint32x4_t)vld1q_u16(u); }
|
||||
static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return vld1q_u32(u); }
|
||||
static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (uint32x4_t)vld1q_u64(u); }
|
||||
#endif
|
||||
|
||||
#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16( u0,u1,u2,u3,u4,u5,u6,u7)
|
||||
#define _mm_setr_epi32(u3,u2,u1,u0) _mm_set_epi32( u0,u1,u2,u3)
|
||||
#define _mm_setr_epi64x(u1,u0) _mm_set_epi64x(u0,u0)
|
||||
|
||||
#define _mm_set1_epi8( _u8_ ) (__m128i)vdupq_n_u8( _u8_ )
|
||||
#define _mm_set1_epi16( _u16_) (__m128i)vdupq_n_u16(_u16_)
|
||||
#define _mm_set1_epi32( _u32_) vdupq_n_u32(_u32_)
|
||||
#define _mm_set1_epi64x(_u64_) (__m128i)vdupq_n_u64(_u64_)
|
||||
#define _mm_setzero_si128() vdupq_n_u32( 0 )
|
||||
|
||||
#define _mm_cvtss_f32(_u_) vgetq_lane_f32((float32x4_t)(_u_), 0)
|
||||
#define _mm_setzero_ps() (__m128)vdupq_n_f32(0)
|
||||
#define _mm_set1_ps(_f32_) (__m128)vdupq_n_f32(_f32_)
|
||||
//---------------------------------------------- Arithmetic -----------------------------------------------------------------------
|
||||
#define _mm_add_epi8( _u_,_v_) (__m128i)vaddq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
|
||||
#define _mm_add_epi16( _u_,_v_) (__m128i)vaddq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
|
||||
#define _mm_add_epi32( _u_,_v_) vaddq_u32( _u_, _v_ )
|
||||
#define _mm_sub_epi8( _u_,_v_) (__m128i)vsubq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
|
||||
#define _mm_sub_epi16( _u_,_v_) (__m128i)vsubq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
|
||||
#define _mm_sub_epi32( _u_,_v_) (__m128i)vsubq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_))
|
||||
#define _mm_subs_epu8( _u_,_v_) (__m128i)vqsubq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
|
||||
|
||||
#define _mm_mullo_epi16(_u_,_v_) (__m128i)vmulq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
|
||||
#define _mm_mullo_epi32(_u_,_v_) (__m128i)vmulq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
|
||||
#define mm_mullo_epu32(_u_,_v_) vmulq_u32(_u_,_v_)
|
||||
|
||||
#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)vqdmulhq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) //only for small values??
|
||||
// High 16 bits of each signed 16x16->32 lane product (SSE2 _mm_mulhi_epi16 semantics).
static ALWAYS_INLINE __m128i _mm_mulhi_epi16(__m128i u, __m128i v) {
  int32x4_t prod_lo = vmull_s16(vget_low_s16( (int16x8_t)(u)), vget_low_s16( (int16x8_t)(v)));
  int32x4_t prod_hi = vmull_s16(vget_high_s16((int16x8_t)(u)), vget_high_s16((int16x8_t)(v)));
  // De-interleave the 32-bit products as 16-bit halves: val[1] holds the high halves.
  uint16x8x2_t parts = vuzpq_u16((uint16x8_t)(prod_lo), (uint16x8_t)(prod_hi));
  return (__m128i)(vreinterpretq_s32_u16(parts.val[1]));
}
|
||||
#define _mm_mul_epu32( _u_,_v_) (__m128i)vmull_u32(vget_low_u32(_u_),vget_low_u32(_v_))
|
||||
#define _mm_adds_epu16( _u_,_v_) (__m128i)vqaddq_u16((uint16x8_t)(_u_),(uint16x8_t)(_v_))
|
||||
// Multiply signed 16-bit lanes and horizontally add adjacent 32-bit products
// (SSE2 _mm_madd_epi16 semantics).
static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) {
  int32x4_t prod_lo = vmull_s16(vget_low_s16( (int16x8_t)u), vget_low_s16( (int16x8_t)v));
  int32x4_t prod_hi = vmull_s16(vget_high_s16((int16x8_t)u), vget_high_s16((int16x8_t)v));
  int32x2_t sum_lo  = vpadd_s32(vget_low_s32(prod_lo), vget_high_s32(prod_lo)); // pairwise add
  int32x2_t sum_hi  = vpadd_s32(vget_low_s32(prod_hi), vget_high_s32(prod_hi));
  return (__m128i)vcombine_s32(sum_lo, sum_hi);
}
|
||||
//---------------------------------------------- Special math functions -----------------------------------------------------------
|
||||
#define _mm_min_epu8( _u_,_v_) (__m128i)vminq_u8( (uint8x16_t)(_u_), (uint8x16_t)(_v_))
|
||||
#define _mm_min_epu16( _u_,_v_) (__m128i)vminq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
|
||||
#define _mm_min_epi16( _u_,_v_) (__m128i)vminq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
|
||||
//---------------------------------------------- Logical --------------------------------------------------------------------------
|
||||
#define mm_testnz_epu32(_u_) vmaxvq_u32(_u_) //vaddvq_u32(_u_)
|
||||
#define mm_testnz_epu8( _u_) vmaxv_u8(_u_)
|
||||
#define _mm_or_si128( _u_,_v_) (__m128i)vorrq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
|
||||
#define _mm_and_si128( _u_,_v_) (__m128i)vandq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
|
||||
#define _mm_xor_si128( _u_,_v_) (__m128i)veorq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
|
||||
//---------------------------------------------- Shift ----------------------------------------------------------------------------
|
||||
#define mm_slli_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)> 7?vdupq_n_u8( 0):vshlq_n_u8( (uint8x16_t)(_u_), (_c_)))) // parameter c MUST be a constant / vshlq_n_u8: __constrange(0-(N-1))
|
||||
#define mm_slli_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u16(0):vshlq_n_u16((uint16x8_t)(_u_), (_c_))))
|
||||
#define mm_slli_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>31?vdupq_n_u32(0):vshlq_n_u32((uint32x4_t)(_u_), (_c_))))
|
||||
#define mm_slli_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>63?vdupq_n_u64(0):vshlq_n_u64((uint64x2_t)(_u_), (_c_))))
|
||||
#define _mm_slli_si128( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u8( 0):vextq_u8(vdupq_n_u8(0), (uint8x16_t)(_u_), 16-(_c_) )) ) // vextq_u8: __constrange(0-15)
|
||||
|
||||
// Logical shift right by a compile-time constant (vshrq_n_*: __constrange(1-N); c MUST be a constant).
// c<1 returns the input unchanged; c beyond the lane width yields zero.
#define mm_srli_epi8(  _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)> 7?vdupq_n_u8( 0):vshrq_n_u8( (uint8x16_t)(_u_), (_c_))))
#define mm_srli_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u16(0):vshrq_n_u16((uint16x8_t)(_u_), (_c_))))
#define mm_srli_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>31?vdupq_n_u32(0):vshrq_n_u32((uint32x4_t)(_u_), (_c_))))
#define mm_srli_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>63?vdupq_n_u64(0):vshrq_n_u64((uint64x2_t)(_u_), (_c_)))) // fix: was vshlq_n_u64 (a LEFT shift) — srli must shift right
#define _mm_srli_si128(_u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u8(0):vextq_u8((uint8x16_t)(_u_), vdupq_n_u8(0), (_c_) )) ) // whole-register byte shift; vextq_u8: __constrange(0-15)
|
||||
|
||||
#define mm_srai_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s8( (int8x16_t)(_u_), (_c_))) // c <= 8 (vshrq_n:1-N)
|
||||
#define mm_srai_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s16((int16x8_t)(_u_), (_c_))) // c <= 16
|
||||
#define mm_srai_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s32((int32x4_t)(_u_), (_c_))) // c <= 32
|
||||
#define mm_srai_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s64((int64x2_t)(_u_), (_c_))) // c <= 64
|
||||
|
||||
#define _mm_slli_epi8( _u_,_m_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vdupq_n_s8( (_m_))) // parameter c integer constant/variable
|
||||
#define _mm_slli_epi16( _u_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vdupq_n_s16( (_m_)))
|
||||
#define _mm_slli_epi32( _u_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vdupq_n_s32( (_m_)))
|
||||
#define _mm_slli_epi64( _u_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vdupq_n_s64( (_m_)))
|
||||
|
||||
#define _mm_srli_epi8( _u_,_m_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vdupq_n_s8( -(_m_)))
|
||||
#define _mm_srli_epi16( _u_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vdupq_n_s16(-(_m_)))
|
||||
#define _mm_srli_epi32( _u_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vdupq_n_s32(-(_m_)))
|
||||
#define _mm_srli_epi64( _u_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vdupq_n_s64(-(_m_)))
|
||||
|
||||
#define _mm_srai_epi8( _u_,_m_) (__m128i)vshlq_s8( (int8x16_t)(_u_), vdupq_n_s8( -(_m_)))
|
||||
#define _mm_srai_epi16( _u_,_m_) (__m128i)vshlq_s16((int16x8_t)(_u_), vdupq_n_s16(-(_m_)))
|
||||
#define _mm_srai_epi32( _u_,_m_) (__m128i)vshlq_s32((int32x4_t)(_u_), vdupq_n_s32(-(_m_)))
|
||||
#define _mm_srai_epi64( _u_,_m_) (__m128i)vshlq_s64((int64x2_t)(_u_), vdupq_n_s64(-(_m_)))
|
||||
|
||||
#define _mm_sll_epi8( _u_,_v_) (__m128i)vshlq_s8( (int8x16_t)(_u_), (int8x16_t)(_v_)) //_v_:all lanes equal
|
||||
#define _mm_sll_epi16( _u_,_v_) (__m128i)vshlq_s16( (int16x8_t)(_u_), (int16x8_t)(_v_))
|
||||
#define _mm_sll_epi32( _u_,_v_) (__m128i)vshlq_s32( (int32x4_t)(_u_), (int32x4_t)(_v_))
|
||||
#define _mm_sll_epi64( _u_,_v_) (__m128i)vshlq_s64( (int64x2_t)(_u_), (int64x2_t)(_v_))
|
||||
|
||||
#define _mm_srl_epi8( _u_,_v_) (__m128i)vshrq_s8( (int8x16_t)(_u_), (int8x16_t)(_v_))
|
||||
#define _mm_srl_epi16( _u_,_v_) (__m128i)vshrq_s16( (int16x8_t)(_u_), (int16x8_t)(_v_))
|
||||
#define _mm_srl_epi32( _u_,_v_) (__m128i)vshrq_s32( (int32x4_t)(_u_), (int32x4_t)(_v_))
|
||||
#define _mm_srl_epi64( _u_,_v_) (__m128i)vshrq_s64( (int64x2_t)(_u_), (int64x2_t)(_v_))
|
||||
|
||||
#define _mm_sllv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_)) //variable shift
|
||||
#define _mm_srlv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vnegq_s32((int32x4_t)(_v_)))
|
||||
//---------------------------------------------- Compare --------- true/false->1/0 (all bits set) ---------------------------------
|
||||
#define _mm_cmpeq_epi8( _u_,_v_) (__m128i)vceqq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
|
||||
#define _mm_cmpeq_epi16( _u_,_v_) (__m128i)vceqq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
|
||||
#define _mm_cmpeq_epi32( _u_,_v_) (__m128i)vceqq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
|
||||
|
||||
#define _mm_cmpgt_epi8( _u_,_v_) (__m128i)vcgtq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
|
||||
#define _mm_cmpgt_epi16( _u_,_v_) (__m128i)vcgtq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
|
||||
#define _mm_cmpgt_epi32( _u_,_v_) (__m128i)vcgtq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
|
||||
|
||||
#define _mm_cmpgt_epu16( _u_,_v_) (__m128i)vcgtq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
|
||||
#define mm_cmpgt_epu32( _u_,_v_) (__m128i)vcgtq_u32( _u_, _v_)
|
||||
//---------------------------------------------- Load -----------------------------------------------------------------------------
|
||||
#define _mm_loadl_epi64( _u64p_) (__m128i)vcombine_s32(vld1_s32((int32_t const *)(_u64p_)), vcreate_s32(0))
|
||||
#define mm_loadu_epi64p(_u64p_,_u_) (__m128i)vld1q_lane_u64((uint64_t *)(_u64p_), (uint64x2_t)(_u_), 0)
|
||||
#define _mm_loadu_si128( _ip_) vld1q_u32(_ip_)
|
||||
#define _mm_load_si128( _ip_) vld1q_u32(_ip_)
|
||||
|
||||
// Float loads. NEON vld1q has no alignment requirement, so aligned/unaligned variants are identical.
#define _mm_load_ps(   _ip_)   (__m128)vld1q_f32((float32_t *)(_ip_))
#define _mm_loadu_ps(  _ip_)   (__m128)vld1q_f32((float32_t *)(_ip_))
#define _mm_load1_ps(  _ip_)   (__m128)vld1q_dup_f32((float32_t *)(_ip_)) // fix: body referenced undefined _p_ instead of the parameter _ip_
#define _mm_loadl_pi(_u_,_ip_) (__m128)vcombine_f32((float32x2_t)vld1_f32((float32_t *)(_ip_)), (float32x2_t)vget_high_f32(_u_)) // fix: body referenced undefined _ip instead of _ip_
#define _mm_loadh_pi(_u_,_ip_) (__m128)vcombine_f32((float32x2_t)vget_low_f32(_u_), (float32x2_t)vld1_f32((const float *)(_ip_)))
|
||||
//---------------------------------------------- Store ----------------------------------------------------------------------------
|
||||
#define _mm_storel_epi64(_ip_,_u_) vst1q_lane_u64((uint64_t *)(_ip_), (uint64x2_t)(_u_), 0)
|
||||
#define _mm_storeu_si128(_ip_,_u_) vst1q_u32((__m128i *)(_ip_), _u_)
|
||||
|
||||
#define _mm_store_ps( _ip_,_u_) vst1q_f32( (float32_t *)(_ip_), (float32x4_t)(_u_))
|
||||
#define _mm_storeu_ps( _ip_,_u_) vst1q_f32( (float32_t *)(_ip_), (float32x4_t)(_u_))
|
||||
#define _mm_store_ss( _ip_,_u_) vst1q_lane_f32((float32_t *)(_ip_), (float32x4_t)(_u_), 0)
|
||||
//---------------------------------------------- Convert --------------------------------------------------------------------------
|
||||
#define mm_cvtsi64_si128p(_u64p_,_u_) mm_loadu_epi64p(_u64p_,_u_)
|
||||
#define _mm_cvtsi64_si128(_u_) (__m128i)vdupq_n_u64(_u_) //vld1q_s64(_u_)
|
||||
//---------------------------------------------- Reverse bits/bytes ---------------------------------------------------------------
|
||||
#define mm_rbit_epi8(_v_) (__m128i)vrbitq_u8( (uint8x16_t)(_v_)) // reverse bits
|
||||
#define mm_rev_epi16(_v_) vrev16q_u8((uint8x16_t)(_v_)) // reverse bytes
|
||||
#define mm_rev_epi32(_v_) vrev32q_u8((uint8x16_t)(_v_))
|
||||
#define mm_rev_epi64(_v_) vrev64q_u8((uint8x16_t)(_v_))
|
||||
//--------------------------------------------- Insert/extract --------------------------------------------------------------------
|
||||
#define mm_extract_epi32x(_u_,_u32_,_id_) vst1q_lane_u32((uint32_t *)&(_u32_), _u_, _id_)
|
||||
#define _mm_extract_epi64x(_u_,_u64_,_id_) vst1q_lane_u64((uint64_t *)&(_u64_), (uint64x2_t)(_u_), _id_)
|
||||
|
||||
#define _mm_extract_epi8( _u_, _id_) vgetq_lane_u8( (uint8x16_t)(_u_), _id_)
|
||||
#define _mm_extract_epi16(_u_, _id_) vgetq_lane_u16(_u_, _id_)
|
||||
#define _mm_extract_epi32(_u_, _id_) vgetq_lane_u32(_u_, _id_)
|
||||
#define mm_extract_epu32(_u_, _id_) vgetq_lane_u32(_u_, _id_)
|
||||
#define _mm_cvtsi128_si32(_u_) vgetq_lane_u32((uint32x4_t)(_u_),0)
|
||||
#define _mm_cvtsi128_si64(_u_) vgetq_lane_u64((uint64x2_t)(_u_),0)
|
||||
|
||||
#define _mm_insert_epu32p(_u_,_u32p_,_id_) vsetq_lane_u32(_u32p_, _u_, _id_)
|
||||
#define mm_insert_epi32p(_u_,_u32p_,_id_) vld1q_lane_u32(_u32p_, (uint32x4_t)(_u_), _id_)
|
||||
#define _mm_cvtsi32_si128(_x_) (__m128i)vsetq_lane_s32(_x_, vdupq_n_s32(0), 0)
|
||||
|
||||
#define _mm_blendv_epi8(_u_,_v_,_m_) vbslq_u32(_m_,_v_,_u_)
|
||||
//---------------------------------------------- Miscellaneous --------------------------------------------------------------------
|
||||
#define _mm_alignr_epi8(_u_,_v_,_m_) (__m128i)vextq_u8( (uint8x16_t)(_v_), (uint8x16_t)(_u_), _m_)
|
||||
#define _mm_packs_epi16( _u_,_v_) (__m128i)vcombine_s8( vqmovn_s16((int16x8_t)(_u_)), vqmovn_s16((int16x8_t)(_v_)))
|
||||
#define _mm_packs_epi32( _u_,_v_) (__m128i)vcombine_s16(vqmovn_s32((int32x4_t)(_u_)), vqmovn_s32((int32x4_t)(_v_)))
|
||||
|
||||
// Pack two 16-bit-lane vectors into one 8-bit-lane vector.
// fix: _mm_packs_epu16 passed uint16x8_t operands straight to vcombine_u8 (type mismatch);
// each half must first be narrowed with vqmovn_u16 (unsigned saturating narrow).
#define _mm_packs_epu16( _u_,_v_)  (__m128i)vcombine_u8(vqmovn_u16((uint16x8_t)(_u_)),  vqmovn_u16((uint16x8_t)(_v_)))
#define _mm_packus_epi16(_u_,_v_)  (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)(_u_)), vqmovun_s16((int16x8_t)(_v_))) // signed->unsigned saturating pack (SSE2 semantics)
|
||||
|
||||
// SSE2 _mm_movemask_epi8: gather the sign bit of each of the 16 bytes into a 16-bit mask.
static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
  // Per-byte bit weight, repeated for each 8-byte half.
  const uint8x16_t __attribute__ ((aligned (16))) bitsel = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7};
  uint8x16_t signs  = vandq_u8(vcltq_s8((int8x16_t)v, vdupq_n_s8(0)), bitsel);   // negative byte -> its bit weight, else 0
  uint8x16_t folded = (uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(signs)));   // horizontal add within each half
  return vgetq_lane_u8(folded, 8) << 8 | vgetq_lane_u8(folded, 0);               // combine the two half-masks
}
|
||||
//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff)
|
||||
#ifdef __aarch64__
|
||||
static ALWAYS_INLINE uint8_t mm_movemask_epi8s(uint8x8_t sv) { const uint8x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddv_u8( vand_u8( sv, m)); } // short only ARM
|
||||
//static ALWAYS_INLINE uint16_t mm_movemask_epu16(uint32x4_t v) { const uint16x8_t m = { 1, 1<<2, 1<<4, 1<<6, 1<<8, 1<<10, 1<<12, 1<<14}; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
|
||||
static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { const uint16x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
|
||||
static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { const uint32x4_t m = { 1, 1<<1, 1<<2, 1<<3 }; return vaddvq_u32(vandq_u32((uint32x4_t)v, m)); }
|
||||
static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { const uint64x2_t m = { 1, 1<<1 }; return vaddvq_u64(vandq_u64((uint64x2_t)v, m)); }
|
||||
#else
|
||||
static ALWAYS_INLINE uint32_t mm_movemask_epu32(uint32x4_t v) { const uint32x4_t mask = {1,2,4,8}, av = vandq_u32(v, mask), xv = vextq_u32(av, av, 2), ov = vorrq_u32(av, xv); return vgetq_lane_u32(vorrq_u32(ov, vextq_u32(ov, ov, 3)), 0); }
|
||||
#endif
|
||||
// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
|
||||
#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_))
|
||||
|
||||
#define _mm_shuffle_epi8(_u_, _v_) (__m128i)vqtbl1q_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
|
||||
// Broadcast 32-bit lane _m_ to all four lanes.
#if defined(__aarch64__)
#define mm_shuffle_nnnn_epi32(_u_,_m_) (__m128i)vdupq_laneq_u32(_u_, _m_)
#else
#define mm_shuffle_nnnn_epi32(_u_,_m_) (__m128i)vdupq_n_u32(vgetq_lane_u32(_u_, _m_)) // fix: closing parenthesis was missing, breaking every expansion
#endif
|
||||
|
||||
#ifdef USE_MACROS
|
||||
#define mm_shuffle_2031_epi32(_u_) ({ uint32x4_t _zv = (uint32x4_t)vrev64q_u32(_u_); uint32x2x2_t _zv = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zv.val[0], _zv.val[1]);})
|
||||
#define mm_shuffle_3120_epi32(_u_) ({ uint32x4_t _zv = _u_; _zv = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zv.val[0], _zv.val[1]);})
|
||||
#else
|
||||
static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) { uint32x4_t a = (uint32x4_t)vrev64q_u32(v); uint32x2x2_t z = vtrn_u32(vget_low_u32(a), vget_high_u32(a)); return vcombine_u32(z.val[0], z.val[1]);}
|
||||
static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) { uint32x2x2_t z = vtrn_u32(vget_low_u32(v), vget_high_u32(v)); return vcombine_u32(z.val[0], z.val[1]);}
|
||||
#endif
|
||||
|
||||
#if defined(USE_MACROS) || defined(__clang__)
|
||||
#define _mm_shuffle_epi32(_u_, _m_) ({ const uint32x4_t _av =_u_;\
|
||||
uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));\
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);\
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);\
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); _v;\
|
||||
})
|
||||
#define _mm_shuffle_epi32s(_u_, _m_) _mm_set_epi32(vgetq_lane_u32(_u_, ((_m_) ) & 0x3),\
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 2) & 0x3),\
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 4) & 0x3),\
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 6) & 0x3))
|
||||
#else
|
||||
static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _u_, const unsigned _m_) { const uint32x4_t _av =_u_;
|
||||
uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3);
|
||||
return _v;
|
||||
}
|
||||
static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _u_, const unsigned _m_) {
|
||||
return _mm_set_epi32(vgetq_lane_u32(_u_, ((_m_) ) & 0x3),
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 2) & 0x3),
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 4) & 0x3),
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 6) & 0x3));
|
||||
}
|
||||
#endif
|
||||
#ifdef USE_MACROS
|
||||
#define _mm_unpacklo_epi8( _u_,_v_) ({ uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpacklo_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpacklo_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpacklo_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_)))
|
||||
|
||||
#define _mm_unpackhi_epi8( _u_,_v_) ({ uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpackhi_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpackhi_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpackhi_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_)))
|
||||
#else
|
||||
static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}
|
||||
static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}
|
||||
static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]);}
|
||||
static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_))); }
|
||||
|
||||
static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]); }
|
||||
static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]); }
|
||||
static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]); }
|
||||
static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))); }
|
||||
#endif
|
||||
|
||||
#else //----------------- intel SSE2/SSSE3 ( wraper functions compatible with intel/arm; permits to have one source code version for arm+intel) --------------
|
||||
#define mm_movemask_epu32(_u_) _mm_movemask_ps(_mm_castsi128_ps(_u_))
|
||||
#define mm_movemask_epu16(_u_) _mm_movemask_epi8(_u_)
|
||||
#define mm_loadu_epi64p( _u64p_,_u_) _u_ = _mm_cvtsi64_si128(ctou64(_u64p_))
|
||||
|
||||
#define mm_extract_epu32( _u_, _id_) _mm_extract_epi32(_u_, _id_)
|
||||
#define mm_extract_epi32x(_u_,_u32_, _id_) _u32_ = _mm_extract_epi32(_u_, _id_)
|
||||
#define mm_extract_epi64x(_u_,_u64_, _id_) _u64_ = _mm_extract_epi64(_u_, _id_)
|
||||
#define mm_insert_epi32p( _u_,_u32p_,_c_) _mm_insert_epi32( _u_,ctou32(_u32p_),_c_)
|
||||
|
||||
#define mm_mullo_epu32( _u_,_v_) _mm_mullo_epi32(_u_,_v_)
|
||||
#define mm_cvtsi64_si128p(_u64p_,_u_) _u_ = _mm_cvtsi64_si128(ctou64(_u64p_))
|
||||
|
||||
#define mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000)) //__m128i cv80000000 = _mm_set1_epi32(0x80000000); must be declared
|
||||
#define mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000))
|
||||
#define _mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000)))
|
||||
#define _mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000)))
|
||||
|
||||
#define mm_shuffle_nnnn_epi32(_u_, _n_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(_n_,_n_,_n_,_n_))
|
||||
#define mm_shuffle_2031_epi32(_u_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(2,0,3,1))
|
||||
#define mm_shuffle_3120_epi32(_u_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(3,1,2,0))
|
||||
|
||||
#define _mm_slli_epi8(_u_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff << _m_), _mm_slli_epi32(_u_, _m_ ))
|
||||
#define _mm_srli_epi8(_u_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff >> _m_), _mm_srli_epi32(_u_, _m_ ))
|
||||
|
||||
#define mm_slli_epi8( _u_,_c_) _mm_slli_epi8( _u_,_c_) // parameter c MUST be a constant for compatibilty with the arm functions above
|
||||
#define mm_slli_epi16( _u_,_c_) _mm_slli_epi16(_u_,_c_)
|
||||
#define mm_slli_epi32( _u_,_c_) _mm_slli_epi32(_u_,_c_)
|
||||
#define mm_slli_epi64( _u_,_c_) _mm_slli_epi64(_u_,_c_)
|
||||
|
||||
#define mm_srli_epi8( _u_,_c_) _mm_srli_epi8( _u_,_c_)
|
||||
#define mm_srli_epi16( _u_,_c_) _mm_srli_epi16(_u_,_c_)
|
||||
#define mm_srli_epi32( _u_,_c_) _mm_srli_epi32(_u_,_c_)
|
||||
#define mm_srli_epi64( _u_,_c_) _mm_srli_epi64(_u_,_c_)
|
||||
|
||||
#define mm_srai_epi8( _u_,_c_) _mm_srai_epi8( _u_,_c_)
|
||||
#define mm_srai_epi16( _u_,_c_) _mm_srai_epi16(_u_,_c_)
|
||||
#define mm_srai_epi32( _u_,_c_) _mm_srai_epi32(_u_,_c_)
|
||||
#define mm_srai_epi64( _u_,_c_) _mm_srai_epi64(_u_,_c_)
|
||||
|
||||
#ifdef __SSSE3__
|
||||
static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) { // reverse bits in bytes
|
||||
__m128i fv = _mm_set_epi8(15, 7,11, 3,13, 5, 9, 1,14, 6,10, 2,12, 4, 8, 0), cv0f_8 = _mm_set1_epi8(0xf);
|
||||
__m128i lv = _mm_shuffle_epi8(fv,_mm_and_si128( v, cv0f_8));
|
||||
__m128i hv = _mm_shuffle_epi8(fv,_mm_and_si128( mm_srli_epi64(v, 4), cv0f_8));
|
||||
return _mm_or_si128( mm_slli_epi64(lv,4), hv);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m128i mm_rev_epi16(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // reverse vector bytes in uint??_t
|
||||
static ALWAYS_INLINE __m128i mm_rev_epi32(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3)); }
|
||||
static ALWAYS_INLINE __m128i mm_rev_epi64(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7)); }
|
||||
static ALWAYS_INLINE __m128i mm_rev_si128(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15)); }
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//==================================================================================================
// time_.h : parameter free high precision time/benchmark functions
//==================================================================================================
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// time_.h : parameter free high precision time/benchmark functions
|
||||
#include <time.h>
|
||||
#include <float.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#ifndef sleep
|
||||
#define sleep(n) Sleep((n) * 1000)
|
||||
#endif
|
||||
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef unsigned __int64 tm_t;
|
||||
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#include <unistd.h>
|
||||
#define Sleep(ms) usleep((ms) * 1000)
|
||||
|
||||
typedef struct timespec tm_t;
|
||||
#endif
|
||||
|
||||
#if defined (__i386__) || defined( __x86_64__ )
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h> // __rdtsc
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __corei7__
|
||||
#define RDTSC_INI(_c_) do { unsigned _cl, _ch; \
|
||||
__asm volatile ("cpuid\n\t" \
|
||||
"rdtsc\n\t" \
|
||||
"mov %%edx, %0\n" \
|
||||
"mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \
|
||||
"%rax", "%rbx", "%rcx", "%rdx"); \
|
||||
_c_ = (uint64_t)_ch << 32 | _cl; \
|
||||
} while(0)
|
||||
|
||||
#define RDTSC(_c_) do { unsigned _cl, _ch; \
|
||||
__asm volatile("rdtscp\n" \
|
||||
"mov %%edx, %0\n" \
|
||||
"mov %%eax, %1\n" \
|
||||
"cpuid\n\t": "=r" (_ch), "=r" (_cl):: "%rax",\
|
||||
"%rbx", "%rcx", "%rdx");\
|
||||
_c_ = (uint64_t)_ch << 32 | _cl;\
|
||||
} while(0)
|
||||
#else
|
||||
#define RDTSC(_c_) do { unsigned _cl, _ch;\
|
||||
__asm volatile ("cpuid \n"\
|
||||
"rdtsc"\
|
||||
: "=a"(_cl), "=d"(_ch)\
|
||||
: "a"(0)\
|
||||
: "%ebx", "%ecx");\
|
||||
_c_ = (uint64_t)_ch << 32 | _cl;\
|
||||
} while(0)
|
||||
#define RDTSC_INI(_c_) RDTSC(_c_)
|
||||
#endif
|
||||
#else
|
||||
#define RDTSC_INI(_c_)
|
||||
#define RDTSC(_c_)
|
||||
#endif
|
||||
|
||||
#define tmrdtscini() ({ uint64_t _c; __asm volatile("" ::: "memory"); RDTSC_INI(_c); _c; })
|
||||
#define tmrdtsc() ({ uint64_t _c; RDTSC(_c); _c; })
|
||||
|
||||
#ifndef TM_F
|
||||
#define TM_F 1.0 // TM_F=4 -> MI/s
|
||||
#endif
|
||||
|
||||
#ifdef RDTSC_ON
|
||||
#define tminit() tmrdtscini()
|
||||
#define tmtime() tmrdtsc()
|
||||
#define TM_T CLOCKS_PER_SEC
|
||||
static double TMBS(unsigned l, double t) { double dt = t, dl = l; return t/l; }
|
||||
#define TM_C 1000
|
||||
|
||||
#else
|
||||
#define TM_C 1
|
||||
static double TMBS(unsigned l, double t) { return (l/t)/1000000.0; }
|
||||
|
||||
#ifdef _WIN32
|
||||
static LARGE_INTEGER tps;
|
||||
static tm_t tmtime(void) {
|
||||
LARGE_INTEGER tm;
|
||||
tm_t t;
|
||||
QueryPerformanceCounter(&tm);
|
||||
return tm.QuadPart;
|
||||
}
|
||||
|
||||
static tm_t tminit() { tm_t t0,ts; QueryPerformanceFrequency(&tps); t0 = tmtime(); while((ts = tmtime())==t0) {}; return ts; }
|
||||
static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start)/tps.QuadPart; }
|
||||
static int tmiszero(tm_t t) { return !t; }
|
||||
#else
|
||||
#ifdef __APPLE__
|
||||
#include <AvailabilityMacros.h>
|
||||
#ifndef MAC_OS_X_VERSION_10_12
|
||||
#define MAC_OS_X_VERSION_10_12 101200
|
||||
#endif
|
||||
#define CIVETWEB_APPLE_HAVE_CLOCK_GETTIME (defined(__APPLE__) && defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12)
|
||||
#if !(CIVETWEB_APPLE_HAVE_CLOCK_GETTIME)
|
||||
#include <sys/time.h>
|
||||
#define CLOCK_REALTIME 0
|
||||
#define CLOCK_MONOTONIC 0
|
||||
// Fallback for macOS < 10.12, which lacks clock_gettime: emulate it with gettimeofday
// (microsecond resolution only). clk_id is ignored — CLOCK_REALTIME/CLOCK_MONOTONIC are
// both defined as 0 above.
int clock_gettime(int clk_id, struct timespec* t) { // fix: parameter must be named — an unnamed parameter in a definition is invalid C before C23
  struct timeval now;
  int rv = gettimeofday(&now, NULL);
  if (rv) return rv;                    // propagate the gettimeofday error code
  t->tv_sec  = now.tv_sec;
  t->tv_nsec = now.tv_usec * 1000;      // microseconds -> nanoseconds
  return 0;
}
|
||||
#endif
|
||||
#endif
|
||||
static tm_t tmtime() { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return tm; }
|
||||
static double tmdiff(tm_t start, tm_t stop) { return (stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1e9f; }
|
||||
static tm_t tminit() { tm_t t0 = tmtime(),t; while(!tmdiff(t = tmtime(),t0)) {}; return t; }
|
||||
static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//---------------------------------------- bench ----------------------------------------------------------------------
|
||||
// for each a function call is repeated until exceeding tm_tx seconds.
|
||||
// A run duration is always tm_tx seconds
|
||||
// The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision)
|
||||
|
||||
// sleep after each 8 runs to avoid cpu throttling.
|
||||
#define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0)
|
||||
|
||||
// benchmark loop
|
||||
#define TMBEG(_tm_Reps_) { unsigned _tm_r,_tm_c = 0,_tm_R,_tm_Rx = _tm_Reps_,_tm_Rn = _tm_Reps_; double _tm_t;\
|
||||
for(tm_rm = tm_rep, tm_tm = DBL_MAX, _tm_R = 0; _tm_R < _tm_Rn; _tm_R++) { tm_t _tm_t0 = tminit(); /*for each run*/\
|
||||
for(_tm_r = 0;_tm_r < tm_rm;) { /*repeat tm_rm times */
|
||||
|
||||
#define TMEND(_len_) \
|
||||
_tm_r++; if(tm_tm == DBL_MAX && (_tm_t = tmdiff(_tm_t0, tmtime())) > tm_tx) break;\
|
||||
}\
|
||||
/*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\
|
||||
/*other runs: break the loop only after 'tm_rm' repeats */ \
|
||||
_tm_t = tmdiff(_tm_t0, tmtime());\
|
||||
/*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\
|
||||
if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("[%d,%d] ", tm_rm, _tm_Rn);*/ } tm_tm = _tm_t; _tm_c++; }\
|
||||
else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\
|
||||
if(tm_verbose) { printf("%8.2f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\
|
||||
if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\
|
||||
}\
|
||||
}
|
||||
|
||||
static unsigned tm_rep = 1<<30, tm_Rep = 3, tm_Rep2 = 3, tm_rm, tm_RepMin = 1, tm_slp = 20, tm_verbose = 2;
|
||||
static tm_t tm_0, tm_T;
|
||||
static double tm_tm, tm_tx = 1, tm_TX = 60;
|
||||
|
||||
static void tm_init(int _tm_Rep, int _tm_verbose) { tm_verbose = _tm_verbose; if(_tm_Rep) tm_Rep = _tm_Rep; }
|
||||
|
||||
// TMBENCH: time _func_ and print throughput for _len_ bytes.
// Prints the name first when tm_verbose>1; a NULL name falls back to the
// stringized function call (#_func_). TMBS and TM_C are defined elsewhere —
// presumably throughput formatting and a scale constant; TODO confirm.
#define TMBENCH(_name_, _func_, _len_) do { if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\
TMBEG(tm_Rep) _func_; TMEND(_len_); \
double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_, dm*TM_C/dr) );\
} while(0)

// second TMBENCH. Example: use TMBENCH for encoding and TMBENCH2 for decoding
// Uses tm_Rep2 runs and prints the name AFTER the result.
#define TMBENCH2(_name_, _func_, _len_) do { \
TMBEG(tm_Rep2) _func_; TMEND(_len_);\
double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_, dm*TM_C/dr) );\
if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\
} while(0)

// Check: like TMBENCH but also verifies each call returns _res_;
// on mismatch prints both values and exits the process.
#define TMBENCHT(_name_,_func_, _len_, _res_) do { \
TMBEG(tm_Rep) \
if(_func_ != _res_) { printf("ERROR: %lld != %lld", (long long)_func_, (long long)_res_ ); exit(0); };\
TMEND(_len_);\
if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_,(double)tm_tm*TM_C/(double)tm_rm) );\
if(tm_verbose) printf("%s ", _name_?_name_:#_func_ );\
} while(0)
|
||||
//----------------------------------------------------------------------------------------------------------------------------------
|
||||
#define Kb (1u<<10)
#define Mb (1u<<20)
#define Gb (1u<<30)
#define KB 1000
#define MB 1000000
#define GB 1000000000

// Parse an unsigned integer command-line argument with an optional suffix.
//   K/M/G : decimal multipliers (1000 / 1000000 / 1000000000)
//   k/m/g : binary  multipliers (2^10 / 2^20 / 2^30)
//   B     : bytes — return n unscaled
//   b     : bits  — return 1<<n (clamped to 0xffffffff for n >= 32,
//                   avoiding an undefined over-wide shift)
//   none  : return n*def when def != 0, otherwise treat n as a bit count
// NOTE: result and intermediates are 32-bit; large suffixed values wrap.
static unsigned argtoi(char *s, unsigned def) {
  char *p;
  unsigned n = strtol(s, &p, 10), f = 1;
  switch(*p) {
    case 'K': f = KB; break;
    case 'M': f = MB; break;
    case 'G': f = GB; break;
    case 'k': f = Kb; break;
    case 'm': f = Mb; break;
    case 'g': f = Gb; break;
    case 'B': return n;                 // unscaled (dead 'break' after 'return' removed)
    case 'b': def = 0;                  /* fallthrough: force power-of-two */
    default : if(!def) return n >= 32 ? 0xffffffffu : (1u << n);
              f = def;
  }
  return n*f;
}
|
||||
static uint64_t argtol(char *s) {
|
||||
char *p;
|
||||
uint64_t n = strtol(s, &p, 10),f=1;
|
||||
switch(*p) {
|
||||
case 'K': f = KB; break;
|
||||
case 'M': f = MB; break;
|
||||
case 'G': f = GB; break;
|
||||
case 'k': f = Kb; break;
|
||||
case 'm': f = Mb; break;
|
||||
case 'g': f = Gb; break;
|
||||
case 'B': return n; break;
|
||||
case 'b': return 1u << n;
|
||||
default: f = MB;
|
||||
}
|
||||
return n*f;
|
||||
}
|
||||
|
||||
// Parse a time argument and return it in milliseconds.
// Suffixes: 'h' hours, 'm' minutes, 's' seconds, 'M' milliseconds.
// A missing or unrecognized suffix is interpreted as seconds.
static uint64_t argtot(char *s) {
  char *end;
  uint64_t num   = strtol(s, &end, 10);
  uint64_t scale = 1000;                     // default: seconds -> ms
  if     (*end == 'h') scale = 3600000;      // hours
  else if(*end == 'm') scale = 60000;        // minutes
  else if(*end == 'M') scale = 1;            // already milliseconds
  else if(*end == 's') scale = 1000;         // seconds (explicit)
  return num * scale;
}
|
||||
|
||||
// Copy n bytes from in to out, bitwise-complementing each byte (out[i] = ~in[i]).
// NOTE: despite the name this is a complement copy, not a reverse copy.
// Loop index changed from 'int' to 'unsigned' — the old signed/unsigned
// comparison against n misbehaved for n > INT_MAX.
static void memrcpy(unsigned char *out, unsigned char *in, unsigned n) { unsigned i; for(i = 0; i < n; i++) out[i] = ~in[i]; }
|
||||
|
||||
113
transpose.h
113
transpose.h
@ -1,113 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// transpose.h - Byte/Nibble transpose for further compressing with lz77 or other compressors
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
// Syntax
|
||||
// in : Input buffer
|
||||
// n : Total number of bytes in input buffer
|
||||
// out : output buffer
|
||||
// esize : element size in bytes (ex. 2, 4, 8,... )
|
||||
|
||||
//---------- High level functions with dynamic cpu detection and JIT scalar/sse/avx2 switching
|
||||
void tpenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // transpose
|
||||
void tpdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // reverse transpose
|
||||
|
||||
void tp2denc(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize); //2D transpose
|
||||
void tp2ddec(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize);
|
||||
void tp3denc(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); //3D transpose
|
||||
void tp3ddec(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize);
|
||||
void tp4denc(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); //4D transpose
|
||||
void tp4ddec(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize);
|
||||
|
||||
// Nibble transpose SIMD (SSE2,AVX2, ARM Neon)
|
||||
void tp4enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
|
||||
void tp4dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
|
||||
|
||||
// bit transpose
|
||||
//void tp1enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
|
||||
//void tp1dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
|
||||
|
||||
//---------- Low level functions ------------------------------------
|
||||
void tpenc2( unsigned char *in, unsigned n, unsigned char *out); // scalar
|
||||
void tpenc3( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc16( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tpdec2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec3( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec16( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tpenc128v2( unsigned char *in, unsigned n, unsigned char *out); // sse2
|
||||
void tpdec128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tp4enc128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4enc128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4enc128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tp1enc128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1dec128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1enc128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1dec128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1enc128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1dec128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tpenc256v2( unsigned char *in, unsigned n, unsigned char *out); // avx2
|
||||
void tpdec256v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc256v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec256v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc256v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec256v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tp4enc256v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec256v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4enc256v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec256v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4enc256v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec256v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
//------- CPU instruction set
|
||||
// cpuiset = 0: return current simd set,
|
||||
// cpuiset != 0: set simd set 0:scalar, 20:sse2, 52:avx2
|
||||
unsigned cpuini(unsigned cpuiset);
|
||||
|
||||
// convert simd set to string: "sse2", "sse3", "sse4.1" or "avx2"
|
||||
// Ex.: printf("current cpu set=%s\n", cpustr(cpuini(0)) );
|
||||
char *cpustr(unsigned cpuisa);
|
||||
|
||||
unsigned cpuisa(void);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
72
trle.h
72
trle.h
@ -1,72 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2015-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- email : powturbo [AT] gmail.com
|
||||
- github : https://github.com/powturbo
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- twitter : https://twitter.com/powturbo
|
||||
|
||||
TurboRLE - "Most efficient and fastest Run Length Encoding"
|
||||
**/
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
// RLE with specified escape char
|
||||
unsigned _srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
|
||||
unsigned _srled8( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint8_t e);
|
||||
|
||||
unsigned _srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
|
||||
unsigned _srled16(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint16_t e);
|
||||
|
||||
unsigned _srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
|
||||
unsigned _srled32(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint32_t e);
|
||||
|
||||
unsigned _srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
|
||||
unsigned _srled64(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint64_t e);
|
||||
|
||||
// functions w/ overflow handling
|
||||
unsigned srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
|
||||
unsigned srled8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint8_t e);
|
||||
|
||||
unsigned srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
|
||||
unsigned srled16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint16_t e);
|
||||
|
||||
unsigned srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
|
||||
unsigned srled32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint32_t e);
|
||||
|
||||
unsigned srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
|
||||
unsigned srled64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint64_t e);
|
||||
|
||||
// RLE w. automatic escape char determination
|
||||
unsigned srlec( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
|
||||
unsigned _srled( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
|
||||
unsigned srled( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
|
||||
|
||||
// Turbo RLE
|
||||
unsigned trlec( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
|
||||
unsigned _trled( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
|
||||
unsigned trled( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
401
vint.h
401
vint.h
@ -1,401 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" variable byte include header (scalar TurboVByte+ SIMD TurboByte)
|
||||
#ifndef _VINT_H_
|
||||
#define _VINT_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef VINT_IN
|
||||
#include "conf.h"
|
||||
//----------------------------------- Variable byte: single value macros (low level) -----------------------------------------------
|
||||
//------------- 32 bits -------------
|
||||
// "vbx" extended variable byte, 32-bit flavor.
// Tag scheme: the number of leading 1 bits in the FIRST byte gives the number
// of extra payload bytes (0xxxxxxx:0, 10xxxxxx:1, 110xxxxx:2, 1110xxxx:3,
// 1111xxxx:4) — see the branch thresholds in _vbxput32/_vbxget32 below.
// ctou16/ctou32 and bswap16/bswap32 come from conf.h — presumably unaligned
// load/store helpers and byte swaps; TODO confirm.
// _vtab32_ maps the first byte's high nibble to the total encoded length.
extern unsigned char _vtab32_[];
// length of an encoded value, derived from its first byte
#define _vbxvlen32(_x_) _vtab32_[(unsigned char)(_x_)>>4] // (clz32((_x_) ^ 0xff) - 23) //
// encoded length of value _x_: one byte per started 7-bit group (|1 guards x==0)
#define _vbxlen32(_x_) ((bsr32(_x_|1)+6)/7)

// Encode _x_ at _op_ (pointer advanced past the encoding), then run _act_.
#define _vbxput32(_op_, _x_, _act_) {\
if(likely((_x_) < (1<< 7))) { *_op_++ = _x_; _act_;}\
else if(likely((_x_) < (1<<14))) { ctou16(_op_) = bswap16((_x_) | 0x8000u); _op_ += 2; _act_;}\
else if(likely((_x_) < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0u; ctou16(_op_) = _x_; _op_ += 2; _act_;}\
else if(likely((_x_) < (1<<28))) { ctou32(_op_) = bswap32((_x_) | 0xe0000000u); _op_ += 4; _act_;}\
else { *_op_++ = (unsigned long long)(_x_) >> 32 | 0xf0u; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
}

// Decode one value at _ip_ into _x_ (pointer advanced), then run _act_.
// Each branch tests one more tag bit; the masks strip the tag from the payload.
#define _vbxget32(_ip_, _x_, _act_) do { _x_ = (unsigned)(*_ip_++);\
if(!(_x_ & 0x80u)) { _act_;}\
else if(!(_x_ & 0x40u)) { _x_ = bswap16(ctou16(_ip_ - 1) & 0xff3fu); _ip_++; _act_;}\
else if(!(_x_ & 0x20u)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\
else if(!(_x_ & 0x10u)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0fu); _ip_ += 3; _act_;}\
else { _x_ = (unsigned long long)((_x_) & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\
} while(0)
|
||||
|
||||
//------------- 64 bits -----------
|
||||
// "vbx" 64-bit flavor: same unary-prefix tag scheme extended up to 9 bytes
// (a first byte of 0xff means 8 raw payload bytes follow).
#define _vbxlen64(_x_) ((bsr64(_x_)+6)/7)
// length of an encoded value from its first byte
#define _vbxvlen64(_x_) ((_x_)==0xff?9:clz32((_x_) ^ 0xff) - 23)

// Encode 64-bit _x_ at _op_ (pointer advanced), then run _act_.
// Branch thresholds: 7/14/21/28/35/42/49/56 value bits -> 1..9 encoded bytes.
#define _vbxput64(_op_, _x_, _act_) {\
if(likely(_x_ < (1<< 7))) { *_op_++ = _x_; _act_;}\
else if(likely(_x_ < (1<<14))) { ctou16(_op_) = bswap16(_x_| 0x8000); _op_ += 2; _act_;}\
else if(likely(_x_ < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0; ctou16(_op_) = _x_; _op_ += 2; _act_;}\
else if(likely(_x_ < (1<<28))) { ctou32(_op_) = bswap32(_x_| 0xe0000000); _op_ += 4; _act_;}\
else if( _x_ < 1ull<<35) { *_op_++ = _x_ >> 32 | 0xf0; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<42) { ctou16(_op_) = bswap16(_x_ >> 32 | 0xf800); _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<49) { *_op_++ = _x_ >> 48 | 0xfc; ctou16(_op_) = _x_ >> 32; _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<56) { ctou64(_op_) = bswap64(_x_ | 0xfe00000000000000ull); _op_ += 8; _act_;}\
else { *_op_++ = 0xff; ctou64(_op_) = _x_; _op_ += 8; _act_;}\
}

// Decode one 64-bit value at _ip_ into _x_ (pointer advanced), then run _act_.
// Each branch tests the next tag bit of the first byte; masks strip the tag.
#define _vbxget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(!(_x_ & 0x80)) { _act_;}\
else if(!(_x_ & 0x40)) { _x_ = bswap16(ctou16(_ip_++-1) & 0xff3f); _act_;}\
else if(!(_x_ & 0x20)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\
else if(!(_x_ & 0x10)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0f); _ip_ += 3; _act_;}\
else if(!(_x_ & 0x08)) { _x_ = (_x_ & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\
else if(!(_x_ & 0x04)) { _x_ = (unsigned long long)(bswap16(ctou16(_ip_-1)) & 0x7ff) << 32 | ctou32(_ip_+1); _ip_ += 5; _act_;}\
else if(!(_x_ & 0x02)) { _x_ = (_x_ & 0x03)<<48 | (unsigned long long)ctou16(_ip_) << 32 | ctou32(_ip_+2); _ip_ += 6; _act_;}\
else if(!(_x_ & 0x01)) { _x_ = bswap64(ctou64(_ip_-1)) & 0x01ffffffffffffffull; _ip_ += 7; _act_;}\
else { _x_ = ctou64(_ip_); _ip_ += 8; _act_;}\
} while(0)
|
||||
|
||||
// Convenience wrappers: encode/decode a single "vbx" value and advance the
// pointer. The put macros copy _x_ into a local first so the argument is
// evaluated only once; 16-bit values reuse the 32-bit codec, 8-bit values
// are stored verbatim.
#define vbxput64(_op_, _x_) { unsigned long long _x = _x_; _vbxput64(_op_, _x, ;); }
#define vbxput32(_op_, _x_) { register unsigned _x = _x_; _vbxput32(_op_, _x, ;); }
#define vbxput16(_op_, _x_) vbxput32(_op_, _x_)
#define vbxput8( _op_, _x_) (*_op_++ = _x_)

#define vbxget64(_ip_, _x_) _vbxget64(_ip_, _x_, ;)
#define vbxget32(_ip_, _x_) _vbxget32(_ip_, _x_, ;)
#define vbxget16(_ip_, _x_) vbxget32(_ip_,_x_)
#define vbxget8(_ip_, _x_) (_x_ = *_ip_++)
|
||||
//---------------------------------------------------------------------------
|
||||
// TurboVByte layout constants. First-byte tag ranges (computed values):
//   VB_BA3  = 249    : first bytes 249..254 introduce 3..8 raw payload bytes
//   VB_BA2  = 241    : first bytes 241..248 introduce a 3-byte encoding
//   VB_OFS1 = 177    : values below this fit in a single byte
//   VB_OFS2 = 16561  : upper bound of the 2-byte range
//   VB_OFS3 = 540849 : upper bound of the 3-byte range
#define VB_SIZE 64
#define VB_MAX 254
#define VB_B2 6
#define VB_B3 3
#define VB_BA3 (VB_MAX - (VB_SIZE/8 - 3))
#define VB_BA2 (VB_BA3 - (1<<VB_B3))

#define VB_OFS1 (VB_BA2 - (1<<VB_B2))
#define VB_OFS2 (VB_OFS1 + (1 << (8+VB_B2)))
#define VB_OFS3 (VB_OFS2 + (1 << (16+VB_B3)))
|
||||
|
||||
// TurboVByte 32-bit codec: a first byte below VB_OFS1 is the value itself;
// tags VB_OFS1..VB_BA2-1 and VB_BA2..VB_BA3-1 select biased 2-/3-byte
// encodings; tags >= VB_BA3 introduce a raw little-endian payload of
// (tag - VB_BA3) + 3 bytes.
// encoded length of value _x_
#define _vblen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_OFS2?2:((_x_) < VB_OFS3)?3:(bsr32(_x_)+7)/8+1))
// length from the first compressed byte
// NOTE(review): the last branch yields (tag - VB_BA3), while _vbget32 consumes
// 4 + (tag - VB_BA3) bytes for such tags — confirm the intended semantics.
#define _vbvlen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_BA2?2:((_x_) < VB_BA3)?3:(_x_-VB_BA3)))

// Encode _x_ at _op_ (pointer advanced), then run _act_.
// WARNING: the 3-byte branch modifies _x_ in place ((_x_) -= VB_OFS2).
#define _vbput32(_op_, _x_, _act_) {\
if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\
else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\
else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\
else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\
}

// Decode one value at _ip_ into _x_ (pointer advanced), then run _act_.
// Branches undo the encoder's biases; the raw branch masks the payload to
// (tag - VB_BA3) + 3 bytes.
#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(likely(_x_ < VB_OFS1)) { _act_ ;}\
else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \
else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\
else { unsigned _b = _x_-VB_BA3; _x_ = ctou32(_ip_) & ((1u << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\
} while(0)
|
||||
|
||||
// TurboVByte 64-bit codec: same tag layout as the 32-bit version; only the
// raw-payload branch widens to 64-bit loads/stores.
#define _vblen64(_x_) _vblen32(_x_)
#define _vbvlen64(_x_) _vbvlen32(_x_)
// Encode 64-bit _x_ at _op_ (pointer advanced), then run _act_.
// WARNING: the 3-byte branch modifies _x_ in place ((_x_) -= VB_OFS2).
#define _vbput64(_op_, _x_, _act_) {\
if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\
else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\
else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\
else { unsigned _b = (bsr64((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou64(_op_) = (_x_); _op_ += _b; _act_;}\
}

// Decode one 64-bit value at _ip_ into _x_ (pointer advanced), then run _act_.
#define _vbget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(likely(_x_ < VB_OFS1)) { _act_ ;}\
else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \
else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\
else { unsigned _b = _x_-VB_BA3; _x_ = ctou64(_ip_) & ((1ull << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\
} while(0)
|
||||
|
||||
// Unlocked stdio shims. Both branches currently map to the plain (locking)
// fputc/fgetc — the true unlocked variants are commented out. NOTE(review):
// on glibc these #defines shadow the library's own fputc_unlocked /
// fgetc_unlocked functions; confirm this is intentional.
#ifdef _WIN32
//#define fgetc_unlocked(_f_) _fgetc_nolock(_f_)
#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_)
#define fgetc_unlocked(_f_) fgetc(_f_)
#else
#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_) //_IO_putc_unlocked(_c_,_f_)
#define fgetc_unlocked(_f_) fgetc(_f_) //_IO_getc_unlocked(_f_)
#endif
|
||||
|
||||
// leb128put: base-128 encoding, 7 bits per byte, least-significant group first.
// NOTE: the continuation convention is INVERTED vs. standard LEB128 — the
// TERMINATING byte has bit 7 set; continuation bytes have it clear.
#define leb128put(_op_, _x_) { uint64_t _x = _x_; while(_x > 0x7f) { *_op_++ = _x & 0x7f; _x >>= 7; } *_op_++ = _x | 0x80; }
// same encoding written to a FILE* (GNU statement expression)
#define vbfput32(_f_, _x_) ({ uint64_t _x = _x_; while(_x > 0x7f) { fputc_unlocked(_x & 0x7f, _f_); _x >>= 7; } fputc_unlocked(_x | 0x80, _f_); })

// _leb128get: decode the encoding above into _x_; _act_ runs once the
// terminator (bit 7 set) is consumed. The 'unsigned' shift limits reliable
// decoding to 32-bit payloads — TODO confirm for 64-bit targets.
#define _leb128get(_ip_, _x_, _act_) { unsigned _sft=0; for(_x_=0;;_sft += 7) { unsigned _c = *_ip_++; _x_ += (_c & 0x7f) << _sft; if(_c >= 0x80) { _act_; break; } } }
// FIX: previously expanded to vbgetax(), which is defined nowhere — any use of
// leb128get failed to compile. Map it to the actual decoder _leb128get.
#define leb128get(_ip_, _x_) _leb128get(_ip_, _x_, ;)
// decode from a FILE*; returns EOF on end of stream (GNU statement expression)
#define vbfget32(_f_ ) ({ unsigned _sft=0,_x=0; for(;;_sft += 7) { unsigned _c = fgetc_unlocked(_f_); if(_c != EOF) { _x += (_c & 0x7f) << _sft; if(_c & 0x80) break; } else { _x = EOF; break; } } _x; })
|
||||
|
||||
//------------- 16 bits -----------
|
||||
// 16-bit values reuse the 32-bit TurboVByte codec unchanged.
#define _vblen16(_x_) _vblen32(_x_)
#define _vbvlen16(_x_) _vbvlen32(_x_)

#define _vbput16(_op_, _x_, _act_) _vbput32(_op_, _x_, _act_)
#define _vbget16(_ip_, _x_, _act_) _vbget32(_ip_, _x_, _act_)

// 8-bit values are stored verbatim: always exactly one byte.
#define _vblen8(_x_) 1
#define _vbvlen8(_x_) 1
#define _vbput8(_op_, _x_, _act_) { *_op_++ = _x_; _act_; }
#define _vbget8(_ip_, _x_, _act_) { _x_ = *_ip_++; _act_; }
|
||||
//----------------------------------- Variable byte: single value functions -----------------------------------------------
|
||||
// ---- Variable byte length after compression
|
||||
// vblenNN(x): number of bytes value x will occupy once TurboVByte-encoded.
static inline unsigned vblen16(unsigned short x) { return _vblen16(x); }
static inline unsigned vblen32(unsigned x) { return _vblen32(x); }
static inline unsigned vblen64(uint64_t x) { return _vblen64(x); }

// vbvlenNN(b): length of a compressed value given its FIRST byte b,
// e.g. vbvlen32(in[0]).
static inline unsigned vbvlen16(unsigned x) { return _vbvlen32(x); }
static inline unsigned vbvlen32(unsigned x) { return _vbvlen32(x); }
static inline unsigned vbvlen64(unsigned x) { return _vbvlen64(x); }
||||
|
||||
//----- encode/decode 16/32/64 single value and advance output/input pointer
|
||||
// Single-value TurboVByte wrappers: encode/decode one value and advance the
// pointer. The put macros copy _x_ into a local so the argument is evaluated
// only once (and because _vbput32/64 modify their value argument in place);
// 16-bit values reuse the 32-bit codec, 8-bit values are stored verbatim.
#define vbput64(_op_, _x_) { unsigned long long _x = _x_; _vbput64(_op_, _x, ;); }
#define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); }
#define vbput16(_op_, _x_) vbput32(_op_, _x_)
#define vbput8(_op_, _x_) (*_op_++ = _x_)

#define vbget64(_ip_, _x_) _vbget64(_ip_, _x_, ;)
#define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;)
#define vbget16(_ip_, _x_) vbget32(_ip_,_x_)
#define vbget8(_ip_, _x_) (_x_ = *_ip_++)
|
||||
#endif
|
||||
//----------------------------- TurboVByte 'vb':Variable byte + SIMD TurboByte 'v8': array functions ----------------------------------------
|
||||
// Encoding/DEcoding: Return value = end of compressed output/input buffer out/in
|
||||
|
||||
//----------------------- Encoding/Decoding unsorted array with n integer values --------------------------
|
||||
unsigned char *vbenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); //TurboVByte
|
||||
unsigned char *vbenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *vbenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
//-- Decode
|
||||
unsigned char *vbdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out);
|
||||
unsigned char *vbdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out);
|
||||
unsigned char *vbdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
|
||||
|
||||
//-- Get value stored at index idx (idx:0...n-1)
|
||||
unsigned short vbgetx16( unsigned char *__restrict in, unsigned idx);
|
||||
unsigned vbgetx32( unsigned char *__restrict in, unsigned idx);
|
||||
uint64_t vbgetx64( unsigned char *__restrict in, unsigned idx);
|
||||
|
||||
//-- Search and return index of next value equal to key or n when no key value found
|
||||
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vbgeteq32(&ip, n, idx, 4321))>=n) break; printf("found at %u ", idx); }
|
||||
unsigned vbgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key);
|
||||
unsigned vbgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key);
|
||||
unsigned vbgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key);
|
||||
|
||||
//---------------------- Delta encoding/decoding sorted array ---------------------------------------------
|
||||
//-- Increasing integer array. out[i] = out[i-1] + in[i]
|
||||
unsigned char *vbdenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbdenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbdenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//-- Get value stored at index idx (idx:0...n-1)
|
||||
unsigned short vbdgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
|
||||
unsigned vbdgetx32( unsigned char *__restrict in, unsigned idx, unsigned start);
|
||||
uint64_t vbdgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
|
||||
|
||||
//-- Search and return index of next value equal to key or n when no key value found
|
||||
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); }
|
||||
unsigned vbdgetgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start);
|
||||
unsigned vbdgetgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start);
|
||||
unsigned vbdgetgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start);
|
||||
|
||||
//-- Strictly increasing (never remaining constant or decreasing) integer array. out[i] = out[i-1] + in[i] + 1
|
||||
unsigned char *vbd1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbd1enc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbd1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbd1dec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
|
||||
//-- Get value stored at index idx (idx:0...n-1)
|
||||
unsigned short vbd1getx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
|
||||
unsigned vbd1getx32( unsigned char *__restrict in, unsigned idx, unsigned start);
|
||||
uint64_t vbd1getx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
|
||||
|
||||
//-- Search and return index of next value equal to key or n when no key value found
|
||||
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); }
|
||||
unsigned vbd1getgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start);
|
||||
unsigned vbd1getgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start);
|
||||
unsigned vbd1getgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start);
|
||||
|
||||
//---------------------- Zigzag encoding/decoding for unsorted integer lists.
|
||||
unsigned char *vbzenc8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
|
||||
unsigned char *vbzenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbzenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbzenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbzdec8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
|
||||
unsigned char *vbzdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbzdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbzdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//---------------------- XOR encoding/decoding for unsorted integer lists.
|
||||
unsigned char *vbxenc8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
|
||||
unsigned char *vbxenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbxenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbxenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbxdec8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
|
||||
unsigned char *vbxdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbxdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbxdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//---------------------- Delta of delta encoding/decoding for unsorted integer lists.
|
||||
unsigned char *vbddenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbddenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbddenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbdddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbdddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbdddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//-- Get value stored at index idx (idx:0...n-1)
|
||||
unsigned short vbzgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
|
||||
unsigned vbzgetx32( unsigned char *__restrict in, unsigned idx, unsigned start);
|
||||
uint64_t vbzgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
|
||||
|
||||
//-- Search and return index of next value equal to key or n when no key value found
|
||||
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); }
|
||||
/*unsigned vbzgeteq15( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start);
|
||||
unsigned vbzgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start);
|
||||
unsigned vbzgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key, unsigned start);
|
||||
unsigned vbzgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key, unsigned start);*/
|
||||
|
||||
//-------------------------- TurboByte (SIMD Group varint) --------------------------------------------------------------
|
||||
unsigned char *v8enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); //TurboByte
|
||||
unsigned char *v8enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *v8dec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out);
|
||||
unsigned char *v8dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out);
|
||||
|
||||
//------ delta ---------
|
||||
unsigned char *v8denc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *v8denc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *v8ddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *v8ddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
//------ delta 1 -------
|
||||
unsigned char *v8d1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *v8d1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *v8d1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *v8d1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
//------- zigzag -------
|
||||
unsigned char *v8zenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *v8zenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *v8zdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *v8zdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
//------- xor ----------
|
||||
unsigned char *v8xenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *v8xenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *v8xdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *v8xdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
//-------------------------- TurboByte Hybrid (SIMD Group varint) + Bitpacking -------------------------------------------
|
||||
size_t v8nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8nd1enc16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nd1enc32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8nzenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nzenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nd1dec16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nd1dec32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nzdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nxdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nxdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
//-------------
|
||||
size_t v8nenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8nd1enc128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8nzenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nzenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nddec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nxdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nxdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
//-------------
|
||||
size_t v8nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nzenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nxenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t v8nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t v8nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t v8nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t v8nxdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
355
vp4.h
355
vp4.h
@ -1,355 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "TurboPFor: Integer Compression" PFor/PForDelta + Direct access
|
||||
#ifndef VP4_H_
|
||||
#define VP4_H_
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
//************************************************ High level API - n unlimited ****************************************************
|
||||
// Compress integer array with n values to the buffer out.
|
||||
// Return value = number of bytes written to compressed buffer out
|
||||
size_t p4nenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); // SIMD (Vertical bitpacking)
|
||||
size_t p4nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc256w32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
|
||||
size_t p4ndenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t p4nd1enc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t p4nzenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
// Decompress the compressed n values in input buffer in to the integer array out.
|
||||
// Return value = number of bytes read from the ompressed buffer in
|
||||
size_t p4ndec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t p4ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4ndec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t p4ndec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4ndec128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t p4ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
// Delta minimum = 0
|
||||
size_t p4nddec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t p4nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nddec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nddec256w32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nddec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
// Delta minimum = 1
|
||||
size_t p4nd1dec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t p4nd1dec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nd1dec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nd1dec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
//Zigzag
|
||||
size_t p4nzdec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t p4nzdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nzdec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
|
||||
//************** Low level API - n limited to 128/256 ***************************************
|
||||
#define P4D_MAX 256
|
||||
|
||||
// -------------- TurboPFor: Encode ------------
|
||||
//#include <assert.h>
|
||||
// Low level API: Single block n limited
|
||||
//compress integer array with n values to the buffer out. Return value = end of compressed buffer out
|
||||
unsigned char *p4enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out); // SSE (Vertical bitpacking)
|
||||
unsigned char *p4enc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out); // AVX2
|
||||
unsigned char *p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4enc256w32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4encx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out);// Direct access
|
||||
unsigned char *p4encx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4encx32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4encx64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4denc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
|
||||
unsigned char *p4denc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4denc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4denc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4denc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4denc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4denc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4denc256w32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
|
||||
unsigned char *p4dencx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); // Direct access
|
||||
unsigned char *p4dencx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4dencx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
|
||||
unsigned char *p4d1enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
|
||||
unsigned char *p4d1enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4d1enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1enc128v16(uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4d1enc128v32(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1enc256v32(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4d1encx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); // Direct access
|
||||
unsigned char *p4d1encx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4d1encx32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
|
||||
unsigned char *p4zenc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
|
||||
unsigned char *p4zenc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4zenc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4zenc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4zenc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4zenc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4zenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4senc16(uint16_t *in, unsigned n, unsigned char *out, uint16_t start);
|
||||
unsigned char *p4senc32(uint32_t *in, unsigned n, unsigned char *out, uint32_t start);
|
||||
unsigned char *p4senc64(uint64_t *in, unsigned n, unsigned char *out, uint64_t start);
|
||||
|
||||
unsigned char *p4sdec16(unsigned char *in, unsigned n, uint16_t *out, uint16_t start);
|
||||
unsigned char *p4sdec32(unsigned char *in, unsigned n, uint32_t *out, uint32_t start);
|
||||
unsigned char *p4sdec64(unsigned char *in, unsigned n, uint64_t *out, uint64_t start);
|
||||
|
||||
size_t p4nsenc16(uint16_t *in, size_t n, unsigned char *out);
|
||||
size_t p4nsenc32(uint32_t *in, size_t n, unsigned char *out);
|
||||
size_t p4nsenc64(uint64_t *in, size_t n, unsigned char *out);
|
||||
|
||||
size_t p4nsdec16(unsigned char *in, size_t n, uint16_t *out);
|
||||
size_t p4nsdec32(unsigned char *in, size_t n, uint32_t *out);
|
||||
size_t p4nsdec64(unsigned char *in, size_t n, uint64_t *out);
|
||||
|
||||
// same as p4enc, but with b and bx as parameters. Call after _p4bitsXX
|
||||
inline unsigned char *_p4enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4enc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
|
||||
inline unsigned char *_p4enc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
|
||||
inline unsigned char *_p4enc128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
|
||||
inline unsigned char *_p4enc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
// calculate the best bit sizes b and bx, return b.
|
||||
unsigned _p4bits8( uint8_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bits16( uint16_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bits32( uint32_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bits64( uint64_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
|
||||
unsigned _p4bitsx8( uint8_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bitsx16( uint16_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bitsx32( uint32_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bitsx64( uint64_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
|
||||
#define P4HVE(_out_, _b_, _bx_,_usize_) do { if(!_bx_) *_out_++ = _b_;else if(_bx_ <= _usize_) *_out_++ = 0x80|_b_, *_out_++ = _bx_; else *_out_++= (_bx_ == _usize_+1?0x40:0xc0)|_b_; } while(0)
|
||||
|
||||
#define P4HVE8( _out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_, 8)
|
||||
#define P4HVE16(_out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_,16)
|
||||
#define P4HVE32(_out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_,32)
|
||||
#define P4HVE64(_out_, _b_, _bx_) do { unsigned _c = _b_==64?64-1:_b_; P4HVE(_out_, _c, _bx_,64); } while(0)
|
||||
|
||||
//---------------------------- TurboPFor: Decode --------------------------------------------------------
|
||||
// decompress a previously (with p4enc32) bit packed array. Return value = end of packed buffer in
|
||||
//-- scalar. (see p4getx32 for direct access)
|
||||
// b and bx specified (not stored within the compressed stream header)
|
||||
inline unsigned char *_p4dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical BitPacking)
|
||||
inline unsigned char *_p4dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec128v64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx);
|
||||
|
||||
unsigned char *p4dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out);
|
||||
unsigned char *p4dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out);
|
||||
unsigned char *p4dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
|
||||
unsigned char *p4dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out); // SIMD (Vertical BitPacking)
|
||||
unsigned char *p4dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
|
||||
unsigned char *p4dec128v64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
|
||||
unsigned char *p4dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
|
||||
unsigned char *p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
|
||||
//------ Delta decoding --------------------------- Return value = end of packed input buffer in ---------------------------
|
||||
//-- Increasing integer lists. out[i] = out[i-1] + in[i]
|
||||
// b and bx specified
|
||||
unsigned char *_p4ddec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
|
||||
|
||||
unsigned char *p4ddec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
|
||||
unsigned char *p4ddec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
|
||||
unsigned char *p4ddec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4ddec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
|
||||
unsigned char *p4ddec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4ddec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//-- Strictly increasing (never remaining constant or decreasing) integer lists. out[i] = out[i-1] + in[i] + 1
|
||||
// b and bx specified (see idxcr.c/idxqry.c for an example)
|
||||
unsigned char *_p4d1dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec128v16(unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); // SIMD (Vertical BitPacking)
|
||||
unsigned char *_p4d1dec128v32(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec256v32(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
|
||||
|
||||
unsigned char *p4d1dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
|
||||
unsigned char *p4d1dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
|
||||
unsigned char *p4d1dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
|
||||
unsigned char *p4d1dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
// ZigZag encoding
|
||||
inline unsigned char *_p4zdec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
|
||||
|
||||
unsigned char *p4zdec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
|
||||
unsigned char *p4zdec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
|
||||
unsigned char *p4zdec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4zdec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
|
||||
unsigned char *p4zdec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4zdec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4zdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//---------------- Direct Access functions to compressed TurboPFor array p4encx16/p4encx32 -------------------------------------------------------
|
||||
#ifdef TURBOPFOR_DAC
|
||||
#include "conf.h"
|
||||
#define P4D_PAD8(_x_) ( (((_x_)+8-1)/8) )
|
||||
#define P4D_B(_x_) ((_x_) & 0x7f)
|
||||
#define P4D_XB(_x_) (((_x_) & 0x80)?((_x_) >> 8):0)
|
||||
#define P4D_ININC(_in_, _x_) _in_ += 1+((_x_) >> 7)
|
||||
|
||||
static inline unsigned p4bits(unsigned char *__restrict in, int *bx) { unsigned i = ctou16(in); *bx = P4D_XB(i); return P4D_B(i); }
|
||||
|
||||
struct p4 {
|
||||
unsigned long long *xmap;
|
||||
unsigned char *ex;
|
||||
unsigned isx,bx,cum[P4D_MAX/64+1];
|
||||
int oval,idx;
|
||||
};
|
||||
|
||||
static unsigned long long p4xmap[P4D_MAX/64+1] = { 0 };
|
||||
|
||||
// prepare direct access usage
|
||||
static inline void p4ini(struct p4 *p4, unsigned char **pin, unsigned n, unsigned *b) { unsigned char *in = *pin;
|
||||
unsigned p4i = ctou16(in);
|
||||
p4->isx = p4i&0x80;
|
||||
*b = P4D_B(p4i);
|
||||
p4->bx = P4D_XB(p4i); //printf("p4i=%x,b=%d,bx=%d ", p4->i, *b, p4->bx); //assert(n <= P4D_MAX);
|
||||
*pin = p4->ex = ++in;
|
||||
if(p4->isx) {
|
||||
unsigned num=0,j;
|
||||
unsigned char *p;
|
||||
++in;
|
||||
p4->xmap = (unsigned long long *)in;
|
||||
for(j=0; j < n/64; j++) { p4->cum[j] = num; num += popcnt64(ctou64(in+j*8)); }
|
||||
if(n & 0x3f) num += popcnt64(ctou64(in+j*8) & ((1ull<<(n&0x3f))-1) );
|
||||
p4->ex = p = in + (n+7)/8;
|
||||
*pin = p = p4->ex+(((uint64_t)num*p4->bx+7)/8);
|
||||
} else p4->xmap = p4xmap;
|
||||
p4->oval = p4->idx = -1;
|
||||
}
|
||||
|
||||
//---------- Get a single value with index "idx" from a "p4encx32" packed array
|
||||
static ALWAYS_INLINE uint8_t p4getx8( struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx8( in, idx, b);
|
||||
if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx8(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<<cl)), p4->bx) << b;
|
||||
return u;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint16_t p4getx16(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx16(in, idx, b);
|
||||
if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx16(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<<cl)), p4->bx) << b;
|
||||
return u;
|
||||
}
|
||||
static ALWAYS_INLINE uint32_t p4getx32(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx32(in, idx, b);
|
||||
if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx32(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<<cl)), p4->bx) << b;
|
||||
return u;
|
||||
}
|
||||
|
||||
// Get the next single value greater of equal to val
|
||||
static ALWAYS_INLINE uint16_t p4geqx8( struct p4 *p4, unsigned char *in, unsigned b, uint8_t val) { do p4->oval += p4getx8( p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; }
|
||||
static ALWAYS_INLINE uint16_t p4geqx16(struct p4 *p4, unsigned char *in, unsigned b, uint16_t val) { do p4->oval += p4getx16(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; }
|
||||
static ALWAYS_INLINE uint32_t p4geqx32(struct p4 *p4, unsigned char *in, unsigned b, uint32_t val) { do p4->oval += p4getx32(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; }
|
||||
|
||||
/* DO NOT USE : like p4dec32 but using direct access. This is only a demo showing direct access usage. Use p4dec32 instead for decompressing entire blocks */
|
||||
unsigned char *p4decx32( unsigned char *in, unsigned n, uint32_t *out); // unsorted
|
||||
unsigned char *p4fdecx32( unsigned char *in, unsigned n, uint32_t *out, uint32_t start); // FOR increasing
|
||||
unsigned char *p4f1decx32( unsigned char *in, unsigned n, uint32_t *out, uint32_t start); // FOR strictly increasing
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
47
vsimple.h
47
vsimple.h
@ -1,47 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" variable simple "SimpleV"
|
||||
// this belongs to the integer compression known as "simple family", like simple-9,simple-16
|
||||
// or simple-8b. SimpleV is compressing integers in groups into variable word size 32, 40 and 64 bits + RLE (run length encoding)
|
||||
// SimpleV is faster than simple-16 and compress better than simple-16 or simple-8b.
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// vsencNN: compress array with n unsigned (NN bits in[n]) values to the buffer out. Return value = end of compressed output buffer out
|
||||
unsigned char *vsenc8( unsigned char *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
unsigned char *vsenc16(unsigned short *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
unsigned char *vsenc32(unsigned *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
unsigned char *vsenc64(uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
// vsdecNN: decompress buffer into an array of n unsigned values. Return value = end of compressed input buffer in
|
||||
unsigned char *vsdec8( unsigned char *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
unsigned char *vsdec16(unsigned char *__restrict in, size_t n, unsigned short *__restrict out);
|
||||
unsigned char *vsdec32(unsigned char *__restrict in, size_t n, unsigned *__restrict out);
|
||||
unsigned char *vsdec64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
Reference in New Issue
Block a user