This commit is contained in:
x
2023-03-10 20:50:21 +01:00
parent 2d3dcf5dda
commit 59f99b9b3c
12 changed files with 0 additions and 2920 deletions

310
bitpack.h
View File

@ -1,310 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// bitpack.h - "Integer Compression" Binary Packing header file
#ifndef BITPACK_H_
#define BITPACK_H_
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
//******************** Bit Packing High Level API - n unlimited ***************************************************
size_t bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitndpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitndpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitndpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitndpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitndpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitndpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnd1pack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnd1pack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnd1pack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnd1pack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnd1pack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnd1pack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnd1pack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnzpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnzpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnzpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnzpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
size_t bitnunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnunpack128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
size_t bitnunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
size_t bitndunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitndunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitndunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t bitnd1unpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnd1unpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnd1unpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
size_t bitnd1unpack128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnd1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnd1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnzunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t bitnzunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnzunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnzunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnfunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t bitnfunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnfunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
//******** Bit Packing Low level API ****************************************************************
// bipackNN: Pack array with n unsigned (NN bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
unsigned char *bitpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
unsigned char *bitpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
unsigned char *bitpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
unsigned char *bitpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
// delta bit packing
unsigned char *bitdpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
unsigned char *bitdpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
unsigned char *bitdpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
unsigned char *bitdpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
unsigned char *bitd1pack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
unsigned char *bitd1pack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
unsigned char *bitd1pack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
unsigned char *bitd1pack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
// FOR bit packing : sorted integer array
unsigned char *bitfpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
unsigned char *bitfpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
unsigned char *bitfpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
unsigned char *bitfpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
unsigned char *bitf1pack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
unsigned char *bitf1pack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
unsigned char *bitf1pack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
unsigned char *bitf1pack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
// zigzag : unsorted integer array
unsigned char *bitzpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
unsigned char *bitzpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
unsigned char *bitzpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
unsigned char *bitzpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
//-------------------------------------- SIMD ------------------------------------------------------------------------------------------
// Pack array with 128 unsigned (32 bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
unsigned char *bitpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
unsigned char *bitdpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
unsigned char *bitfpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
unsigned char *bitzpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
unsigned char *bitpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
unsigned char *bitdpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
unsigned char *bitfpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
unsigned char *bitf1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
unsigned char *bitzpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
//unsigned char *bitpack256w32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
unsigned char *bitpack128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
unsigned char *bitpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
unsigned char *bitdpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
unsigned char *bitfpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
unsigned char *bitzpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
//********************************** Bit Packing : Unpack ****************************************************************
// ---------------- Unpack a b-bits packed integer array -------------------------------------------------------------------------------
// unpack a bitpacked integer array. Return value = end of packed buffer in
unsigned char *bitunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, unsigned b);
unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b);
unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b);
unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b);
// ---------------- Direct Access to a single packed integer array entry --------------------------------------------------------------
#ifdef TURBOPFOR_DAC
#ifdef __AVX2__
#include <immintrin.h>
#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
#else
#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
#define bzhi32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
#endif
#include "conf.h"
// ---------------- Direct access helpers: read/write a single b-bit entry at index idx ----------------
// bidx = b*idx is the absolute bit offset of entry idx inside the packed buffer.
// The containing word is fetched with an unaligned load (ctouNN, defined in conf.h),
// shifted down to the field's bit position, and masked to b bits with bzhi64/bzhi32.
// 32-bit entries: a 64-bit load at the containing 32-bit word, so a field that
// straddles a 32-bit word boundary is still read in one access.
static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
//static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx;
//return (ctou64((uint32_t *)in+(bidx>>5)) << 32+(bidx&0x1f)) >> (64-b);
// return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
// _bitgetx32: same as bitgetx32 but takes the precomputed bit offset (64-bit, for very large buffers)
static ALWAYS_INLINE unsigned _bitgetx32(const unsigned char *__restrict in, uint64_t bidx, unsigned b) { return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
// like bitgetx32 but for 16 bits integer array
// 8-bit entries: 16-bit load at the containing 16-bit word (byte offset bidx/8)
static ALWAYS_INLINE unsigned bitgetx8( const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou16((uint16_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
static ALWAYS_INLINE unsigned _bitgetx8( const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou16((uint16_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
// NOTE(review): bitgetx16/bitsetx16 index a 32-bit pointer with the 16-bit word
// index (bidx>>4), i.e. byte offset bidx/4, whereas bitgetx8 scales its offset to
// bidx/8 — get and set are mutually consistent here, but confirm this matches the
// byte layout produced by the 16-bit packers before relying on it.
static ALWAYS_INLINE unsigned bitgetx16(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou32((uint32_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
static ALWAYS_INLINE unsigned _bitgetx16(const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou32((uint32_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
// Set a single value with index "idx"
// Read-modify-write: clear the b-bit field, then OR in v. v is NOT masked here;
// the caller must guarantee v < 2^b or neighbouring fields are corrupted.
static ALWAYS_INLINE void bitsetx16(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) { unsigned bidx = b*idx; unsigned *p = (unsigned *) in+(bidx>>4) ; *p = ( *p & ~(((1u <<b)-1) << (bidx& 0xf)) ) | v<<(bidx& 0xf);}
static ALWAYS_INLINE void bitsetx32(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) { unsigned bidx = b*idx; unsigned long long *p = (unsigned long long *)((unsigned *)in+(bidx>>5)); *p = ( *p & ~(((1ull<<b)-1) << (bidx&0x1f)) ) | (unsigned long long)v<<(bidx&0x1f);}
#endif
// ---------------- DFOR : integrated bitpacking, for delta packed SORTED array (Ex. DocId in inverted index) -------------------------------
// start <= out[0] <= out[1] <= ... <= out[n-2] <= out[n-1] <= (1<<N)-1 N=8,16,32 or 64
// out[0] = start + in[0]; out[1] = out[0] + in[1]; ... ; out[i] = out[i-1] + in[i]
unsigned char *bitdunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
unsigned char *bitdunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
// start < out[0] < out[1] < ... < out[n-2] < out[n-1] < (1<<N)-1, N=8,16,32 or 64
// out[0] = start + in[0] + 1; out[1] = out[0] + in[1] + 1; ... ; out[i] = out[i-1] + in[i] + 1
unsigned char *bitd1unpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
unsigned char *bitd1unpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
// ---------------- ZigZag : integrated bitpacking, for zigzag packed unsorted
unsigned char *bitzunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
unsigned char *bitzunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
// ---------------- For : Direct Access for packed SORTED array --------------------------------------------
// out[i] = start + in[i] + i
unsigned char *bitfunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
unsigned char *bitfunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
// out[i] = start + in[i] + i + 1
unsigned char *bitf1unpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
unsigned char *bitf1unpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
// ---------------- SIMD : unpack a SIMD bit packed integer array -------------------------------------------------------------------------------
// SIMD unpack a 128/256 bitpacked integer array. Return value = end of packed buffer in
unsigned char *bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b);
unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
unsigned char *bitd1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
unsigned char *bitf1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitf1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b);
unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitf1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
unsigned char *bitzunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitdunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
unsigned char *bitd1unpack128h32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
// internal TurboPFor functions: masked unpack
unsigned char *_bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
unsigned char *_bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
unsigned char *_bitd1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
unsigned char *_bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitdunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitd1unpack128h32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
//unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned char *bb);
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
#ifdef __cplusplus
}
#endif
#endif

547
bitutil.h
View File

@ -1,547 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Integer Compression: max.bits, delta, zigzag, xor"
#ifdef BITUTIL_IN
#ifdef __AVX2__
#include <immintrin.h>
#elif defined(__AVX__)
#include <immintrin.h>
#elif defined(__SSE4_1__)
#include <smmintrin.h>
#elif defined(__SSSE3__)
#ifdef __powerpc64__
#define __SSE__ 1
#define __SSE2__ 1
#define __SSE3__ 1
#define NO_WARN_X86_INTRINSICS 1
#endif
#include <tmmintrin.h>
#elif defined(__SSE2__)
#include <emmintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#endif
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
#include "sse_neon.h"
#ifdef __ARM_NEON
#define PREFETCH(_ip_,_rw_)
#else
#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
#endif
//------------------------ zigzag encoding -------------------------------------------------------------
// Zigzag maps signed values to unsigned so that small magnitudes (positive or
// negative) become small unsigned codes: 0,-1,1,-2,2,... -> 0,1,2,3,4,...
//   enc(x) = (x << 1) ^ (x >> N-1)        dec(u) = (u >> 1) ^ -(u & 1)
// The encoder's left shift is performed on the unsigned representation:
// left-shifting a negative signed value is undefined behavior in C (C11 6.5.7),
// while the unsigned form is well defined and compiles to identical code on
// two's-complement targets. The arithmetic right shift of a negative value
// (x >> N-1) is implementation-defined but universally sign-extends on
// supported compilers, as the original code already assumed.
static inline unsigned char  zigzagenc8 (signed char x)    { return (unsigned char )(((unsigned)x << 1) ^ (unsigned)(x >> 7)); }
static inline char           zigzagdec8 (unsigned char x)  { return x >> 1 ^ -(x & 1); }
static inline unsigned short zigzagenc16(short x)          { return (unsigned short)(((unsigned)x << 1) ^ (unsigned)(x >> 15)); }
static inline short          zigzagdec16(unsigned short x) { return x >> 1 ^ -(x & 1); }
static inline unsigned       zigzagenc32(int x)            { return ((unsigned)x << 1) ^ (unsigned)(x >> 31); }
static inline int            zigzagdec32(unsigned x)       { return x >> 1 ^ -(x & 1); }
static inline uint64_t       zigzagenc64(int64_t x)        { return ((uint64_t)x << 1) ^ (uint64_t)(x >> 63); }
static inline int64_t        zigzagdec64(uint64_t x)       { return x >> 1 ^ -(x & 1); }
#if defined(__SSE2__) || defined(__ARM_NEON)
// 128-bit SIMD zigzag transform: 8 x 16-bit or 4 x 32-bit lanes at once.
// The mm_* wrappers come from sse_neon.h and map to SSE or NEON.
// encode: (v << 1) ^ (v >> N-1, arithmetic) — lane-wise zigzagencNN
static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16(v,15)); }
static ALWAYS_INLINE __m128i mm_zzage_epi32(__m128i v) { return _mm_xor_si128( mm_slli_epi32(v,1), mm_srai_epi32(v,31)); }
//static ALWAYS_INLINE __m128i mm_zzage_epi64(__m128i v) { return _mm_xor_si128( mm_slli_epi64(v,1), _mm_srai_epi64(v,63)); }
// decode: (v >> 1) ^ sign_extend(low bit) — the slli/srai pair broadcasts each
// lane's bit 0 into a full-lane 0/-1 mask — lane-wise zigzagdecNN
static ALWAYS_INLINE __m128i mm_zzagd_epi16(__m128i v) { return _mm_xor_si128( mm_srli_epi16(v,1), mm_srai_epi16( mm_slli_epi16(v,15),15) ); }
static ALWAYS_INLINE __m128i mm_zzagd_epi32(__m128i v) { return _mm_xor_si128( mm_srli_epi32(v,1), mm_srai_epi32( mm_slli_epi32(v,31),31) ); }
//static ALWAYS_INLINE __m128i mm_zzagd_epi64(__m128i v) { return _mm_xor_si128(mm_srli_epi64(v,1), _mm_srai_epi64( m_slli_epi64(v,63),63) ); }
#endif
#ifdef __AVX2__
// 256-bit AVX2 zigzag transform: 8 x 32-bit lanes at once, same formulas as the
// scalar zigzagenc32/zigzagdec32 applied per lane.
static ALWAYS_INLINE __m256i mm256_zzage_epi32(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi32(v,1), _mm256_srai_epi32(v,31)); }
static ALWAYS_INLINE __m256i mm256_zzagd_epi32(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi32(v,1), _mm256_srai_epi32(_mm256_slli_epi32(v,31),31) ); }
#endif
//-------------- AVX2 delta + prefix sum (scan) / xor encode/decode ---------------------------------------------------------------------------------------
#ifdef __AVX2__
// Encode: shift v right by one element across the full 256-bit register
// (permute2f128 + alignr; the last element of sv is shifted into slot 0), then
// subtract/xor: out[i] = v[i] op v[i-1]. sv is the previous vector, making the
// encoding chainable across consecutive vectors.
static ALWAYS_INLINE __m256i mm256_delta_epi32(__m256i v, __m256i sv) { return _mm256_sub_epi32(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
static ALWAYS_INLINE __m256i mm256_delta_epi64(__m256i v, __m256i sv) { return _mm256_sub_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); }
static ALWAYS_INLINE __m256i mm256_xore_epi32( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
static ALWAYS_INLINE __m256i mm256_xore_epi64( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); }
// Decode (inclusive prefix sum of 8 x 32-bit deltas): two shift-add steps build
// the scan inside each 128-bit lane, then the low lane's total is propagated
// into the high lane and the last element of sv (the running total of the
// previous vector) is broadcast-added to every lane.
static ALWAYS_INLINE __m256i mm256_scan_epi32(__m256i v, __m256i sv) {
v = _mm256_add_epi32(v, _mm256_slli_si256(v, 4));
v = _mm256_add_epi32(v, _mm256_slli_si256(v, 8));
return _mm256_add_epi32( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
_mm256_add_epi32(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));
}
// Same propagation pattern as mm256_scan_epi32 with xor in place of add:
// decodes xor-encoded (mm256_xore_epi32) vectors.
static ALWAYS_INLINE __m256i mm256_xord_epi32(__m256i v, __m256i sv) {
v = _mm256_xor_si256(v, _mm256_slli_si256(v, 4));
v = _mm256_xor_si256(v, _mm256_slli_si256(v, 8));
return _mm256_xor_si256( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
_mm256_xor_si256(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));
}
// 64-bit variants: one in-lane shift-add step suffices for 2 elements per lane,
// then cross-lane propagation plus broadcast of sv's last element (permute4x64).
static ALWAYS_INLINE __m256i mm256_scan_epi64(__m256i v, __m256i sv) {
v = _mm256_add_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
return _mm256_add_epi64(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_add_epi64(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
}
static ALWAYS_INLINE __m256i mm256_xord_epi64(__m256i v, __m256i sv) {
v = _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
return _mm256_xor_si256(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_xor_si256(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
}
// Prefix sum plus a per-element increment vector vi (used by the delta>=1 codecs).
static ALWAYS_INLINE __m256i mm256_scani_epi32(__m256i v, __m256i sv, __m256i vi) { return _mm256_add_epi32(mm256_scan_epi32(v, sv), vi); }
#endif
#if defined(__SSSE3__) || defined(__ARM_NEON)
// SSSE3/NEON: delta / xor encode; alignr pairs each lane with its predecessor, sv (previous
// vector) supplies the lane before v[0]
static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_alignr_epi8(v, sv, 14)); }
static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_alignr_epi8(v, sv, 12)); }
static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 14)); }
static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 12)); }
// horizontal decode: inclusive scan of _v_ under operator _hop_, seeded with the last lane of
// _sv_ (mm_shuffle_nnnn_epi32 is a project broadcast macro - assumed to replicate lane 3)
#define MM_HDEC_EPI32(_v_,_sv_,_hop_) { _v_ = _hop_(_v_, _mm_slli_si128(_v_, 4)); _v_ = _hop_(mm_shuffle_nnnn_epi32(_sv_, 3), _hop_(_mm_slli_si128(_v_, 8), _v_)); }
static ALWAYS_INLINE __m128i mm_scan_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_add_epi32); return v; }
static ALWAYS_INLINE __m128i mm_xord_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_xor_si128); return v; }
// 16-bit horizontal decode: three shifted combines, then the last 16-bit lane of _sv_
// (bytes 14..15, broadcast via shuffle index 0x0f0e) is folded into every lane
#define MM_HDEC_EPI16(_v_,_sv_,_hop_) {\
_v_ = _hop_( _v_, _mm_slli_si128(_v_, 2));\
_v_ = _hop_( _v_, _mm_slli_si128(_v_, 4));\
_v_ = _hop_(_hop_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi16(0x0f0e)));\
}
static ALWAYS_INLINE __m128i mm_scan_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_add_epi16); return v; }
static ALWAYS_INLINE __m128i mm_xord_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_xor_si128); return v; }
//-------- scan with vi delta > 0 -----------------------------
static ALWAYS_INLINE __m128i mm_scani_epi16(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi16(mm_scan_epi16(v, sv), vi); }
static ALWAYS_INLINE __m128i mm_scani_epi32(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi32(mm_scan_epi32(v, sv), vi); }
#elif defined(__SSE2__)
// plain SSE2 (no palignr): emulate the predecessor shift with srli/slli + or
static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
#endif
// 32-bit builds lack _mm256_extract_epi64: synthesize it from two 32-bit extracts
#if !defined(_M_X64) && !defined(__x86_64__) && defined(__AVX__)
#define _mm256_extract_epi64(v, index) ((__int64)((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2) | (((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2 + 1)) << 32)))
#endif
//------------------ Horizontal OR -----------------------------------------------
#ifdef __AVX2__
// OR-reduce all 8 32-bit lanes to one scalar (srli_si256 works per 128-bit lane, so the two
// lane results are combined with the final scalar OR of elements 0 and 4)
static ALWAYS_INLINE unsigned mm256_hor_epi32(__m256i v) {
v = _mm256_or_si256(v, _mm256_srli_si256(v, 8));
v = _mm256_or_si256(v, _mm256_srli_si256(v, 4));
return _mm256_extract_epi32(v,0) | _mm256_extract_epi32(v, 4);
}
// OR-reduce the 4 64-bit lanes to one scalar
static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
v = _mm256_or_si256(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(2, 0, 0, 1)));
return _mm256_extract_epi64(v, 1) | _mm256_extract_epi64(v,0);
}
#endif
#if defined(__SSE2__) || defined(__ARM_NEON)
// horizontal reduction ladder under operator _hop_; the result ends up in lane 0
#define MM_HOZ_EPI16(v,_hop_) {\
v = _hop_(v, _mm_srli_si128(v, 8));\
v = _hop_(v, _mm_srli_si128(v, 6));\
v = _hop_(v, _mm_srli_si128(v, 4));\
v = _hop_(v, _mm_srli_si128(v, 2));\
}
#define MM_HOZ_EPI32(v,_hop_) {\
v = _hop_(v, _mm_srli_si128(v, 8));\
v = _hop_(v, _mm_srli_si128(v, 4));\
}
// OR-reduce 8x16 / 4x32 / 2x64 lanes to a scalar
static ALWAYS_INLINE uint16_t mm_hor_epi16( __m128i v) { MM_HOZ_EPI16(v,_mm_or_si128); return (unsigned short)_mm_cvtsi128_si32(v); }
static ALWAYS_INLINE uint32_t mm_hor_epi32( __m128i v) { MM_HOZ_EPI32(v,_mm_or_si128); return (unsigned )_mm_cvtsi128_si32(v); }
static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _mm_srli_si128(v, 8)); return (uint64_t )_mm_cvtsi128_si64(v); }
#endif
//----------------- sub / add ----------------------------------------------------------
#if defined(__SSE2__) || defined(__ARM_NEON)
// SUB* = delta encode step; ADDI* = running-sum decode step, accumulating _v_ plus the
// constant increment _vi_ into the state vector _sv_ (note: ADDI* assign to _sv_)
#define SUBI16x8(_v_, _sv_) _mm_sub_epi16(_v_, _sv_)
#define SUBI32x4(_v_, _sv_) _mm_sub_epi32(_v_, _sv_)
#define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
#define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
// extract lane 0, truncated to 8/16 bits (companions to the _mm_cvtsi128_si32/si64 intrinsics)
static ALWAYS_INLINE uint8_t mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
#endif
//--------- memset -----------------------------------------
// scalar arithmetic-sequence fill: out[i] = start + i*mindelta, 4x unrolled with a scalar
// tail. This is the reference contract for the SIMD BITFORZERO32/BITZERO32 variants below.
#define BITFORSET_(_out_, _n_, _start_, _mindelta_) do { unsigned _i;\
for(_i = 0; _i != (_n_&~3); _i+=4) { \
_out_[_i+0] = _start_+(_i )*_mindelta_; \
_out_[_i+1] = _start_+(_i+1)*_mindelta_; \
_out_[_i+2] = _start_+(_i+2)*_mindelta_; \
_out_[_i+3] = _start_+(_i+3)*_mindelta_; \
} \
while(_i != _n_) \
_out_[_i] = _start_+_i*_mindelta_, ++_i; \
} while(0)
//--------- SIMD zero -----------------------------------------
#ifdef __AVX2__
// fill out[0..n-1] with the constant 'start', 8 lanes per store.
// NOTE(review): always stores at least one full vector and rounds up to whole vectors, so it
// can write up to 32 bytes past out+n - the destination must be padded; verify at call sites.
#define BITZERO32(_out_, _n_, _start_) do {\
__m256i _sv_ = _mm256_set1_epi32(_start_), *_ov = (__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\
do _mm256_storeu_si256(_ov++, _sv_); while(_ov < _ove);\
} while(0)
// BITFORZERO32 (AVX2): fill out[0..n-1] with the arithmetic sequence out[i] = start + i*mindelta
// - the SIMD counterpart of the scalar BITFORSET_ fallback. Stores whole 8-lane vectors, so it
// may overrun up to 32 bytes past out+n (destination must be padded).
// review: the upper four initial lanes used addition (7+_mindelta_ ...) while the lower four
// used multiplication, and the per-iteration step was a constant 4 for 8 lanes; both made the
// result disagree with the scalar BITFORSET_ contract. Lanes are now i*mindelta and the step
// is 8*mindelta (arguments parenthesized for macro hygiene).
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) do {\
__m256i _sv = _mm256_set1_epi32(_start_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_), _cv = _mm256_set_epi32(7*(_mindelta_),6*(_mindelta_),5*(_mindelta_),4*(_mindelta_),3*(_mindelta_),2*(_mindelta_),1*(_mindelta_),0); \
_sv = _mm256_add_epi32(_sv, _cv);\
_cv = _mm256_set1_epi32(8*(_mindelta_));\
do { _mm256_storeu_si256(_ov++, _sv); _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\
} while(0)
// BITDIZERO32 (AVX2): sequence fill used when decoding a constant-delta block (delta > 1 coding).
// NOTE(review): the initial lanes step by +1 (mindelta, 1+mindelta, ..., 7+mindelta) while the
// per-vector increment is 4*mindelta for 8 lanes - these are mutually inconsistent for every
// mindelta value (including 1, where the SSE variant below would need a step of 4 per 4 lanes,
// i.e. 8 per 8 lanes here). Confirm the intended contract against the callers before relying
// on this AVX2 path.
#define BITDIZERO32(_out_, _n_, _start_, _mindelta_) do { __m256i _sv = _mm256_set1_epi32(_start_), _cv = _mm256_set_epi32(7+_mindelta_,6+_mindelta_,5+_mindelta_,4+_mindelta_,3+_mindelta_,2+_mindelta_,1+_mindelta_,_mindelta_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\
_sv = _mm256_add_epi32(_sv, _cv); _cv = _mm256_set1_epi32(4*_mindelta_); do { _mm256_storeu_si256(_ov++, _sv), _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\
} while(0)
#elif defined(__SSE2__) || defined(__ARM_NEON) // -------------
// SIMD set value (memset)
// fill out[0..n-1] with the constant _v_, 4 lanes per store; like the AVX2 variant it rounds
// up to whole vectors and may overrun out+n (padded destination required)
#define BITZERO32(_out_, _n_, _v_) do {\
__m128i _sv_ = _mm_set1_epi32(_v_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
do _mm_storeu_si128(_ov++, _sv_); while(_ov < _ove); \
} while(0)
// BITFORZERO32 (SSE2/NEON): fill out[0..n-1] with out[i] = start + i*mindelta, 4 lanes per
// step (SIMD counterpart of the scalar BITFORSET_ fallback). May overrun up to one vector
// past out+n (destination must be padded).
// review: the per-iteration step was the constant 4 instead of 4*mindelta, which diverged
// from the scalar BITFORSET_ contract for any mindelta != 1. Fixed (and initial-lane
// arguments parenthesized for macro hygiene).
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) do {\
__m128i _sv = _mm_set1_epi32(_start_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_), _cv = _mm_set_epi32(3*(_mindelta_),2*(_mindelta_),1*(_mindelta_),0); \
_sv = _mm_add_epi32(_sv, _cv);\
_cv = _mm_set1_epi32(4*(_mindelta_));\
do { _mm_storeu_si128(_ov++, _sv); _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
} while(0)
// BITDIZERO32 (SSE2/NEON): 4-lane variant of the constant-delta sequence fill.
// NOTE(review): initial lanes step by +1 (mindelta..3+mindelta) but the per-vector increment
// is 4*mindelta - consistent only for mindelta == 1; confirm intended usage.
#define BITDIZERO32(_out_, _n_, _start_, _mindelta_) do { __m128i _sv = _mm_set_epi32(3+_mindelta_,2+_mindelta_,1+_mindelta_,_mindelta_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
_sv = _mm_add_epi32(_sv, _cv); _cv = _mm_set1_epi32(4*_mindelta_); do { _mm_storeu_si128(_ov++, _sv), _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
} while(0)
#else
// scalar fallbacks when no SIMD is available
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) BITFORSET_(_out_, _n_, _start_, _mindelta_)
#define BITZERO32( _out_, _n_, _start_) BITFORSET_(_out_, _n_, _start_, 0)
#endif
// remove a known linear trend: out[i] = in[i] - start - (i+1)*mindelta
#define DELTR( _in_, _n_, _start_, _mindelta_, _out_) { unsigned _v; for( _v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_); }
// DELTRB: same as DELTR, additionally ORs all outputs and leaves their bit width in _b_
#define DELTRB(_in_, _n_, _start_, _mindelta_, _b_, _out_) { unsigned _v; for(_b_=0,_v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_), _b_ |= _out_[_v]; _b_ = bsr32(_b_); }
//----------------------------------------- bitreverse scalar + SIMD -------------------------------------------
// Use clang's __builtin_bitreverse* only when the builtins actually exist; otherwise the
// portable scalar implementations below are used.
// review: the original #else branch ALSO defined BUILTIN_BITREVERSE, which made the
// __has_builtin probe pointless and broke clang versions lacking __builtin_bitreverse64.
// The macro is now defined only when the builtin is available.
#if __clang__ && defined __has_builtin
#if __has_builtin(__builtin_bitreverse64)
#define BUILTIN_BITREVERSE
#endif
#endif
#ifdef BUILTIN_BITREVERSE
// clang builtins: reverse the bit order of an 8/16/32/64-bit value
#define rbit8(x) __builtin_bitreverse8( x)
#define rbit16(x) __builtin_bitreverse16(x)
#define rbit32(x) __builtin_bitreverse32(x)
#define rbit64(x) __builtin_bitreverse64(x)
#else
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
// Cortex-M3+/SC300: a single RBIT instruction reverses all 32 bits of x.
// review: the original body computed rc but fell off the end of a non-void function without
// returning it (undefined behavior); added the missing 'return rc;'.
static ALWAYS_INLINE uint32_t _rbit_(uint32_t x) { uint32_t rc; __asm volatile ("rbit %0, %1" : "=r" (rc) : "r" (x) ); return rc; }
#endif
// Portable scalar bit-reversal fallbacks (used when clang's builtins are unavailable).
// rbit8: reverse the 8 bits of x
static ALWAYS_INLINE uint8_t rbit8(uint8_t x) {
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
// hardware RBIT reverses 32 bits, so the reversed byte lands in the top 8 bits
return _rbit_(x) >> 24;
#elif 0
x = (x & 0xaa) >> 1 | (x & 0x55) << 1;
x = (x & 0xcc) >> 2 | (x & 0x33) << 2;
return x << 4 | x >> 4;
#else
// multiply/mask/modulo byte-reversal trick (see "Bit Twiddling Hacks")
return (x * 0x0202020202ull & 0x010884422010ull) % 1023;
#endif
}
// rbit16: reverse the 16 bits of x (swap ladder: bits, bit pairs, nibbles, bytes)
static ALWAYS_INLINE uint16_t rbit16(uint16_t x) {
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
return _rbit_(x) >> 16;
#else
x = (x & 0xaaaa) >> 1 | (x & 0x5555) << 1;
x = (x & 0xcccc) >> 2 | (x & 0x3333) << 2;
x = (x & 0xf0f0) >> 4 | (x & 0x0f0f) << 4;
return x << 8 | x >> 8;
#endif
}
// rbit32: reverse the 32 bits of x
static ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
return _rbit_(x);
#else
x = ((x & 0xaaaaaaaa) >> 1 | (x & 0x55555555) << 1);
x = ((x & 0xcccccccc) >> 2 | (x & 0x33333333) << 2);
x = ((x & 0xf0f0f0f0) >> 4 | (x & 0x0f0f0f0f) << 4);
x = ((x & 0xff00ff00) >> 8 | (x & 0x00ff00ff) << 8);
return x << 16 | x >> 16;
#endif
}
// rbit64: reverse the 64 bits of x
static ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
// two 32-bit hardware reversals with the halves swapped
return (uint64_t)_rbit_(x) << 32 | _rbit_(x >> 32);
#else
x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
x = (x & 0xff00ff00ff00ff00) >> 8 | (x & 0x00ff00ff00ff00ff) << 8;
x = (x & 0xffff0000ffff0000) >> 16 | (x & 0x0000ffff0000ffff) << 16;
return x << 32 | x >> 32;
#endif
}
#endif
#if defined(__SSSE3__) || defined(__ARM_NEON)
// SIMD bit reversal per element: reverse the bits inside every byte (mm_rbit_epi8, defined
// elsewhere in the project) after reversing the byte order of each element (mm_rev_*)
static ALWAYS_INLINE __m128i mm_rbit_epi16(__m128i v) { return mm_rbit_epi8(mm_rev_epi16(v)); }
static ALWAYS_INLINE __m128i mm_rbit_epi32(__m128i v) { return mm_rbit_epi8(mm_rev_epi32(v)); }
static ALWAYS_INLINE __m128i mm_rbit_epi64(__m128i v) { return mm_rbit_epi8(mm_rev_epi64(v)); }
//static ALWAYS_INLINE __m128i mm_rbit_si128(__m128i v) { return mm_rbit_epi8(mm_rev_si128(v)); }
#endif
#ifdef __AVX2__
// reverse the bits of every byte: fv is a 16-entry reversed-nibble lookup table applied via
// pshufb to the low and high nibbles separately; the reversed low nibble becomes the new
// high nibble and vice versa
static ALWAYS_INLINE __m256i mm256_rbit_epi8(__m256i v) {
__m256i fv = _mm256_setr_epi8(0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15, 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15), cv0f_8 = _mm256_set1_epi8(0xf);
__m256i lv = _mm256_shuffle_epi8(fv,_mm256_and_si256( v, cv0f_8));
__m256i hv = _mm256_shuffle_epi8(fv,_mm256_and_si256(_mm256_srli_epi64(v, 4), cv0f_8));
return _mm256_or_si256(_mm256_slli_epi64(lv,4), hv);
}
// byte-order reversal per 16/32/64-bit element (and per 128-bit lane for *_si128)
static ALWAYS_INLINE __m256i mm256_rev_epi16(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14)); }
static ALWAYS_INLINE __m256i mm256_rev_epi32(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, 11,10, 9, 8,15,14,13,12, 3, 2, 1, 0, 7, 6, 5, 4, 11,10, 9, 8,15,14,13,12)); }
static ALWAYS_INLINE __m256i mm256_rev_epi64(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8)); }
static ALWAYS_INLINE __m256i mm256_rev_si128(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
// full bit reversal per 16/32/64-bit element
static ALWAYS_INLINE __m256i mm256_rbit_epi16(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi16(v)); }
static ALWAYS_INLINE __m256i mm256_rbit_epi32(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi32(v)); }
static ALWAYS_INLINE __m256i mm256_rbit_epi64(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi64(v)); }
static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi8(mm256_rev_si128(v)); }
#endif
// ------------------ bitio general macros ---------------------------
#ifdef __AVX2__
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
// bzhi_uNN(u,b): zero all bits of u from position b upward (hardware BZHI where available;
// valid for b < type width - the shift form below is UB at b == width)
#define bzhi_u32(_u_, _b_) _bzhi_u32(_u_, _b_)
#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
#else
#define bzhi_u64(_u_, _b_) _bzhi_u64(_u_, _b_)
#endif
#else
#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
#define bzhi_u32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
#endif
// branchy variants that additionally accept _b_ == full type width
#define BZHI64(_u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1)))
#define BZHI32(_u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1)))
// bit-stream state: _bw_ = 64-bit bit buffer, _br_ = number of bits currently held in it
#define bitdef( _bw_,_br_) uint64_t _bw_=0; unsigned _br_=0
#define bitini( _bw_,_br_) _bw_=_br_=0
//-- bitput ---------
// append the low _nb_ bits of _x_ to the buffer; the caller must renormalize (bitenorm)
// often enough that _br_ + _nb_ stays within the 64-bit window
#define bitput( _bw_,_br_,_nb_,_x_) (_bw_) += (uint64_t)(_x_) << (_br_), (_br_) += (_nb_)
// write the accumulated whole bytes to _op_ and keep only the remaining 0..7 bits
#define bitenorm( _bw_,_br_,_op_) ctou64(_op_) = _bw_; _op_ += ((_br_)>>3), (_bw_) >>=((_br_)&~7), (_br_) &= 7
// final flush: write the remaining bits rounded up to a whole byte and reset the state
#define bitflush( _bw_,_br_,_op_) ctou64(_op_) = _bw_, _op_ += ((_br_)+7)>>3, _bw_=_br_=0
//-- bitget ---------
#define bitbw( _bw_,_br_) ((_bw_)>>(_br_))
#define bitrmv( _bw_,_br_,_nb_) (_br_) += _nb_
// refill: advance the input pointer past fully consumed bytes and reload the 64-bit window
#define bitdnorm( _bw_,_br_,_ip_) _bw_ = ctou64((_ip_) += ((_br_)>>3)), (_br_) &= 7
#define bitalign( _bw_,_br_,_ip_) ((_ip_) += ((_br_)+7)>>3)
// PEEK/GET: BITPEEK32/64 accept _nb_ up to the full width (via BZHI*); bitpeek57/bitpeek31
// are the fast paths, valid for _nb_ <= 57 resp. <= 31 bits after a renormalize
#define BITPEEK32( _bw_,_br_,_nb_) BZHI32(bitbw(_bw_,_br_), _nb_)
#define BITGET32( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK32(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
#define BITPEEK64( _bw_,_br_,_nb_) BZHI64(bitbw(_bw_,_br_), _nb_)
#define BITGET64( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK64(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
#define bitpeek57( _bw_,_br_,_nb_) bzhi_u64(bitbw(_bw_,_br_), _nb_)
#define bitget57( _bw_,_br_,_nb_,_x_) _x_ = bitpeek57(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
#define bitpeek31( _bw_,_br_,_nb_) bzhi_u32(bitbw(_bw_,_br_), _nb_)
#define bitget31( _bw_,_br_,_nb_,_x_) _x_ = bitpeek31(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
//------------------ templates -----------------------------------
// width-dispatched put/get: values wider than 45 bits are written/read in two halves with a
// renormalize in between.
// NOTE(review): bitput64/bitget64 expand to a bare if/else, not a do{}while(0), so they must
// not be used as the body of an unbraced if/else - verify call sites before changing.
#define bitput8( _bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
#define bitput16(_bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
#define bitput32(_bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
#define bitput64(_bw_,_br_,_b_,_x_,_op_) if((_b_)>45) { bitput(_bw_,_br_,(_b_)-32, (_x_)>>32); bitenorm(_bw_,_br_,_op_); bitput(_bw_,_br_,32,(unsigned)(_x_)); } else bitput(_bw_,_br_,_b_,_x_)
#define bitget8( _bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_)
#define bitget16(_bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_)
#define bitget32(_bw_,_br_,_b_,_x_,_ip_) bitget57(_bw_,_br_,_b_,_x_)
#define bitget64(_bw_,_br_,_b_,_x_,_ip_) if((_b_)>45) { unsigned _v; bitget57(_bw_,_br_,(_b_)-32,_x_); bitdnorm(_bw_,_br_,_ip_); BITGET64(_bw_,_br_,32,_v); _x_ = _x_<<32|_v; } else bitget57(_bw_,_br_,_b_,_x_)
#endif
//---------- max. bit length + transform for sorted/unsorted arrays, delta,delta 1, delta > 1, zigzag, zigzag of delta, xor, FOR,----------------
#ifdef __cplusplus
extern "C" {
#endif
//------ ORed array, used to determine the maximum bit length of the elements in an unsorted integer array ---------------------
uint8_t bit8( uint8_t *in, unsigned n, uint8_t *px);
uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px);
uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px);
uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px);
//-------------- delta = 0: Sorted integer array w/ mindelta = 0 ----------------------------------------------
//-- ORed array, maximum bit length of the non decreasing integer array. out[i] = in[i] - in[i-1]
uint8_t bitd8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
//-- in-place reverse delta 0
void bitddec8( uint8_t *p, unsigned n, uint8_t start); // non decreasing (out[i] = in[i] - in[i-1])
void bitddec16( uint16_t *p, unsigned n, uint16_t start);
void bitddec32( uint32_t *p, unsigned n, uint32_t start);
void bitddec64( uint64_t *p, unsigned n, uint64_t start);
//-- vectorized fast delta4 one: out[0] = in[4]-in[0], out[1]=in[5]-in[1], out[2]=in[6]-in[2], out[3]=in[7]-in[3],...
uint16_t bits128v16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bits128v32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
//------------- delta = 1: Sorted integer array w/ mindelta = 1 ---------------------------------------------
//-- get delta maximum bit length of the strictly increasing integer array. out[i] = in[i] - in[i-1] - 1
uint8_t bitd18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
//-- in-place reverse delta one
void bitd1dec8( uint8_t *p, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
void bitd1dec16( uint16_t *p, unsigned n, uint16_t start);
void bitd1dec32( uint32_t *p, unsigned n, uint32_t start);
void bitd1dec64( uint64_t *p, unsigned n, uint64_t start);
//------------- delta > 1: Sorted integer array w/ mindelta > 1 ---------------------------------------------
//-- ORed array, for max. bit length get min. delta ()
uint8_t bitdi8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
uint16_t bitdi16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bitdi32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
uint64_t bitdi64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
//-- transform sorted integer array to delta array: out[i] = in[i] - in[i-1] - mindelta
uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
//-- in-place reverse delta
void bitdidec8( uint8_t *in, unsigned n, uint8_t start, uint8_t mindelta);
void bitdidec16(uint16_t *in, unsigned n, uint16_t start, uint16_t mindelta);
void bitdidec32(uint32_t *in, unsigned n, uint32_t start, uint32_t mindelta);
void bitdidec64(uint64_t *in, unsigned n, uint64_t start, uint64_t mindelta);
//------------- FOR : array bit length: ---------------------------------------------------------------------
//------ ORed array, for max. bit length of the non decreasing integer array. out[i] = in[i] - start
uint8_t bitf8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
uint16_t bitf16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bitf32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
uint64_t bitf64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
//------ ORed array, for max. bit length of the strictly increasing integer array out[i] = in[i] - 1 - start
uint8_t bitf18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
uint16_t bitf116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bitf132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
uint64_t bitf164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
//------ ORed array, for max. bit length for unsorted array
uint8_t bitfm8( uint8_t *in, unsigned n, uint8_t *px, uint8_t *pmin); // unsorted
uint16_t bitfm16(uint16_t *in, unsigned n, uint16_t *px, uint16_t *pmin);
uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t *px, uint32_t *pmin);
uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t *px, uint64_t *pmin);
//------------- Zigzag encoding for unsorted integer lists: out[i] = in[i] - in[i-1] ------------------------
//-- ORed array, to get maximum zigzag bit length integer array
uint8_t bitz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
uint16_t bitz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bitz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
uint64_t bitz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
//-- Zigzag transform
uint8_t bitzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
//-- in-place zigzag reverse transform
void bitzdec8( uint8_t *in, unsigned n, uint8_t start);
void bitzdec16( uint16_t *in, unsigned n, uint16_t start);
void bitzdec32( uint32_t *in, unsigned n, uint32_t start);
void bitzdec64( uint64_t *in, unsigned n, uint64_t start);
//------------- Zigzag of zigzag/delta : unsorted/sorted integer array ----------------------------------------------------
//-- ORed array, to get the maximum bit length of the zigzag-of-delta transformed array
uint8_t bitzz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
uint16_t bitzz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bitzz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
uint64_t bitzz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
uint8_t bitzzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
uint16_t bitzzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
//-- in-place reverse zigzag of delta (encoded w/ bitdiencNN and parameter mindelta = 1)
void bitzzdec8( uint8_t *in, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
void bitzzdec16( uint16_t *in, unsigned n, uint16_t start);
void bitzzdec32( uint32_t *in, unsigned n, uint32_t start);
void bitzzdec64( uint64_t *in, unsigned n, uint64_t start);
//------------- XOR encoding for unsorted integer lists: out[i] = in[i] ^ in[i-1] -------------
//-- ORed array, to get maximum xor bit length of the integer array
uint8_t bitx8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
uint16_t bitx16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
uint32_t bitx32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
uint64_t bitx64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
//-- XOR transform
uint8_t bitxenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start);
uint16_t bitxenc16( uint16_t *in, unsigned n, uint16_t *out, uint16_t start);
uint32_t bitxenc32( uint32_t *in, unsigned n, uint32_t *out, uint32_t start);
uint64_t bitxenc64( uint64_t *in, unsigned n, uint64_t *out, uint64_t start);
//-- XOR in-place reverse transform
void bitxdec8( uint8_t *p, unsigned n, uint8_t start);
void bitxdec16( uint16_t *p, unsigned n, uint16_t start);
void bitxdec32( uint32_t *p, unsigned n, uint32_t start);
void bitxdec64( uint64_t *p, unsigned n, uint64_t start);
//------- Lossy floating point transform: pad the trailing mantissa bits with zeros according to the error e (ex. e=0.00001)
#ifdef USE_FLOAT16
void fppad16(_Float16 *in, size_t n, _Float16 *out, float e);
#endif
void fppad32(float *in, size_t n, float *out, float e);
void fppad64(double *in, size_t n, double *out, double e);
#ifdef __cplusplus
}
#endif
//---- Floating point to Integer decomposition ---------------------------------
// seeeeeeee21098765432109876543210 (s:sign, e:exponent, 0-9:mantissa)
#ifdef BITUTIL_IN
#define MANTF32 23
#define MANTF64 52
#define BITFENC(_u_, _sgn_, _expo_, _mant_, _mantbits_, _one_) _sgn_ = _u_ >> (sizeof(_u_)*8-1); _expo_ = ((_u_ >> (_mantbits_)) & ( (_one_<<(sizeof(_u_)*8 - 1 - _mantbits_)) -1)); _mant_ = _u_ & ((_one_<<_mantbits_)-1);
#define BITFDEC( _sgn_, _expo_, _mant_, _u_, _mantbits_) _u_ = (_sgn_) << (sizeof(_u_)*8-1) | (_expo_) << _mantbits_ | (_mant_)
#endif

282
conf.h
View File

@ -1,282 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// conf.h - config & common
#ifndef CONF_H
#define CONF_H
//------------------------- Compiler ------------------------------------------
#if defined(__GNUC__)
#include <stdint.h>
#define ALIGNED(t,v,n) t v __attribute__ ((aligned (n)))
#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NOINLINE __attribute__((noinline))
#define _PACKED __attribute__ ((packed))
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#define popcnt32(_x_) __builtin_popcount(_x_)
#define popcnt64(_x_) __builtin_popcountll(_x_)
#if defined(__i386__) || defined(__x86_64__)
//x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
// x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
// __bsr32(x): index of the highest set bit via the bsr instruction; x must be nonzero
// (bsr's result is undefined for 0)
static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
// bsr32(x): bit width of x, 0 -> 0.
// NOTE(review): relies on bsr leaving the destination register (preset to -1) unchanged when
// x == 0 - documented on AMD, historically undocumented on Intel though universally
// implemented; confirm if targeting exotic x86 implementations.
static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
// bsr64(x): bit width of a 64-bit value, 0 -> 0; __bsr64 requires x != 0 (clzll(0) is UB)
static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; }
static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
// 32/64-bit rotates via the rol/ror instructions (count passes through cl; s == 0 is safe
// in hardware)
static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
#else
static inline int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); }
static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; }
static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
// Portable rotate fallbacks for non-x86 targets. Shift counts are masked so that s == 0
// (or any s >= type width) cannot produce a shift by the full width, which is undefined
// behavior in C; the x86 asm versions above are naturally safe for s == 0.
// review: rol64/ror64 were declared with 32-bit 'unsigned' parameter and return types,
// silently truncating 64-bit values; they now use uint64_t as their callers expect.
static inline unsigned rol32(unsigned x, int s) { return x << (s & 31) | x >> (-s & 31); }
static inline unsigned ror32(unsigned x, int s) { return x >> (s & 31) | x << (-s & 31); }
static inline uint64_t rol64(uint64_t x, int s) { return x << (s & 63) | x >> (-s & 63); }
static inline uint64_t ror64(uint64_t x, int s) { return x >> (s & 63) | x << (-s & 63); }
#endif
#define ctz64(_x_) __builtin_ctzll(_x_)
#define ctz32(_x_) __builtin_ctz(_x_) // 0:32 ctz32(1<<a) = a (a=1..31)
#define clz64(_x_) __builtin_clzll(_x_)
#define clz32(_x_) __builtin_clz(_x_) // 00000000 00000000 00000000 01000000 = 25
//#define bswap8(x) (x)
#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
#define bswap16(x) __builtin_bswap16(x)
#else
static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); }
#endif
#define bswap32(x) __builtin_bswap32(x)
#define bswap64(x) __builtin_bswap64(x)
#elif _MSC_VER //----------------------------------------------------
#include <windows.h>
#include <intrin.h>
#if _MSC_VER < 1600
#include "vs/stdint.h"
#define __builtin_prefetch(x,a)
#define inline __inline
#else
#include <stdint.h>
#define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA)
#endif
#define ALIGNED(t,v,n) __declspec(align(n)) t v
#define ALWAYS_INLINE __forceinline
#define NOINLINE __declspec(noinline)
#define THREADLOCAL __declspec(thread)
#define likely(x) (x)
#define unlikely(x) (x)
static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; }
static inline int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; }
static inline int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; }
static inline int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; }
#if !defined(_M_ARM64) && !defined(_M_X64)
static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
unsigned long x0 = (unsigned long)x, top, bottom; _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0);
*ret = x0 ? bottom : 32 + top; return x != 0;
}
static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
unsigned long x1 = (unsigned long)(x >> 32), top, bottom; _BitScanReverse(&top, x1); _BitScanReverse(&bottom, (unsigned long)x);
*ret = x1 ? top + 32 : bottom; return x != 0;
}
#endif
static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; }
static inline int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; }
static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; }
#define rol32(x,s) _lrotl(x, s)
#define ror32(x,s) _lrotr(x, s)
#define bswap16(x) _byteswap_ushort(x)
#define bswap32(x) _byteswap_ulong(x)
#define bswap64(x) _byteswap_uint64(x)
#define popcnt32(x) __popcnt(x)
// popcnt64(x): population count of a 64-bit value (MSVC). On 32-bit Windows it is emulated
// with two 32-bit popcounts.
// review: the 32-bit fallback expanded its argument unparenthesized (popcnt32(x>>32)), which
// mis-evaluates compound arguments such as a+b; arguments are now parenthesized and the
// halves explicitly truncated. Note x is still evaluated twice - avoid side effects in it.
#ifdef _WIN64
#define popcnt64(x) __popcnt64(x)
#else
#define popcnt64(x) (popcnt32((unsigned)(x)) + popcnt32((unsigned)((x) >> 32)))
#endif
#define sleep(x) Sleep(x/1000)
#define fseeko _fseeki64
#define ftello _ftelli64
#define strcasecmp _stricmp
#define strncasecmp _strnicmp
#define strtoull _strtoui64
static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
#endif
// 8/16-bit variants expressed via the 32-bit helpers (arguments are zero-extended);
// bsr/ctz/popcnt results are width-independent for values that fit, while clz must
// subtract the extra leading zeros contributed by the wider 32-bit register
// (clz8(0) == 32-24 == 8, clz16(0) == 16, as expected).
// NOTE(review): clz8/clz16 assume _x_ fits in 8/16 bits -- confirm at call sites.
#define __bsr8(_x_) __bsr32(_x_)
#define __bsr16(_x_) __bsr32(_x_)
#define bsr8(_x_) bsr32(_x_)
#define bsr16(_x_) bsr32(_x_)
#define ctz8(_x_) ctz32(_x_)
#define ctz16(_x_) ctz32(_x_)
#define clz8(_x_) (clz32(_x_)-24)
#define clz16(_x_) (clz32(_x_)-16)
#define popcnt8(x) popcnt32(x)
#define popcnt16(x) popcnt32(x)
//--------------- Unaligned memory access -------------------------------------
// ctouNN(p)/ctofNN(p) read an NN-bit integer/float from a possibly unaligned
// address; stouNN/stofNN are the corresponding stores (in the non-memcpy branches
// the ctou* macros are lvalues, so plain assignment through them stores).
// Strategy is selected at compile time:
//   1) UA_MEMCPY      : portable memcpy-based access (compilers fold to one load/store)
//   2) tolerant CPUs  : direct type-punned loads/stores
//   3) ARMv7          : packed-struct access
// NOTE(review): branches 2/3 technically violate strict aliasing; this is a
// deliberate per-platform choice here -- confirm builds use -fno-strict-aliasing.
// NOTE(review): _PACKED is expected to be defined elsewhere in conf.h (packed
// struct attribute) -- not visible in this chunk.
#ifdef UA_MEMCPY
#include <string.h>
static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; }
static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; }
static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; }
static inline size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; }
static inline float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; }
static inline double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; }
static inline void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); }
static inline void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); }
static inline void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); }
static inline void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); }
static inline void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); }
static inline void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); }
#elif defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\
defined(__powerpc__) || defined(__s390__) ||\
defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\
defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \
defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \
defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
#define ctou16(_cp_) (*(unsigned short *)(_cp_))
#define ctou32(_cp_) (*(unsigned *)(_cp_))
#define ctof32(_cp_) (*(float *)(_cp_))
// 64-bit access: direct on CPUs with full unaligned support, packed-struct on
// ARM variants where only 16/32-bit unaligned accesses are safe
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER)
#define ctou64(_cp_) (*(uint64_t *)(_cp_))
#define ctof64(_cp_) (*(double *)(_cp_))
#elif defined(__ARM_FEATURE_UNALIGNED)
struct _PACKED longu { uint64_t l; };
struct _PACKED doubleu { double d; };
#define ctou64(_cp_) ((struct longu *)(_cp_))->l
#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
#endif
#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__)
struct _PACKED shortu { unsigned short s; };
struct _PACKED unsignedu { unsigned u; };
struct _PACKED longu { uint64_t l; };
struct _PACKED floatu { float f; };
struct _PACKED doubleu { double d; };
#define ctou16(_cp_) ((struct shortu *)(_cp_))->s
#define ctou32(_cp_) ((struct unsignedu *)(_cp_))->u
#define ctou64(_cp_) ((struct longu *)(_cp_))->l
#define ctof32(_cp_) ((struct floatu *)(_cp_))->f
#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
#else
#error "unknown cpu"
#endif
// 24/48-bit reads: over-read the enclosing 32/64-bit word, then mask.
// NOTE(review): these may read 1-2 bytes past the logical field -- callers must
// guarantee the buffer extends that far.
#define ctou24(_cp_) (ctou32(_cp_) & 0xffffff)
#define ctou48(_cp_) (ctou64(_cp_) & 0xffffffffffffull)
// plain dereference; assumes _cp_ is a byte pointer
#define ctou8(_cp_) (*(_cp_))
//--------------------- wordsize ----------------------------------------------
// __WORDSIZE: 64 on known 64-bit targets (detected via compiler/ABI macros), else 32.
// NOTE(review): glibc's <bits/wordsize.h> also defines __WORDSIZE; this assumes
// the definitions agree or the glibc header is not included first -- confirm.
#if defined(__64BIT__) || defined(_LP64) || defined(__LP64__) || defined(_WIN64) ||\
defined(__x86_64__) || defined(_M_X64) ||\
defined(__ia64) || defined(_M_IA64) ||\
defined(__aarch64__) ||\
defined(__mips64) ||\
defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\
defined(__s390x__)
#define __WORDSIZE 64
#else
#define __WORDSIZE 32
#endif
#endif
//---------------------misc ---------------------------------------------------
// BZHI*(_u_,_b_): zero the bits of _u_ at and above position _b_ (like the BMI2
// bzhi instruction). The *F ("fast") forms require _b_ < width; the plain forms
// also accept _b_ == width and then return _u_ unchanged (1ull<<64 would be UB).
// Fixes vs. previous revision: _b_ is parenthesized in the width comparison so
// expressions like BZHI64(u, c?32:64) expand correctly, and the _b_ == width case
// returns _u_ itself (bzhi semantics) instead of the raw all-ones mask.
#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // _b_ < 64
#define BZHI32F(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) // _b_ < 32
#define BZHI64( _u_, _b_) ((_b_) == 64?(_u_):((_u_) & ((1ull<<(_b_))-1))) // Constant
#define BZHI32( _u_, _b_) ((_b_) == 32?(_u_):((_u_) & ((1u <<(_b_))-1)))
#define BZHI16( _u_, _b_) BZHI32(_u_, _b_)
#define BZHI8( _u_, _b_) BZHI32(_u_, _b_)
#ifdef __AVX2__
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
// 32-bit x86 has no 64-bit bzhi. NOTE(review): this fallback is UB for _b_ == 64.
#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
#else
#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
#endif
#else
#define bzhi_u64(_u_, _b_) BZHI64(_u_, _b_)
#define bzhi_u32(_u_, _b_) BZHI32(_u_, _b_)
// also provide the bzhi32/bzhi64 spellings defined by the __AVX2__ branch, so
// both names resolve on every target (backward-compatible addition)
#define bzhi32(_u_, _b_) BZHI32(_u_, _b_)
#define bzhi64(_u_, _b_) BZHI64(_u_, _b_)
#endif
// round _n_ up to the next multiple of _a_ (_a_ must be a power of two)
#define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1))
// align a pointer down to an _a_-byte boundary (_a_ must be a power of two)
#define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1)))
// token-pasting helpers (two-level expansion so macro arguments expand first)
#define TEMPLATE2_(_x_, _y_) _x_##_y_
#define TEMPLATE2(_x_, _y_) TEMPLATE2_(_x_,_y_)
#define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_
#define TEMPLATE3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_)
#define CACHE_LINE_SIZE 64
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
// clamp _x_ to [_low_, _high_]; arguments may be evaluated more than once
#define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_)))
//--- NDEBUG -------
#include <stdio.h>
// Diagnostic macros, printf-style:
//   AS(expr, fmt, ...): debug-only assertion; compiled out entirely under NDEBUG
//   AC(expr, fmt, ...): checked assertion, active in ALL builds; aborts on failure
//   die(fmt, ...)     : print message and exit(-1)
// Non-NDEBUG builds prefix messages with file:function:line.
// Two branches: MSVC uses __VA_ARGS__, gcc/clang use named variadic args (args...).
// NOTE(review): ##__VA_ARGS__ is a GNU extension; recent MSVC accepts it --
// confirm the minimum supported MSVC version. abort/exit assume <stdlib.h> is
// included elsewhere -- not visible in this chunk.
#ifdef _MSC_VER
#ifdef NDEBUG
#define AS(expr, fmt, ...)
#define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
#define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
#else
#define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
#define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
#define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
#endif
#else
#ifdef NDEBUG
#define AS(expr, fmt,args...)
#define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
#define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
#else
#define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
#define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
#define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
#endif
#endif

View File

@ -1,61 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// eliasfano.h - "Integer Compression" Elias Fano c/c++ header
#ifdef __cplusplus
extern "C" {
#endif
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
// Compress/decompress an integer array of n values to/from the buffer out/in.
// The return value points one past the last byte written (encode) or consumed (decode).
unsigned char *efanoenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *efanoenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *efanodec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *efanodec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
unsigned char *efano1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *efano1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *efano1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *efano1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
unsigned char *efanoenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *efanodec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *efano1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *efano1dec128v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *efanoenc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *efanodec256v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *efano1enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *efano1dec256v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
#ifdef __cplusplus
}
#endif

125
fp.h
View File

@ -1,125 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Floating Point + Integer Compression"
#ifdef __cplusplus
extern "C" {
#endif
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
// ---------- TurboPFor zigzag of delta (= delta of delta + zigzag encoding) ----------
size_t p4nzzenc128v8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t p4nzzdec128v8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t p4nzzenc128v16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t p4nzzdec128v16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t p4nzzenc128v32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t p4nzzdec128v32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t p4nzzenc128v64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t p4nzzdec128v64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
//----------- Zigzag (bit/io) -------------------------------------------------------
size_t bvzenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t bvzdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t bvzenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t bvzdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t bvzenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t bvzdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t bvzenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t bvzdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
//----------- Zigzag of delta (bit/io) ---------------------------------------------
size_t bvzzenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t bvzzdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t bvzzenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t bvzzdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t bvzzenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t bvzzdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t bvzzenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t bvzzdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
//----------- TurboGorilla : Improved gorilla style + RLE (bit/io) ------------------
size_t fpgenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t fpgdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t fpgenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t fpgdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t fpgenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t fpgdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t fpgenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t fpgdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
//----------- TurboFloat XOR : Last value predictor (TurboPFor) ---------------------
size_t fpxenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t fpxdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t fpxenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t fpxdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t fpxenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t fpxdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t fpxenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t fpxdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
//----------- TurboFloat FCM: Finite Context Method Predictor (TurboPFor) -----------
size_t fpfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t fpfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t fpfcmenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t fpfcmdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t fpfcmenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t fpfcmdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t fpfcmenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t fpfcmdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
//----------- TurboFloat DFCM: Differential Finite Context Method Predictor (TurboPFor)
size_t fpdfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t fpdfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t fpdfcmenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t fpdfcmdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t fpdfcmenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t fpdfcmdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t fpdfcmenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t fpdfcmdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
//----------- TurboFloat 2D DFCM: Differential Finite Context Method Predictor -----
size_t fp2dfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t fp2dfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t fp2dfcmenc16(uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t fp2dfcmdec16(unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t fp2dfcmenc32(uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t fp2dfcmdec32(unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t fp2dfcmenc64(uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t fp2dfcmdec64(unsigned char *in, size_t n, uint64_t *out, uint64_t start);
/*/-------------- delta (=zigzag). Same as p4zenc ------------------------------------
size_t fppenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
size_t fppdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
size_t fppenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
size_t fppdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
size_t fppenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
size_t fppdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
size_t fppenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
size_t fppdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);*/
#ifdef __cplusplus
}
#endif

View File

@ -1,355 +0,0 @@
/**
Copyright (C) powturbo 2013-2021
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// Intel SSE to ARM NEON optimized for maximum speed (and compatibility gcc/clang) with possible minor changes to the source code
#ifndef _SSE_NEON_H_
#define _SSE_NEON_H_
#include "conf.h"
#ifdef __ARM_NEON //------------------------------------------------------------------------------------------------------------------
#include <arm_neon.h>
#define __m128i uint32x4_t // int32x4_t can also be used
#define __m128 float32x4_t
//#define USE_MACROS
// split a 128-bit vector into a low/high pair of 64-bit halves
#define uint8x16_to_8x8x2(_u_) ((uint8x8x2_t) { vget_low_u8(_u_), vget_high_u8(_u_) })
#ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ----------------------------------------------------------
// SSE sets list elements high-to-low; the arrays below store them low-to-high for vld1q
#define _mm_set_epi8(u15,u14,u13,u12,\
                     u11,u10, u9, u8,\
                     u7,u6,u5,u4,\
                     u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (uint32x4_t)vld1q_u8( _u);})
#define _mm_set_epi16( u7,u6,u5,u4,\
                     u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (uint32x4_t)vld1q_u16(_u);})
//#define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; vld1q_u32(_u);})
//#define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (uint32x4_t)vld1q_u64(_u);})
#define _mm_set_epi32(u3, u2, u1, u0) vcombine_u32(vcreate_u32((uint64_t)u1 << 32 | u0), vcreate_u32((uint64_t)u3 << 32 | u2))
#define _mm_set_epi64x(u1, u0) (__m128i)vcombine_u64(vcreate_u64(u0), vcreate_u64(u1))
#else
static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8,
                                           uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4,
                                           uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) {
  uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (uint32x4_t)vld1q_u8( u); }
static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4,
                                            uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (uint32x4_t)vld1q_u16(u); }
static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return vld1q_u32(u); }
static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (uint32x4_t)vld1q_u64(u); }
#endif
// setr variants take elements in reversed (memory) order
#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16( u0,u1,u2,u3,u4,u5,u6,u7)
#define _mm_setr_epi32(u3,u2,u1,u0) _mm_set_epi32( u0,u1,u2,u3)
#define _mm_setr_epi64x(u1,u0) _mm_set_epi64x(u0,u1) // fixed: was _mm_set_epi64x(u0,u0), which duplicated u0
#define _mm_set1_epi8( _u8_ ) (__m128i)vdupq_n_u8( _u8_ )
#define _mm_set1_epi16( _u16_) (__m128i)vdupq_n_u16(_u16_)
#define _mm_set1_epi32( _u32_) vdupq_n_u32(_u32_)
#define _mm_set1_epi64x(_u64_) (__m128i)vdupq_n_u64(_u64_)
#define _mm_setzero_si128() vdupq_n_u32( 0 )
#define _mm_cvtss_f32(_u_) vgetq_lane_f32((float32x4_t)(_u_), 0)
#define _mm_setzero_ps() (__m128)vdupq_n_f32(0)
#define _mm_set1_ps(_f32_) (__m128)vdupq_n_f32(_f32_)
//---------------------------------------------- Arithmetic -----------------------------------------------------------------------
// Lane-wise add/sub/multiply-low: signed vs. unsigned NEON forms are interchangeable
// for these ops under two's complement, so the casts only satisfy the type system.
#define _mm_add_epi8( _u_,_v_) (__m128i)vaddq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
#define _mm_add_epi16( _u_,_v_) (__m128i)vaddq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define _mm_add_epi32( _u_,_v_) vaddq_u32( _u_, _v_ )
#define _mm_sub_epi8( _u_,_v_) (__m128i)vsubq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
#define _mm_sub_epi16( _u_,_v_) (__m128i)vsubq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define _mm_sub_epi32( _u_,_v_) (__m128i)vsubq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_))
#define _mm_subs_epu8( _u_,_v_) (__m128i)vqsubq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
#define _mm_mullo_epi16(_u_,_v_) (__m128i)vmulq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
#define _mm_mullo_epi32(_u_,_v_) (__m128i)vmulq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
#define mm_mullo_epu32(_u_,_v_) vmulq_u32(_u_,_v_)
// NOTE(review): vqdmulhq doubles and saturates, so this only matches SSE mulhi
// for operands where the doubling cannot overflow -- hence "only for small values"
#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)vqdmulhq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) //only for small values??
// exact SSE mulhi: widen to 32-bit products, then keep the high 16 bits of each
// product by de-interleaving (vuzpq) and taking the odd halves
static ALWAYS_INLINE __m128i _mm_mulhi_epi16(__m128i u, __m128i v) {
  int32x4_t lo = vmull_s16(vget_low_s16( (int16x8_t)(u)), vget_low_s16( (int16x8_t)(v)));
  int32x4_t hi = vmull_s16(vget_high_s16((int16x8_t)(u)), vget_high_s16((int16x8_t)(v)));
  uint16x8x2_t a = vuzpq_u16((uint16x8_t)(lo), (uint16x8_t)(hi));
  return (__m128i)(vreinterpretq_s32_u16(a.val[1]));
}
#define _mm_mul_epu32( _u_,_v_) (__m128i)vmull_u32(vget_low_u32(_u_),vget_low_u32(_v_))
#define _mm_adds_epu16( _u_,_v_) (__m128i)vqaddq_u16((uint16x8_t)(_u_),(uint16x8_t)(_v_))
// SSE pmaddwd: multiply s16 lanes to 32-bit products, then add adjacent pairs
static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) {
  int32x4_t mlo = vmull_s16(vget_low_s16( (int16x8_t)u), vget_low_s16( (int16x8_t)v)),
            mhi = vmull_s16(vget_high_s16((int16x8_t)u), vget_high_s16((int16x8_t)v));
  int32x2_t alo = vpadd_s32(vget_low_s32(mlo), vget_high_s32(mlo)),
            ahi = vpadd_s32(vget_low_s32(mhi), vget_high_s32(mhi));
  return (__m128i)vcombine_s32(alo, ahi);
}
//---------------------------------------------- Special math functions -----------------------------------------------------------
#define _mm_min_epu8( _u_,_v_) (__m128i)vminq_u8( (uint8x16_t)(_u_), (uint8x16_t)(_v_))
#define _mm_min_epu16( _u_,_v_) (__m128i)vminq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define _mm_min_epi16( _u_,_v_) (__m128i)vminq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
//---------------------------------------------- Logical --------------------------------------------------------------------------
// nonzero test via horizontal max of lanes.
// NOTE(review): vmaxvq_u32/vmaxv_u8 are AArch64-only horizontal reductions, and
// mm_testnz_epu8 takes a 64-bit (uint8x8_t) vector -- confirm usage is A64-only.
#define mm_testnz_epu32(_u_) vmaxvq_u32(_u_) //vaddvq_u32(_u_)
#define mm_testnz_epu8( _u_) vmaxv_u8(_u_)
#define _mm_or_si128( _u_,_v_) (__m128i)vorrq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
#define _mm_and_si128( _u_,_v_) (__m128i)vandq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
#define _mm_xor_si128( _u_,_v_) (__m128i)veorq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
//---------------------------------------------- Shift ----------------------------------------------------------------------------
// mm_s{l,r}{l,a}i_*: immediate-count shifts; the count MUST be a compile-time
// constant (vsh{l,r}q_n_* are immediate forms). Counts <1 pass through, counts
// >= lane width saturate to 0 (logical) as SSE does.
#define mm_slli_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)> 7?vdupq_n_u8( 0):vshlq_n_u8( (uint8x16_t)(_u_), (_c_)))) // parameter c MUST be a constant / vshlq_n_u8: __constrange(0-(N-1))
#define mm_slli_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u16(0):vshlq_n_u16((uint16x8_t)(_u_), (_c_))))
#define mm_slli_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>31?vdupq_n_u32(0):vshlq_n_u32((uint32x4_t)(_u_), (_c_))))
#define mm_slli_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>63?vdupq_n_u64(0):vshlq_n_u64((uint64x2_t)(_u_), (_c_))))
#define _mm_slli_si128( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u8( 0):vextq_u8(vdupq_n_u8(0), (uint8x16_t)(_u_), 16-(_c_) )) ) // vextq_u8: __constrange(0-15)
#define mm_srli_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)> 7?vdupq_n_u8( 0):vshrq_n_u8( (uint8x16_t)(_u_), (_c_)))) // vshrq_n: __constrange(1-N)
#define mm_srli_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u16(0):vshrq_n_u16((uint16x8_t)(_u_), (_c_))))
#define mm_srli_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>31?vdupq_n_u32(0):vshrq_n_u32((uint32x4_t)(_u_), (_c_))))
#define mm_srli_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>63?vdupq_n_u64(0):vshrq_n_u64((uint64x2_t)(_u_), (_c_)))) // fixed: was vshlq_n_u64 (shifted LEFT instead of right)
#define _mm_srli_si128( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u8(0):vextq_u8((uint8x16_t)(_u_), vdupq_n_u8(0), (_c_) )) ) // vextq_u8: __constrange(0-15)
#define mm_srai_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s8( (int8x16_t)(_u_), (_c_))) // c <= 8 (vshrq_n:1-N)
#define mm_srai_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s16((int16x8_t)(_u_), (_c_))) // c <= 16
#define mm_srai_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s32((int32x4_t)(_u_), (_c_))) // c <= 32
#define mm_srai_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s64((int64x2_t)(_u_), (_c_))) // c <= 64
// _mm_s{l,r}{l,a}i_*: variable-count shifts; NEON only has variable LEFT shift
// (vshlq), so right shifts splat the negated count
#define _mm_slli_epi8( _u_,_m_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vdupq_n_s8( (_m_))) // parameter c integer constant/variable
#define _mm_slli_epi16( _u_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vdupq_n_s16( (_m_)))
#define _mm_slli_epi32( _u_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vdupq_n_s32( (_m_)))
#define _mm_slli_epi64( _u_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vdupq_n_s64( (_m_)))
#define _mm_srli_epi8( _u_,_m_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vdupq_n_s8( -(_m_)))
#define _mm_srli_epi16( _u_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vdupq_n_s16(-(_m_)))
#define _mm_srli_epi32( _u_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vdupq_n_s32(-(_m_)))
#define _mm_srli_epi64( _u_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vdupq_n_s64(-(_m_)))
#define _mm_srai_epi8( _u_,_m_) (__m128i)vshlq_s8( (int8x16_t)(_u_), vdupq_n_s8( -(_m_)))
#define _mm_srai_epi16( _u_,_m_) (__m128i)vshlq_s16((int16x8_t)(_u_), vdupq_n_s16(-(_m_)))
#define _mm_srai_epi32( _u_,_m_) (__m128i)vshlq_s32((int32x4_t)(_u_), vdupq_n_s32(-(_m_)))
#define _mm_srai_epi64( _u_,_m_) (__m128i)vshlq_s64((int64x2_t)(_u_), vdupq_n_s64(-(_m_)))
#define _mm_sll_epi8( _u_,_v_) (__m128i)vshlq_s8( (int8x16_t)(_u_), (int8x16_t)(_v_)) //_v_:all lanes equal
#define _mm_sll_epi16( _u_,_v_) (__m128i)vshlq_s16( (int16x8_t)(_u_), (int16x8_t)(_v_))
#define _mm_sll_epi32( _u_,_v_) (__m128i)vshlq_s32( (int32x4_t)(_u_), (int32x4_t)(_v_))
#define _mm_sll_epi64( _u_,_v_) (__m128i)vshlq_s64( (int64x2_t)(_u_), (int64x2_t)(_v_))
// fixed: previous revision called vshrq_s8/s16/s32/s64, which do not exist in
// NEON (only immediate vshrq_n_* forms do) and failed to compile when expanded.
// Logical right shift = unsigned vshlq with negated per-lane counts (_v_: all
// lanes hold the same count, per the _mm_sll_* convention above). 64-bit uses
// vsubq_s64 for negation since vnegq_s64 is AArch64-only.
#define _mm_srl_epi8( _u_,_v_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vnegq_s8( (int8x16_t)(_v_)))
#define _mm_srl_epi16( _u_,_v_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vnegq_s16((int16x8_t)(_v_)))
#define _mm_srl_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vnegq_s32((int32x4_t)(_v_)))
#define _mm_srl_epi64( _u_,_v_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vsubq_s64(vdupq_n_s64(0), (int64x2_t)(_v_)))
#define _mm_sllv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_)) //variable shift
#define _mm_srlv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vnegq_s32((int32x4_t)(_v_)))
//---------------------------------------------- Compare --------- true/false->1/0 (all bits set) ---------------------------------
// NEON compares, like SSE, produce all-ones lanes for true and all-zero for false
#define _mm_cmpeq_epi8( _u_,_v_) (__m128i)vceqq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
#define _mm_cmpeq_epi16( _u_,_v_) (__m128i)vceqq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
#define _mm_cmpeq_epi32( _u_,_v_) (__m128i)vceqq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
#define _mm_cmpgt_epi8( _u_,_v_) (__m128i)vcgtq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
#define _mm_cmpgt_epi16( _u_,_v_) (__m128i)vcgtq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
#define _mm_cmpgt_epi32( _u_,_v_) (__m128i)vcgtq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
// unsigned compares (no direct SSE2 equivalent; extensions used by this codebase)
#define _mm_cmpgt_epu16( _u_,_v_) (__m128i)vcgtq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define mm_cmpgt_epu32( _u_,_v_) (__m128i)vcgtq_u32( _u_, _v_)
//---------------------------------------------- Load -----------------------------------------------------------------------------
// loadl: load 64 bits into the low half, zero the high half (SSE movq semantics)
#define _mm_loadl_epi64( _u64p_) (__m128i)vcombine_s32(vld1_s32((int32_t const *)(_u64p_)), vcreate_s32(0))
// load 64 bits from *_u64p_ into lane 0 of _u_, keeping the high half of _u_
#define mm_loadu_epi64p(_u64p_,_u_) (__m128i)vld1q_lane_u64((uint64_t *)(_u64p_), (uint64x2_t)(_u_), 0)
// NEON loads are alignment-tolerant, so aligned and unaligned variants coincide
#define _mm_loadu_si128( _ip_) vld1q_u32(_ip_)
#define _mm_load_si128( _ip_) vld1q_u32(_ip_)
#define _mm_load_ps( _ip_) (__m128)vld1q_f32((float32_t *)(_ip_))
#define _mm_loadu_ps( _ip_) (__m128)vld1q_f32((float32_t *)(_ip_))
#define _mm_load1_ps( _ip_) (__m128)vld1q_dup_f32((float32_t *)(_ip_)) // fixed: referenced undefined _p_ instead of the _ip_ parameter
#define _mm_loadl_pi(_u_,_ip_) (__m128)vcombine_f32((float32x2_t)vld1_f32((float32_t *)(_ip_)), (float32x2_t)vget_high_f32(_u_)) // fixed: referenced _ip instead of _ip_
#define _mm_loadh_pi(_u_,_ip_) (__m128)vcombine_f32((float32x2_t)vget_low_f32(_u_), (float32x2_t)vld1_f32((const float *)(_ip_)))
//---------------------------------------------- Store ----------------------------------------------------------------------------
// storel: store the low 64 bits only (SSE movq-to-memory semantics)
#define _mm_storel_epi64(_ip_,_u_) vst1q_lane_u64((uint64_t *)(_ip_), (uint64x2_t)(_u_), 0)
// fixed: vst1q_u32 takes uint32_t*, not __m128i* (incompatible-pointer warning/error)
#define _mm_storeu_si128(_ip_,_u_) vst1q_u32((uint32_t *)(_ip_), _u_)
#define _mm_store_ps( _ip_,_u_) vst1q_f32( (float32_t *)(_ip_), (float32x4_t)(_u_))
#define _mm_storeu_ps( _ip_,_u_) vst1q_f32( (float32_t *)(_ip_), (float32x4_t)(_u_))
// store lane 0 only (scalar float)
#define _mm_store_ss( _ip_,_u_) vst1q_lane_f32((float32_t *)(_ip_), (float32x4_t)(_u_), 0)
//---------------------------------------------- Convert --------------------------------------------------------------------------
#define mm_cvtsi64_si128p(_u64p_,_u_) mm_loadu_epi64p(_u64p_,_u_)
#define _mm_cvtsi64_si128(_u_) (__m128i)vdupq_n_u64(_u_) //vld1q_s64(_u_)
//---------------------------------------------- Reverse bits/bytes ---------------------------------------------------------------
#define mm_rbit_epi8(_v_) (__m128i)vrbitq_u8( (uint8x16_t)(_v_)) // reverse bits
#define mm_rev_epi16(_v_) vrev16q_u8((uint8x16_t)(_v_)) // reverse bytes
#define mm_rev_epi32(_v_) vrev32q_u8((uint8x16_t)(_v_))
#define mm_rev_epi64(_v_) vrev64q_u8((uint8x16_t)(_v_))
//--------------------------------------------- Insert/extract --------------------------------------------------------------------
// x-suffixed variants store the extracted lane into an lvalue argument instead of returning it.
#define mm_extract_epi32x(_u_,_u32_,_id_) vst1q_lane_u32((uint32_t *)&(_u32_), _u_, _id_)
#define _mm_extract_epi64x(_u_,_u64_,_id_) vst1q_lane_u64((uint64_t *)&(_u64_), (uint64x2_t)(_u_), _id_)
#define _mm_extract_epi8( _u_, _id_) vgetq_lane_u8( (uint8x16_t)(_u_), _id_)
#define _mm_extract_epi16(_u_, _id_) vgetq_lane_u16(_u_, _id_)
#define _mm_extract_epi32(_u_, _id_) vgetq_lane_u32(_u_, _id_)
#define mm_extract_epu32(_u_, _id_) vgetq_lane_u32(_u_, _id_)
#define _mm_cvtsi128_si32(_u_) vgetq_lane_u32((uint32x4_t)(_u_),0)
#define _mm_cvtsi128_si64(_u_) vgetq_lane_u64((uint64x2_t)(_u_),0)
// NOTE(review): despite the "p" name, _mm_insert_epu32p passes _u32p_ straight to vsetq_lane_u32,
// so it expects a value, not a pointer; mm_insert_epi32p below does take a pointer (vld1q_lane_u32).
#define _mm_insert_epu32p(_u_,_u32p_,_id_) vsetq_lane_u32(_u32p_, _u_, _id_)
#define mm_insert_epi32p(_u_,_u32p_,_id_) vld1q_lane_u32(_u32p_, (uint32x4_t)(_u_), _id_)
#define _mm_cvtsi32_si128(_x_) (__m128i)vsetq_lane_s32(_x_, vdupq_n_s32(0), 0)
#define _mm_blendv_epi8(_u_,_v_,_m_) vbslq_u32(_m_,_v_,_u_) // bit-select: _m_ ? _v_ : _u_ per bit
//---------------------------------------------- Miscellaneous --------------------------------------------------------------------
#define _mm_alignr_epi8(_u_,_v_,_m_) (__m128i)vextq_u8( (uint8x16_t)(_v_), (uint8x16_t)(_u_), _m_)
#define _mm_packs_epi16( _u_,_v_) (__m128i)vcombine_s8( vqmovn_s16((int16x8_t)(_u_)), vqmovn_s16((int16x8_t)(_v_)))
#define _mm_packs_epi32( _u_,_v_) (__m128i)vcombine_s16(vqmovn_s32((int32x4_t)(_u_)), vqmovn_s32((int32x4_t)(_v_)))
// NOTE(review): _mm_packs_epu16 passes uint16x8_t operands to vcombine_u8 (which takes uint8x8_t)
// with no narrowing step — looks like it cannot compile as written; verify whether it is ever used.
#define _mm_packs_epu16( _u_,_v_) (__m128i)vcombine_u8((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define _mm_packus_epi16( _u_,_v_) (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)(_u_)), vqmovun_s16((int16x8_t)(_v_)))
// Emulate SSE2 _mm_movemask_epi8: gather the sign bit of each of the 16 bytes into a 16-bit mask.
// vcltq_s8(v,0) yields all-ones per negative byte; AND with per-lane bit weights, then pairwise
// widening adds collapse each 8-byte half into one mask byte (read from lanes 0 and 8).
static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
const uint8x16_t __attribute__ ((aligned (16))) m = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7};
uint8x16_t mv = (uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcltq_s8((int8x16_t)v, vdupq_n_s8(0)), m))));
return vgetq_lane_u8(mv, 8) << 8 | vgetq_lane_u8(mv, 0);
}
//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff)
#ifdef __aarch64__
// aarch64: horizontal add (vaddv) of per-lane bit weights; precondition above means each
// weighted lane contributes its bit exactly once.
static ALWAYS_INLINE uint8_t mm_movemask_epi8s(uint8x8_t sv) { const uint8x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddv_u8( vand_u8( sv, m)); } // short only ARM
//static ALWAYS_INLINE uint16_t mm_movemask_epu16(uint32x4_t v) { const uint16x8_t m = { 1, 1<<2, 1<<4, 1<<6, 1<<8, 1<<10, 1<<12, 1<<14}; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { const uint16x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { const uint32x4_t m = { 1, 1<<1, 1<<2, 1<<3 }; return vaddvq_u32(vandq_u32((uint32x4_t)v, m)); }
static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { const uint64x2_t m = { 1, 1<<1 }; return vaddvq_u64(vandq_u64((uint64x2_t)v, m)); }
#else
// armv7 (no vaddv): fold the 4 weighted lanes together with vext/vorr and read lane 0.
static ALWAYS_INLINE uint32_t mm_movemask_epu32(uint32x4_t v) { const uint32x4_t mask = {1,2,4,8}, av = vandq_u32(v, mask), xv = vextq_u32(av, av, 2), ov = vorrq_u32(av, xv); return vgetq_lane_u32(vorrq_u32(ov, vextq_u32(ov, ov, 3)), 0); }
#endif
// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
// Build the SSE shuffle immediate: 2 bits per destination lane, lane 0 in the low bits.
#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_))
#define _mm_shuffle_epi8(_u_, _v_) (__m128i)vqtbl1q_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
// Broadcast lane _m_ to all four 32-bit lanes (SSE _mm_shuffle_epi32 with nnnn pattern).
#if defined(__aarch64__)
#define mm_shuffle_nnnn_epi32(_u_,_m_) (__m128i)vdupq_laneq_u32(_u_, _m_)
#else
#define mm_shuffle_nnnn_epi32(_u_,_m_) (__m128i)vdupq_n_u32(vgetq_lane_u32(_u_, _m_)) // fix: closing parenthesis was missing
#endif
#ifdef USE_MACROS
// fix: both macro variants declared _zv twice with incompatible types (uint32x4_t then
// uint32x2x2_t), which cannot compile; rewritten to mirror the working inline functions below.
// These emulate _mm_shuffle_epi32 with _MM_SHUFFLE(2,0,3,1) resp. _MM_SHUFFLE(3,1,2,0)
// (see the SSE wrappers later in this file).
#define mm_shuffle_2031_epi32(_u_) ({ uint32x4_t _av = (uint32x4_t)vrev64q_u32(_u_); uint32x2x2_t _zv = vtrn_u32(vget_low_u32(_av), vget_high_u32(_av)); vcombine_u32(_zv.val[0], _zv.val[1]);})
#define mm_shuffle_3120_epi32(_u_) ({ uint32x4_t _av = _u_; uint32x2x2_t _zv = vtrn_u32(vget_low_u32(_av), vget_high_u32(_av)); vcombine_u32(_zv.val[0], _zv.val[1]);})
#else
static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) { uint32x4_t a = (uint32x4_t)vrev64q_u32(v); uint32x2x2_t z = vtrn_u32(vget_low_u32(a), vget_high_u32(a)); return vcombine_u32(z.val[0], z.val[1]);}
static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) { uint32x2x2_t z = vtrn_u32(vget_low_u32(v), vget_high_u32(v)); return vcombine_u32(z.val[0], z.val[1]);}
#endif
#if defined(USE_MACROS) || defined(__clang__)
// Emulate SSE2 _mm_shuffle_epi32: each destination lane i takes the source lane
// selected by bits (2i+1..2i) of the immediate _m_ (built with _MM_SHUFFLE).
#define _mm_shuffle_epi32(_u_, _m_) ({ const uint32x4_t _av =_u_;\
uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));\
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);\
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);\
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); _v;\
})
// Variant built with _mm_set_epi32 (note: set takes lanes high-to-low).
#define _mm_shuffle_epi32s(_u_, _m_) _mm_set_epi32(vgetq_lane_u32(_u_, ((_m_) ) & 0x3),\
vgetq_lane_u32(_u_, ((_m_) >> 2) & 0x3),\
vgetq_lane_u32(_u_, ((_m_) >> 4) & 0x3),\
vgetq_lane_u32(_u_, ((_m_) >> 6) & 0x3))
#else
// Same as the macro version above, as an inline function (gcc: _m_ must still be a constant
// for vgetq_lane to fold).
static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _u_, const unsigned _m_) { const uint32x4_t _av =_u_;
uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3);
return _v;
}
static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _u_, const unsigned _m_) {
return _mm_set_epi32(vgetq_lane_u32(_u_, ((_m_) ) & 0x3),
vgetq_lane_u32(_u_, ((_m_) >> 2) & 0x3),
vgetq_lane_u32(_u_, ((_m_) >> 4) & 0x3),
vgetq_lane_u32(_u_, ((_m_) >> 6) & 0x3));
}
#endif
#ifdef USE_MACROS
// SSE unpacklo/unpackhi emulation: interleave (zip) the low resp. high halves of _u_ and _v_.
#define _mm_unpacklo_epi8( _u_,_v_) ({ uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
#define _mm_unpacklo_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
#define _mm_unpacklo_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
#define _mm_unpacklo_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_)))
#define _mm_unpackhi_epi8( _u_,_v_) ({ uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
#define _mm_unpackhi_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
#define _mm_unpackhi_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
#define _mm_unpackhi_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_)))
#else
// Inline-function versions of the macros above (identical semantics).
static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_))); }
static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]); }
static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]); }
static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]); }
static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))); }
#endif
#else //----------------- intel SSE2/SSSE3 ( wrapper functions compatible with intel/arm; permits to have one source code version for arm+intel) --------------
#define mm_movemask_epu32(_u_) _mm_movemask_ps(_mm_castsi128_ps(_u_))
#define mm_movemask_epu16(_u_) _mm_movemask_epi8(_u_)
#define mm_loadu_epi64p( _u64p_,_u_) _u_ = _mm_cvtsi64_si128(ctou64(_u64p_))
#define mm_extract_epu32( _u_, _id_) _mm_extract_epi32(_u_, _id_)
#define mm_extract_epi32x(_u_,_u32_, _id_) _u32_ = _mm_extract_epi32(_u_, _id_)
#define mm_extract_epi64x(_u_,_u64_, _id_) _u64_ = _mm_extract_epi64(_u_, _id_)
#define mm_insert_epi32p( _u_,_u32p_,_c_) _mm_insert_epi32( _u_,ctou32(_u32p_),_c_)
#define mm_mullo_epu32( _u_,_v_) _mm_mullo_epi32(_u_,_v_)
#define mm_cvtsi64_si128p(_u64p_,_u_) _u_ = _mm_cvtsi64_si128(ctou64(_u64p_))
// Unsigned 32-bit compares built from signed compares by flipping the sign bit of both operands.
#define mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000)) //__m128i cv80000000 = _mm_set1_epi32(0x80000000); must be declared
#define mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000))
#define _mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000)))
#define _mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000)))
#define mm_shuffle_nnnn_epi32(_u_, _n_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(_n_,_n_,_n_,_n_))
#define mm_shuffle_2031_epi32(_u_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(2,0,3,1))
#define mm_shuffle_3120_epi32(_u_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(3,1,2,0))
// SSE has no 8-bit shifts: emulate with a 32-bit shift plus a mask that clears the bits
// shifted in from neighbouring bytes.
#define _mm_slli_epi8(_u_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff << _m_), _mm_slli_epi32(_u_, _m_ ))
#define _mm_srli_epi8(_u_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff >> _m_), _mm_srli_epi32(_u_, _m_ ))
#define mm_slli_epi8( _u_,_c_) _mm_slli_epi8( _u_,_c_) // parameter c MUST be a constant for compatibilty with the arm functions above
#define mm_slli_epi16( _u_,_c_) _mm_slli_epi16(_u_,_c_)
#define mm_slli_epi32( _u_,_c_) _mm_slli_epi32(_u_,_c_)
#define mm_slli_epi64( _u_,_c_) _mm_slli_epi64(_u_,_c_)
#define mm_srli_epi8( _u_,_c_) _mm_srli_epi8( _u_,_c_)
#define mm_srli_epi16( _u_,_c_) _mm_srli_epi16(_u_,_c_)
#define mm_srli_epi32( _u_,_c_) _mm_srli_epi32(_u_,_c_)
#define mm_srli_epi64( _u_,_c_) _mm_srli_epi64(_u_,_c_)
#define mm_srai_epi8( _u_,_c_) _mm_srai_epi8( _u_,_c_)
#define mm_srai_epi16( _u_,_c_) _mm_srai_epi16(_u_,_c_)
#define mm_srai_epi32( _u_,_c_) _mm_srai_epi32(_u_,_c_)
#define mm_srai_epi64( _u_,_c_) _mm_srai_epi64(_u_,_c_)
#ifdef __SSSE3__
// Reverse the bits within each byte using two 16-entry nibble lookup tables:
// fv maps a nibble to its bit-reversed value; low and high nibbles are looked
// up separately, swapped, and recombined.
static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) { // reverse bits in bytes
__m128i fv = _mm_set_epi8(15, 7,11, 3,13, 5, 9, 1,14, 6,10, 2,12, 4, 8, 0), cv0f_8 = _mm_set1_epi8(0xf);
__m128i lv = _mm_shuffle_epi8(fv,_mm_and_si128( v, cv0f_8));
__m128i hv = _mm_shuffle_epi8(fv,_mm_and_si128( mm_srli_epi64(v, 4), cv0f_8));
return _mm_or_si128( mm_slli_epi64(lv,4), hv);
}
// Byte-swap every 16/32/64-bit element, or the whole 128-bit vector, via pshufb.
static ALWAYS_INLINE __m128i mm_rev_epi16(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // reverse vector bytes in uint??_t
static ALWAYS_INLINE __m128i mm_rev_epi32(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3)); }
static ALWAYS_INLINE __m128i mm_rev_epi64(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7)); }
static ALWAYS_INLINE __m128i mm_rev_si128(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15)); }
#endif
#endif
#endif

252
time_.h
View File

@ -1,252 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// time_.h : parameter free high precision time/benchmark functions
#include <time.h>
#include <float.h>
#ifdef _WIN32
#include <windows.h>
#ifndef sleep
#define sleep(n) Sleep((n) * 1000)
#endif
typedef unsigned __int64 uint64_t;
typedef unsigned __int64 tm_t;
#else
#include <stdint.h>
#include <unistd.h>
#define Sleep(ms) usleep((ms) * 1000)
typedef struct timespec tm_t;
#endif
#if defined (__i386__) || defined( __x86_64__ )
#ifdef _MSC_VER
#include <intrin.h> // __rdtsc
#else
#include <x86intrin.h>
#endif
#ifdef __corei7__
// Serialized cycle counter for timing: cpuid acts as a barrier before rdtsc so earlier
// instructions cannot leak into the timed region. Result: 64-bit TSC in _c_.
#define RDTSC_INI(_c_) do { unsigned _cl, _ch; \
__asm volatile ("cpuid\n\t" \
"rdtsc\n\t" \
"mov %%edx, %0\n" \
"mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \
"%rax", "%rbx", "%rcx", "%rdx"); \
_c_ = (uint64_t)_ch << 32 | _cl; \
} while(0)
// End-of-region read: rdtscp waits for prior instructions, trailing cpuid fences the read.
#define RDTSC(_c_) do { unsigned _cl, _ch; \
__asm volatile("rdtscp\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1\n" \
"cpuid\n\t": "=r" (_ch), "=r" (_cl):: "%rax",\
"%rbx", "%rcx", "%rdx");\
_c_ = (uint64_t)_ch << 32 | _cl;\
} while(0)
#else
// Fallback for CPUs without rdtscp: cpuid + rdtsc for both start and stop reads.
#define RDTSC(_c_) do { unsigned _cl, _ch;\
__asm volatile ("cpuid \n"\
"rdtsc"\
: "=a"(_cl), "=d"(_ch)\
: "a"(0)\
: "%ebx", "%ecx");\
_c_ = (uint64_t)_ch << 32 | _cl;\
} while(0)
#define RDTSC_INI(_c_) RDTSC(_c_)
#endif
#else
// Non-x86: cycle counting unavailable; macros expand to nothing (callers get an unset _c_).
#define RDTSC_INI(_c_)
#define RDTSC(_c_)
#endif
// Statement-expression wrappers returning the TSC value directly.
#define tmrdtscini() ({ uint64_t _c; __asm volatile("" ::: "memory"); RDTSC_INI(_c); _c; })
#define tmrdtsc() ({ uint64_t _c; RDTSC(_c); _c; })
#ifndef TM_F
#define TM_F 1.0 // TM_F=4 -> MI/s
#endif
#ifdef RDTSC_ON
// Cycle-accurate mode: time source is the TSC instead of the OS clock.
#define tminit() tmrdtscini()
#define tmtime() tmrdtsc()
#define TM_T CLOCKS_PER_SEC
// Ticks per element (RDTSC mode): total measured time t divided by element count l.
static double TMBS(unsigned l, double t) { return t/l; } // fix: removed unused locals dt/dl
#define TM_C 1000
#else
#define TM_C 1
// Wall-clock mode: l bytes in t seconds -> MB/s.
static double TMBS(unsigned l, double t) { return (l/t)/1000000.0; }
#ifdef _WIN32
static LARGE_INTEGER tps; // performance-counter frequency, filled in by tminit
// Current performance-counter reading (raw ticks).
static tm_t tmtime(void) {
LARGE_INTEGER tm;
tm_t t; // NOTE(review): unused local
QueryPerformanceCounter(&tm);
return tm.QuadPart;
}
// Query the frequency, then spin until the counter ticks so timing starts on a tick boundary.
static tm_t tminit() { tm_t t0,ts; QueryPerformanceFrequency(&tps); t0 = tmtime(); while((ts = tmtime())==t0) {}; return ts; }
static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start)/tps.QuadPart; } // elapsed seconds
static int tmiszero(tm_t t) { return !t; }
#else
#ifdef __APPLE__
#include <AvailabilityMacros.h>
#ifndef MAC_OS_X_VERSION_10_12
#define MAC_OS_X_VERSION_10_12 101200
#endif
// clock_gettime exists natively only from macOS 10.12 on; older systems fall back to
// an emulation built on gettimeofday (microsecond resolution only).
#define CIVETWEB_APPLE_HAVE_CLOCK_GETTIME (defined(__APPLE__) && defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12)
#if !(CIVETWEB_APPLE_HAVE_CLOCK_GETTIME)
#include <sys/time.h>
#define CLOCK_REALTIME 0
#define CLOCK_MONOTONIC 0
int clock_gettime(int /*clk_id*/, struct timespec* t) {
struct timeval now;
int rv = gettimeofday(&now, NULL);
if (rv) return rv;
t->tv_sec = now.tv_sec;
t->tv_nsec = now.tv_usec * 1000;
return 0;
}
#endif
#endif
// Monotonic timestamp as a struct timespec.
static tm_t tmtime() { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return tm; }
// Elapsed seconds between two timestamps.
static double tmdiff(tm_t start, tm_t stop) { return (stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1e9f; }
// Spin until the clock advances so measurements start on a tick boundary.
static tm_t tminit() { tm_t t0 = tmtime(),t; while(!tmdiff(t = tmtime(),t0)) {}; return t; }
static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
#endif
#endif
//---------------------------------------- bench ----------------------------------------------------------------------
// for each a function call is repeated until exceeding tm_tx seconds.
// A run duration is always tm_tx seconds
// The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision)
// sleep after each 8 runs to avoid cpu throttling.
// Sleep tm_slp seconds whenever more than tm_TX seconds have elapsed since the last sleep.
#define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0)
// benchmark loop
// TMBEG opens two nested loops: outer over runs (_tm_Rn), inner repeating the measured
// statement tm_rm times; TMEND(_len_) closes them and tracks the best time in tm_tm.
#define TMBEG(_tm_Reps_) { unsigned _tm_r,_tm_c = 0,_tm_R,_tm_Rx = _tm_Reps_,_tm_Rn = _tm_Reps_; double _tm_t;\
for(tm_rm = tm_rep, tm_tm = DBL_MAX, _tm_R = 0; _tm_R < _tm_Rn; _tm_R++) { tm_t _tm_t0 = tminit(); /*for each run*/\
for(_tm_r = 0;_tm_r < tm_rm;) { /*repeat tm_rm times */
#define TMEND(_len_) \
_tm_r++; if(tm_tm == DBL_MAX && (_tm_t = tmdiff(_tm_t0, tmtime())) > tm_tx) break;\
}\
/*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\
/*other runs: break the loop only after 'tm_rm' repeats */ \
_tm_t = tmdiff(_tm_t0, tmtime());\
/*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\
if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("[%d,%d] ", tm_rm, _tm_Rn);*/ } tm_tm = _tm_t; _tm_c++; }\
else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\
if(tm_verbose) { printf("%8.2f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\
if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\
}\
}
// Benchmark state/configuration used by TMBEG/TMEND/TMSLEEP.
static unsigned tm_rep = 1<<30, tm_Rep = 3, tm_Rep2 = 3, tm_rm, tm_RepMin = 1, tm_slp = 20, tm_verbose = 2;
static tm_t tm_0, tm_T; // timestamps driving the TMSLEEP throttling logic
static double tm_tm, tm_tx = 1, tm_TX = 60; // tm_tm: best run time; tm_tx: per-run budget (s); tm_TX: total budget (s)
// Set number of runs and verbosity; _tm_Rep==0 keeps the current run count.
static void tm_init(int _tm_Rep, int _tm_verbose) { tm_verbose = _tm_verbose; if(_tm_Rep) tm_Rep = _tm_Rep; }
// Benchmark _func_ over _len_ bytes, printing its name first and the best throughput after.
#define TMBENCH(_name_, _func_, _len_) do { if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\
TMBEG(tm_Rep) _func_; TMEND(_len_); \
double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_, dm*TM_C/dr) );\
} while(0)
// second TMBENCH. Example: use TMBENCH for encoding and TMBENCH2 for decoding
// (uses tm_Rep2 runs and prints the name after the numbers).
#define TMBENCH2(_name_, _func_, _len_) do { \
TMBEG(tm_Rep2) _func_; TMEND(_len_);\
double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_, dm*TM_C/dr) );\
if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\
} while(0)
// Check
// Like TMBENCH, but also verifies _func_'s return value against _res_ on every call and exits on mismatch.
#define TMBENCHT(_name_,_func_, _len_, _res_) do { \
TMBEG(tm_Rep) \
if(_func_ != _res_) { printf("ERROR: %lld != %lld", (long long)_func_, (long long)_res_ ); exit(0); };\
TMEND(_len_);\
if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_,(double)tm_tm*TM_C/(double)tm_rm) );\
if(tm_verbose) printf("%s ", _name_?_name_:#_func_ );\
} while(0)
//----------------------------------------------------------------------------------------------------------------------------------
// Size-unit suffixes: lower case = binary (powers of two), upper case = decimal (powers of ten).
#define Kb (1u<<10)
#define Mb (1u<<20)
#define Gb (1u<<30)
#define KB 1000
#define MB 1000000
#define GB 1000000000
// Parse a size argument like "64k", "100M", "30b".
// def: multiplier applied when no suffix is present; def==0 means a bare number
// (or the 'b' suffix) is treated as a bit-shift count (capped at 2^32-1).
static unsigned argtoi(char *s, unsigned def) {
char *p;
unsigned n = strtol(s, &p, 10),f = 1;
switch(*p) {
case 'K': f = KB; break;
case 'M': f = MB; break;
case 'G': f = GB; break;
case 'k': f = Kb; break;
case 'm': f = Mb; break;
case 'g': f = Gb; break;
case 'B': return n;
case 'b': def = 0; /* fallthrough: 'b' forces shift-count interpretation */
default: if(!def) return n>=32?0xffffffffu:(1u << n); f = def;
}
return n*f;
}
// 64-bit size argument; a bare number defaults to the MB multiplier.
static uint64_t argtol(char *s) {
char *p;
uint64_t n = strtol(s, &p, 10),f=1;
switch(*p) {
case 'K': f = KB; break;
case 'M': f = MB; break;
case 'G': f = GB; break;
case 'k': f = Kb; break;
case 'm': f = Mb; break;
case 'g': f = Gb; break;
case 'B': return n;
case 'b': return n >= 64 ? ~0ull : 1ull << n; // fix: was '1u << n' — 32-bit shift in a 64-bit function, UB for n >= 32
default: f = MB;
}
return n*f;
}
// Parse a duration argument into milliseconds.
// Suffixes: 'h' hours, 'm' minutes, 's' seconds, 'M' milliseconds; no suffix means seconds.
static uint64_t argtot(char *s) {
char *end;
uint64_t value = strtol(s, &end, 10);
uint64_t scale;
switch(*end) {
case 'h': scale = 3600000; break; /* hours        */
case 'm': scale = 60000;   break; /* minutes      */
case 's': scale = 1000;    break; /* seconds      */
case 'M': scale = 1;       break; /* milliseconds */
default:  scale = 1000;    break; /* bare number: seconds */
}
return value * scale;
}
// Complement-copy: out[i] = ~in[i] for n bytes (yields a buffer guaranteed to differ from the input).
static void memrcpy(unsigned char *out, unsigned char *in, unsigned n) { unsigned i; for(i = 0; i < n; i++) out[i] = (unsigned char)~in[i]; } // fix: loop index was signed int compared against unsigned n

View File

@ -1,113 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// transpose.h - Byte/Nibble transpose for further compressing with lz77 or other compressors
#ifdef __cplusplus
extern "C" {
#endif
// Syntax
// in : Input buffer
// n : Total number of bytes in input buffer
// out : output buffer
// esize : element size in bytes (ex. 2, 4, 8,... )
//---------- High level functions with dynamic cpu detection and JIT scalar/sse/avx2 switching
void tpenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // tranpose
void tpdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // reverse transpose
void tp2denc(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize); //2D transpose
void tp2ddec(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize);
void tp3denc(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); //3D transpose
void tp3ddec(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize);
void tp4denc(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); //4D transpose
void tp4ddec(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize);
// Nibble transpose SIMD (SSE2,AVX2, ARM Neon)
void tp4enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
void tp4dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
// bit transpose
//void tp1enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
//void tp1dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
//---------- Low level functions ------------------------------------
void tpenc2( unsigned char *in, unsigned n, unsigned char *out); // scalar
void tpenc3( unsigned char *in, unsigned n, unsigned char *out);
void tpenc4( unsigned char *in, unsigned n, unsigned char *out);
void tpenc8( unsigned char *in, unsigned n, unsigned char *out);
void tpenc16( unsigned char *in, unsigned n, unsigned char *out);
void tpdec2( unsigned char *in, unsigned n, unsigned char *out);
void tpdec3( unsigned char *in, unsigned n, unsigned char *out);
void tpdec4( unsigned char *in, unsigned n, unsigned char *out);
void tpdec8( unsigned char *in, unsigned n, unsigned char *out);
void tpdec16( unsigned char *in, unsigned n, unsigned char *out);
void tpenc128v2( unsigned char *in, unsigned n, unsigned char *out); // sse2
void tpdec128v2( unsigned char *in, unsigned n, unsigned char *out);
void tpenc128v4( unsigned char *in, unsigned n, unsigned char *out);
void tpdec128v4( unsigned char *in, unsigned n, unsigned char *out);
void tpenc128v8( unsigned char *in, unsigned n, unsigned char *out);
void tpdec128v8( unsigned char *in, unsigned n, unsigned char *out);
void tp4enc128v2( unsigned char *in, unsigned n, unsigned char *out);
void tp4dec128v2( unsigned char *in, unsigned n, unsigned char *out);
void tp4enc128v4( unsigned char *in, unsigned n, unsigned char *out);
void tp4dec128v4( unsigned char *in, unsigned n, unsigned char *out);
void tp4enc128v8( unsigned char *in, unsigned n, unsigned char *out);
void tp4dec128v8( unsigned char *in, unsigned n, unsigned char *out);
void tp1enc128v2( unsigned char *in, unsigned n, unsigned char *out);
void tp1dec128v2( unsigned char *in, unsigned n, unsigned char *out);
void tp1enc128v4( unsigned char *in, unsigned n, unsigned char *out);
void tp1dec128v4( unsigned char *in, unsigned n, unsigned char *out);
void tp1enc128v8( unsigned char *in, unsigned n, unsigned char *out);
void tp1dec128v8( unsigned char *in, unsigned n, unsigned char *out);
void tpenc256v2( unsigned char *in, unsigned n, unsigned char *out); // avx2
void tpdec256v2( unsigned char *in, unsigned n, unsigned char *out);
void tpenc256v4( unsigned char *in, unsigned n, unsigned char *out);
void tpdec256v4( unsigned char *in, unsigned n, unsigned char *out);
void tpenc256v8( unsigned char *in, unsigned n, unsigned char *out);
void tpdec256v8( unsigned char *in, unsigned n, unsigned char *out);
void tp4enc256v2( unsigned char *in, unsigned n, unsigned char *out);
void tp4dec256v2( unsigned char *in, unsigned n, unsigned char *out);
void tp4enc256v4( unsigned char *in, unsigned n, unsigned char *out);
void tp4dec256v4( unsigned char *in, unsigned n, unsigned char *out);
void tp4enc256v8( unsigned char *in, unsigned n, unsigned char *out);
void tp4dec256v8( unsigned char *in, unsigned n, unsigned char *out);
//------- CPU instruction set
// cpuiset = 0: return current simd set,
// cpuiset != 0: set simd set 0:scalar, 20:sse2, 52:avx2
unsigned cpuini(unsigned cpuiset);
// convert simd set to string "sse3", "sse3", "sse4.1" or "avx2"
// Ex.: printf("current cpu set=%s\n", cpustr(cpuini(0)) );
char *cpustr(unsigned cpuisa);
unsigned cpuisa(void);
#ifdef __cplusplus
}
#endif

72
trle.h
View File

@ -1,72 +0,0 @@
/**
Copyright (C) powturbo 2015-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
// RLE with specified escape char
unsigned _srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
unsigned _srled8( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint8_t e);
unsigned _srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
unsigned _srled16(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint16_t e);
unsigned _srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
unsigned _srled32(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint32_t e);
unsigned _srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
unsigned _srled64(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint64_t e);
// functions w/ overflow handling
unsigned srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
unsigned srled8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint8_t e);
unsigned srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
unsigned srled16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint16_t e);
unsigned srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
unsigned srled32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint32_t e);
unsigned srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
unsigned srled64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint64_t e);
// RLE w. automatic escape char determination
unsigned srlec( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
unsigned _srled( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
unsigned srled( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
// Turbo RLE
unsigned trlec( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
unsigned _trled( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
unsigned trled( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
#ifdef __cplusplus
}
#endif

401
vint.h
View File

@ -1,401 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Integer Compression" variable byte include header (scalar TurboVByte+ SIMD TurboByte)
#ifndef _VINT_H_
#define _VINT_H_
#ifdef __cplusplus
extern "C" {
#endif
#ifdef VINT_IN
#include "conf.h"
//----------------------------------- Variable byte: single value macros (low level) -----------------------------------------------
//------------- 32 bits -------------
// Table mapping the high nibble of the first compressed byte to the total
// encoded length in bytes (defined in the implementation file).
extern unsigned char _vtab32_[];
// Encoded length of a value from its FIRST byte only; table-driven form of
// the commented-out clz expression.
#define _vbxvlen32(_x_) _vtab32_[(unsigned char)(_x_)>>4] // (clz32((_x_) ^ 0xff) - 23) //
// Bytes needed to encode _x_ (1..5). The |1 keeps bsr32 away from input 0
// (bsr of 0 is presumably undefined -- confirm in conf.h) and makes 0 encode
// in one byte.
#define _vbxlen32(_x_) ((bsr32(_x_|1)+6)/7)
// Encode _x_ into _op_ with a unary length tag in the leading bits
// (0 / 10 / 110 / 1110 / 1111 select 1..5 bytes), advance _op_, then run
// _act_. Multi-byte forms are byte-swapped so the tag lands in byte 0.
#define _vbxput32(_op_, _x_, _act_) {\
if(likely((_x_) < (1<< 7))) { *_op_++ = _x_; _act_;}\
else if(likely((_x_) < (1<<14))) { ctou16(_op_) = bswap16((_x_) | 0x8000u); _op_ += 2; _act_;}\
else if(likely((_x_) < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0u; ctou16(_op_) = _x_; _op_ += 2; _act_;}\
else if(likely((_x_) < (1<<28))) { ctou32(_op_) = bswap32((_x_) | 0xe0000000u); _op_ += 4; _act_;}\
else { *_op_++ = (unsigned long long)(_x_) >> 32 | 0xf0u; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
}
// Decode counterpart of _vbxput32: inspect the tag bits of the first byte,
// rebuild the value into _x_, advance _ip_ past the consumed bytes, run _act_.
#define _vbxget32(_ip_, _x_, _act_) do { _x_ = (unsigned)(*_ip_++);\
if(!(_x_ & 0x80u)) { _act_;}\
else if(!(_x_ & 0x40u)) { _x_ = bswap16(ctou16(_ip_ - 1) & 0xff3fu); _ip_++; _act_;}\
else if(!(_x_ & 0x20u)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\
else if(!(_x_ & 0x10u)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0fu); _ip_ += 3; _act_;}\
else { _x_ = (unsigned long long)((_x_) & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\
} while(0)
//------------- 64 bits -----------
// Bytes needed by _vbxput64 to encode _x_ (1..9).
// Fixes vs. the previous version:
//  - OR with 1 so bsr64 is never evaluated on 0 (bsr/clz of 0 is undefined;
//    this mirrors the |1 guard in the 32-bit _vbxlen32 above),
//  - cap the result at 9: _vbxput64 stores any value >= 2^56 as the single
//    0xff escape byte plus 8 raw bytes, so the naive (bsr64+6)/7 formula
//    over-reported 10 for values >= 2^63.
#define _vbxlen64(_x_) ((_x_) >= (1ull<<56) ? 9 : (bsr64((_x_)|1)+6)/7)
// Decoded length from the first compressed byte (0xff escape -> 9 bytes).
#define _vbxvlen64(_x_) ((_x_)==0xff?9:clz32((_x_) ^ 0xff) - 23)
// Encode 64-bit _x_ into 1..9 bytes; the 1..5 byte forms match the 32-bit
// scheme, and 0xf8/0xfc/0xfe/0xff tags extend it to 6..9 bytes.
#define _vbxput64(_op_, _x_, _act_) {\
if(likely(_x_ < (1<< 7))) { *_op_++ = _x_; _act_;}\
else if(likely(_x_ < (1<<14))) { ctou16(_op_) = bswap16(_x_| 0x8000); _op_ += 2; _act_;}\
else if(likely(_x_ < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0; ctou16(_op_) = _x_; _op_ += 2; _act_;}\
else if(likely(_x_ < (1<<28))) { ctou32(_op_) = bswap32(_x_| 0xe0000000); _op_ += 4; _act_;}\
else if( _x_ < 1ull<<35) { *_op_++ = _x_ >> 32 | 0xf0; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<42) { ctou16(_op_) = bswap16(_x_ >> 32 | 0xf800); _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<49) { *_op_++ = _x_ >> 48 | 0xfc; ctou16(_op_) = _x_ >> 32; _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<56) { ctou64(_op_) = bswap64(_x_ | 0xfe00000000000000ull); _op_ += 8; _act_;}\
else { *_op_++ = 0xff; ctou64(_op_) = _x_; _op_ += 8; _act_;}\
}
// Decode counterpart of _vbxput64 (1..9 bytes); _x_ must be a 64-bit lvalue.
#define _vbxget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(!(_x_ & 0x80)) { _act_;}\
else if(!(_x_ & 0x40)) { _x_ = bswap16(ctou16(_ip_++-1) & 0xff3f); _act_;}\
else if(!(_x_ & 0x20)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\
else if(!(_x_ & 0x10)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0f); _ip_ += 3; _act_;}\
else if(!(_x_ & 0x08)) { _x_ = (_x_ & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\
else if(!(_x_ & 0x04)) { _x_ = (unsigned long long)(bswap16(ctou16(_ip_-1)) & 0x7ff) << 32 | ctou32(_ip_+1); _ip_ += 5; _act_;}\
else if(!(_x_ & 0x02)) { _x_ = (_x_ & 0x03)<<48 | (unsigned long long)ctou16(_ip_) << 32 | ctou32(_ip_+2); _ip_ += 6; _act_;}\
else if(!(_x_ & 0x01)) { _x_ = bswap64(ctou64(_ip_-1)) & 0x01ffffffffffffffull; _ip_ += 7; _act_;}\
else { _x_ = ctou64(_ip_); _ip_ += 8; _act_;}\
} while(0)
// Convenience wrappers: encode/decode one value and advance the pointer.
#define vbxput64(_op_, _x_) { unsigned long long _x = _x_; _vbxput64(_op_, _x, ;); }
#define vbxput32(_op_, _x_) { register unsigned _x = _x_; _vbxput32(_op_, _x, ;); }
#define vbxput16(_op_, _x_) vbxput32(_op_, _x_)
// 8-bit values are stored raw (always one byte).
#define vbxput8( _op_, _x_) (*_op_++ = _x_)
#define vbxget64(_ip_, _x_) _vbxget64(_ip_, _x_, ;)
#define vbxget32(_ip_, _x_) _vbxget32(_ip_, _x_, ;)
#define vbxget16(_ip_, _x_) vbxget32(_ip_,_x_)
#define vbxget8(_ip_, _x_) (_x_ = *_ip_++)
//---------------------------------------------------------------------------
// TurboVByte scheme: the first byte's value range selects the encoded form.
// With VB_SIZE=64 the derived thresholds are: VB_BA3=249, VB_BA2=241,
// VB_OFS1=177, VB_OFS2=16561, VB_OFS3=540849 (arithmetic from the macros).
#define VB_SIZE 64
#define VB_MAX 254
#define VB_B2 6
#define VB_B3 3
#define VB_BA3 (VB_MAX - (VB_SIZE/8 - 3))
#define VB_BA2 (VB_BA3 - (1<<VB_B3))
#define VB_OFS1 (VB_BA2 - (1<<VB_B2))
#define VB_OFS2 (VB_OFS1 + (1 << (8+VB_B2)))
#define VB_OFS3 (VB_OFS2 + (1 << (16+VB_B3)))
// Encoded length of _x_ in bytes (1..3 biased forms, else 1 tag + raw bytes).
#define _vblen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_OFS2?2:((_x_) < VB_OFS3)?3:(bsr32(_x_)+7)/8+1))
// Encoded length from the FIRST byte only.
#define _vbvlen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_BA2?2:((_x_) < VB_BA3)?3:(_x_-VB_BA3)))
// Encode _x_: values < VB_OFS1 as one literal byte; the two biased 2/3-byte
// forms next; otherwise a VB_BA3+(b-3) tag byte followed by b raw bytes.
#define _vbput32(_op_, _x_, _act_) {\
if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\
else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\
else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\
else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\
}
// Decode counterpart of _vbput32. In the raw branch the (1u<<8*_b<<24)-1
// mask deliberately relies on defined unsigned wraparound: for _b==1 the
// shifted value wraps to 0 and the mask becomes 0xffffffff (full 4 bytes).
#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(likely(_x_ < VB_OFS1)) { _act_ ;}\
else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \
else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\
else { unsigned _b = _x_-VB_BA3; _x_ = ctou32(_ip_) & ((1u << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\
} while(0)
// 64-bit variants: same thresholds, raw branch extends to up to 8 bytes.
#define _vblen64(_x_) _vblen32(_x_)
#define _vbvlen64(_x_) _vbvlen32(_x_)
#define _vbput64(_op_, _x_, _act_) {\
if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\
else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\
else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\
else { unsigned _b = (bsr64((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou64(_op_) = (_x_); _op_ += _b; _act_;}\
}
#define _vbget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(likely(_x_ < VB_OFS1)) { _act_ ;}\
else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \
else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\
else { unsigned _b = _x_-VB_BA3; _x_ = ctou64(_ip_) & ((1ull << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\
} while(0)
#ifdef _WIN32
//#define fgetc_unlocked(_f_) _fgetc_nolock(_f_)
// Portability shims: map the POSIX *_unlocked stdio calls onto plain
// fputc/fgetc where the unlocked variants are unavailable.
#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_)
#define fgetc_unlocked(_f_) fgetc(_f_)
#else
// Non-Windows currently also falls back to the locked variants; the faster
// glibc _IO_*_unlocked forms are left commented out.
#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_) //_IO_putc_unlocked(_c_,_f_)
#define fgetc_unlocked(_f_) fgetc(_f_) //_IO_getc_unlocked(_f_)
#endif
// Byte-oriented varint with an *inverted* continuation flag: the high bit
// (0x80) marks the LAST byte of a value (note: opposite of standard LEB128,
// where 0x80 marks continuation).
#define leb128put(_op_, _x_) { uint64_t _x = _x_; while(_x > 0x7f) { *_op_++ = _x & 0x7f; _x >>= 7; } *_op_++ = _x | 0x80; }
// Same encoding, emitted to a stdio stream (GNU statement expression).
#define vbfput32(_f_, _x_) ({ uint64_t _x = _x_; while(_x > 0x7f) { fputc_unlocked(_x & 0x7f, _f_); _x >>= 7; } fputc_unlocked(_x | 0x80, _f_); })
// Decode: accumulate 7-bit groups into _x_ until the terminator bit (0x80)
// is seen, then run _act_ and stop. _ip_ is advanced past the value.
#define _leb128get(_ip_, _x_, _act_) { unsigned _sft=0; for(_x_=0;;_sft += 7) { unsigned _c = *_ip_++; _x_ += (_c & 0x7f) << _sft; if(_c >= 0x80) { _act_; break; } } }
// BUG FIX: previously expanded to vbgetax(_ip_, _x_, ;) -- an identifier that
// is defined nowhere, so any use of leb128get failed to compile. The decoder
// matching leb128put is _leb128get (defined directly above).
#define leb128get(_ip_, _x_) _leb128get(_ip_, _x_, ;)
// Stream decode; yields the value, or EOF if the stream ends mid-value.
#define vbfget32(_f_ ) ({ unsigned _sft=0,_x=0; for(;;_sft += 7) { unsigned _c = fgetc_unlocked(_f_); if(_c != EOF) { _x += (_c & 0x7f) << _sft; if(_c & 0x80) break; } else { _x = EOF; break; } } _x; })
//------------- 16 bits -----------
// 16-bit forms simply reuse the 32-bit scheme.
#define _vblen16(_x_) _vblen32(_x_)
#define _vbvlen16(_x_) _vbvlen32(_x_)
#define _vbput16(_op_, _x_, _act_) _vbput32(_op_, _x_, _act_)
#define _vbget16(_ip_, _x_, _act_) _vbget32(_ip_, _x_, _act_)
// 8-bit values are always stored as a single raw byte.
#define _vblen8(_x_) 1
#define _vbvlen8(_x_) 1
#define _vbput8(_op_, _x_, _act_) { *_op_++ = _x_; _act_; }
#define _vbget8(_ip_, _x_, _act_) { _x_ = *_ip_++; _act_; }
//----------------------------------- Variable byte: single value functions -----------------------------------------------
// ---- Variable byte length after compression (bytes the value will occupy)
static inline unsigned vblen16(unsigned short x) { return _vblen16(x); }
static inline unsigned vblen32(unsigned x) { return _vblen32(x); }
static inline unsigned vblen64(uint64_t x) { return _vblen64(x); }
// ---- Length of compressed value. Input in is the first char of the compressed buffer start (Ex. vbvlen32(in[0]) )
static inline unsigned vbvlen16(unsigned x) { return _vbvlen32(x); }
static inline unsigned vbvlen32(unsigned x) { return _vbvlen32(x); }
static inline unsigned vbvlen64(unsigned x) { return _vbvlen64(x); }
//----- encode/decode 16/32/64 single value and advance output/input pointer
#define vbput64(_op_, _x_) { unsigned long long _x = _x_; _vbput64(_op_, _x, ;); }
#define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); }
#define vbput16(_op_, _x_) vbput32(_op_, _x_)
#define vbput8(_op_, _x_) (*_op_++ = _x_)
#define vbget64(_ip_, _x_) _vbget64(_ip_, _x_, ;)
#define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;)
#define vbget16(_ip_, _x_) vbget32(_ip_,_x_)
#define vbget8(_ip_, _x_) (_x_ = *_ip_++)
#endif // VINT_IN
//----------------------------- TurboVByte 'vb':Variable byte + SIMD TurboByte 'v8': array functions ----------------------------------------
// Encoding/Decoding: Return value = end of compressed output/input buffer out/in
//----------------------- Encoding/Decoding unsorted array with n integer values --------------------------
unsigned char *vbenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); //TurboVByte
unsigned char *vbenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *vbenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
//-- Decode
unsigned char *vbdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out);
unsigned char *vbdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out);
unsigned char *vbdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
//-- Get value stored at index idx (idx:0...n-1)
unsigned short vbgetx16( unsigned char *__restrict in, unsigned idx);
unsigned vbgetx32( unsigned char *__restrict in, unsigned idx);
uint64_t vbgetx64( unsigned char *__restrict in, unsigned idx);
//-- Search and return index of next value equal to key or n when no key value found
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vbgeteq32(&ip, n, idx, 4321))>=n) break; printf("found at %u ", idx); }
unsigned vbgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key);
unsigned vbgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key);
unsigned vbgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key);
//---------------------- Delta encoding/decoding sorted array ---------------------------------------------
//-- Increasing integer array. out[i] = out[i-1] + in[i]
unsigned char *vbdenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *vbdenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *vbdenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *vbddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *vbddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *vbddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
//-- Get value stored at index idx (idx:0...n-1)
unsigned short vbdgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
unsigned vbdgetx32( unsigned char *__restrict in, unsigned idx, unsigned start);
uint64_t vbdgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
//-- Search: presumably returns the index of the next value >= *key in the
//   sorted stream (per the "getgeq" naming) -- confirm in the implementation.
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vbdgetgeq32(&ip, n, idx, &key, start))>=n) break; printf("found at %u ", idx); }
unsigned vbdgetgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start);
unsigned vbdgetgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start);
unsigned vbdgetgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start);
//-- Strictly increasing (never remaining constant or decreasing) integer array. out[i] = out[i-1] + in[i] + 1
unsigned char *vbd1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *vbd1enc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *vbd1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *vbd1dec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
//-- Get value stored at index idx (idx:0...n-1)
unsigned short vbd1getx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
unsigned vbd1getx32( unsigned char *__restrict in, unsigned idx, unsigned start);
uint64_t vbd1getx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
//-- Search (see vbdgetgeq* note above)
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vbd1getgeq32(&ip, n, idx, &key, start))>=n) break; printf("found at %u ", idx); }
unsigned vbd1getgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start);
unsigned vbd1getgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start);
unsigned vbd1getgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start);
//---------------------- Zigzag encoding/decoding for unsorted integer lists.
unsigned char *vbzenc8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
unsigned char *vbzenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *vbzenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *vbzenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *vbzdec8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
unsigned char *vbzdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *vbzdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *vbzdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
//---------------------- XOR encoding/decoding for unsorted integer lists.
unsigned char *vbxenc8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
unsigned char *vbxenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *vbxenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *vbxenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *vbxdec8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
unsigned char *vbxdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *vbxdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *vbxdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
//---------------------- Delta of delta encoding/decoding for unsorted integer lists.
unsigned char *vbddenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *vbddenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *vbddenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *vbdddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *vbdddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
unsigned char *vbdddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
//-- Get value stored at index idx (idx:0...n-1) -- zigzag streams
unsigned short vbzgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
unsigned vbzgetx32( unsigned char *__restrict in, unsigned idx, unsigned start);
uint64_t vbzgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
//-- Search and return index of next value equal to key or n when no key value found
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vbzgeteq32(&ip, n, idx, 4321, start))>=n) break; printf("found at %u ", idx); }
/*unsigned vbzgeteq15( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start);
unsigned vbzgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start);
unsigned vbzgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key, unsigned start);
unsigned vbzgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key, unsigned start);*/
//-------------------------- TurboByte (SIMD Group varint) --------------------------------------------------------------
// Same Return-value convention as above: end of compressed output/input.
unsigned char *v8enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); //TurboByte
unsigned char *v8enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *v8dec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out);
unsigned char *v8dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out);
//------ delta --------- (sorted input; see vbdenc* above for the transform)
unsigned char *v8denc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *v8denc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *v8ddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *v8ddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
//------ delta 1 ------- (strictly increasing input)
unsigned char *v8d1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *v8d1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *v8d1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *v8d1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
//------- zigzag ------- (unsorted input)
unsigned char *v8zenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *v8zenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *v8zdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *v8zdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
//------- xor ---------- (unsorted input)
unsigned char *v8xenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
unsigned char *v8xenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
unsigned char *v8xdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
unsigned char *v8xdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
//-------------------------- TurboByte Hybrid (SIMD Group varint) + Bitpacking -------------------------------------------
// High-level API (n unlimited). Return value = compressed/consumed size in
// bytes, consistent with the size_t-returning functions in vp4.h/bitpack.h.
size_t v8nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nd1enc16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nd1enc32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nzenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nzenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
// NOTE(review): v8nxdec16/32 are declared below but no scalar v8nxenc16/32
// appears in this header -- the xor encoders may be declared elsewhere; verify.
size_t v8ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nd1dec16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8nd1dec32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nzdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nxdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8nxdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
//------------- SSE (128-bit vertical) variants -------------
size_t v8nenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8ndenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nd1enc128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nzenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nzenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8ndec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nddec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nxdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t v8nxdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
//------------- 256-bit (AVX2-class) variants, 32-bit only -------------
size_t v8nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nzenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8nxenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t v8ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t v8nxdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
#ifdef __cplusplus
}
#endif
#endif

355
vp4.h
View File

@ -1,355 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "TurboPFor: Integer Compression" PFor/PForDelta + Direct access
#ifndef VP4_H_
#define VP4_H_
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
//************************************************ High level API - n unlimited ****************************************************
// Compress integer array with n values to the buffer out.
// Return value = number of bytes written to compressed buffer out
size_t p4nenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); // SIMD (Vertical bitpacking)
size_t p4nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nenc128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nenc256w32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
// -- delta encoding (sorted input)
size_t p4ndenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4ndenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4ndenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
// -- delta with minimum 1 (strictly increasing input)
size_t p4nd1enc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nd1enc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nd1enc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nd1enc128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nd1enc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
// -- zigzag encoding (unsorted input)
size_t p4nzenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nzenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nzenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nzenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nzenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nzenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
size_t p4nzenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
// Decompress the compressed n values in input buffer in to the integer array out.
// Return value = number of bytes read from the compressed buffer in
size_t p4ndec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t p4ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t p4ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4ndec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
size_t p4ndec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t p4ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4ndec128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
size_t p4ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
// Delta minimum = 0
size_t p4nddec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t p4nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t p4nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nddec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t p4nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nddec256w32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nddec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
// Delta minimum = 1
size_t p4nd1dec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t p4nd1dec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t p4nd1dec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t p4nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nd1dec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
//Zigzag
size_t p4nzdec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
size_t p4nzdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t p4nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
size_t p4nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
size_t p4nzdec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
//************** Low level API - n limited to 128/256 ***************************************
#define P4D_MAX 256   // maximum block length n accepted by the single-block (low level) API
// -------------- TurboPFor: Encode ------------
//#include <assert.h>
// Low level API: Single block n limited
//compress integer array with n values to the buffer out. Return value = end of compressed buffer out
unsigned char *p4enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *p4enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *p4enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *p4enc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out); // SSE (Vertical bitpacking)
unsigned char *p4enc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *p4enc128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *p4enc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out); // AVX2
unsigned char *p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *p4enc256w32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
// p4encxNN: produce a layout readable with the direct-access helpers (p4getx/p4geqx below).
unsigned char *p4encx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out);// Direct access
unsigned char *p4encx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *p4encx32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
unsigned char *p4encx64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
// Delta encoders for increasing lists (decoded by p4ddecNN: out[i] = out[i-1] + in[i]).
// start: the value preceding in[0] that seeds the delta recurrence.
unsigned char *p4denc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
unsigned char *p4denc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
unsigned char *p4denc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4denc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
unsigned char *p4denc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4denc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4denc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *p4denc256w32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4dencx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); // Direct access
unsigned char *p4dencx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
unsigned char *p4dencx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
// Delta+1 encoders for strictly increasing lists (decoded by p4d1decNN: out[i] = out[i-1] + in[i] + 1).
unsigned char *p4d1enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
unsigned char *p4d1enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
unsigned char *p4d1enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4d1enc128v16(uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); // SIMD (Vertical bitpacking)
unsigned char *p4d1enc128v32(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4d1enc256v32(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4d1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
unsigned char *p4d1encx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); // Direct access
unsigned char *p4d1encx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
unsigned char *p4d1encx32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
// Zigzag encoders (decoded by p4zdecNN): for unsorted lists with small deltas
// relative to start.
unsigned char *p4zenc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
unsigned char *p4zenc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
unsigned char *p4zenc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4zenc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
unsigned char *p4zenc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4zenc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
unsigned char *p4zenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
// "s" variants - NOTE(review): semantics not visible in this header; presumably a
// delta variant for sorted lists - confirm against the implementation before relying on it.
unsigned char *p4senc16(uint16_t *in, unsigned n, unsigned char *out, uint16_t start);
unsigned char *p4senc32(uint32_t *in, unsigned n, unsigned char *out, uint32_t start);
unsigned char *p4senc64(uint64_t *in, unsigned n, unsigned char *out, uint64_t start);
unsigned char *p4sdec16(unsigned char *in, unsigned n, uint16_t *out, uint16_t start);
unsigned char *p4sdec32(unsigned char *in, unsigned n, uint32_t *out, uint32_t start);
unsigned char *p4sdec64(unsigned char *in, unsigned n, uint64_t *out, uint64_t start);
// High level "s" variants, n unlimited.
size_t p4nsenc16(uint16_t *in, size_t n, unsigned char *out);
size_t p4nsenc32(uint32_t *in, size_t n, unsigned char *out);
size_t p4nsenc64(uint64_t *in, size_t n, unsigned char *out);
size_t p4nsdec16(unsigned char *in, size_t n, uint16_t *out);
size_t p4nsdec32(unsigned char *in, size_t n, uint32_t *out);
size_t p4nsdec64(unsigned char *in, size_t n, uint64_t *out);
// same as p4enc, but with b and bx as parameters. Call after _p4bitsXX
// NOTE(review): these are `inline` declarations without definitions in this header;
// C99 inline semantics require an external definition in some translation unit -
// confirm the build provides one (behavior differs between GNU89 and C99 inline).
inline unsigned char *_p4enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4enc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
inline unsigned char *_p4enc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
inline unsigned char *_p4enc128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
inline unsigned char *_p4enc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
// calculate the best bit sizes b and bx, return b.
// b = base bit width for packed values, *pbx = bit width for exceptions.
unsigned _p4bits8( uint8_t *__restrict in, unsigned n, unsigned *pbx);
unsigned _p4bits16( uint16_t *__restrict in, unsigned n, unsigned *pbx);
unsigned _p4bits32( uint32_t *__restrict in, unsigned n, unsigned *pbx);
unsigned _p4bits64( uint64_t *__restrict in, unsigned n, unsigned *pbx);
unsigned _p4bitsx8( uint8_t *__restrict in, unsigned n, unsigned *pbx);
unsigned _p4bitsx16( uint16_t *__restrict in, unsigned n, unsigned *pbx);
unsigned _p4bitsx32( uint32_t *__restrict in, unsigned n, unsigned *pbx);
unsigned _p4bitsx64( uint64_t *__restrict in, unsigned n, unsigned *pbx);
// Write the 1..2 byte block header. _b_ = base bit width, _bx_ = exception bit
// width, _usize_ = integer width in bits:
//   _bx_ == 0           -> 1 byte:  _b_                 (plain bitpacking)
//   _bx_ <= _usize_     -> 2 bytes: 0x80|_b_, _bx_      (PFor with packed exceptions)
//   _bx_ == _usize_+1   -> 1 byte:  0x40|_b_            (escape variant)
//   _bx_ >  _usize_+1   -> 1 byte:  0xc0|_b_            (escape variant)
// FIX: parameters are now parenthesized; previously `0x80|_b_` mis-parsed when _b_
// was a conditional expression ((0x80|cond)?x:y). Arguments may still be evaluated
// more than once - pass only side-effect-free expressions (except _out_, which is
// deliberately advanced).
#define P4HVE(_out_, _b_, _bx_,_usize_) do { if(!(_bx_)) *(_out_)++ = (_b_);else if((_bx_) <= (_usize_)) *(_out_)++ = 0x80|(_b_), *(_out_)++ = (_bx_); else *(_out_)++= ((_bx_) == (_usize_)+1?0x40:0xc0)|(_b_); } while(0)
#define P4HVE8( _out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_, 8)
#define P4HVE16(_out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_,16)
#define P4HVE32(_out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_,32)
#define P4HVE64(_out_, _b_, _bx_) do { unsigned _c_ = (_b_)==64?64-1:(_b_); P4HVE(_out_, _c_, _bx_,64); } while(0)
//---------------------------- TurboPFor: Decode --------------------------------------------------------
// decompress a previously (with p4enc32) bit packed array. Return value = end of packed buffer in
//-- scalar. (see p4getx32 for direct access)
// b and bx specified (not stored within the compressed stream header)
inline unsigned char *_p4dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical BitPacking)
inline unsigned char *_p4dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4dec128v64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx);
inline unsigned char *_p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx);
// p4decNN: b and bx are read from the stream header.
unsigned char *p4dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out);
unsigned char *p4dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out);
unsigned char *p4dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
unsigned char *p4dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out); // SIMD (Vertical BitPacking)
unsigned char *p4dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
unsigned char *p4dec128v64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
unsigned char *p4dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
unsigned char *p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
//------ Delta decoding --------------------------- Return value = end of packed input buffer in ---------------------------
//-- Increasing integer lists. out[i] = out[i-1] + in[i]
// start: seeds the recurrence (acts as out[-1]).
// b and bx specified
unsigned char *_p4ddec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
unsigned char *_p4ddec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
unsigned char *_p4ddec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
unsigned char *_p4ddec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
unsigned char *_p4ddec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
unsigned char *_p4ddec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
unsigned char *_p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
// p4ddecNN: b and bx are read from the stream header.
unsigned char *p4ddec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
unsigned char *p4ddec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
unsigned char *p4ddec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4ddec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
unsigned char *p4ddec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4ddec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
//-- Strictly increasing (never remaining constant or decreasing) integer lists. out[i] = out[i-1] + in[i] + 1
// start: seeds the recurrence (acts as out[-1]).
// b and bx specified (see idxcr.c/idxqry.c for an example)
unsigned char *_p4d1dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
unsigned char *_p4d1dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
unsigned char *_p4d1dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
unsigned char *_p4d1dec128v16(unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); // SIMD (Vertical BitPacking)
unsigned char *_p4d1dec128v32(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
unsigned char *_p4d1dec256v32(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
unsigned char *_p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
// p4d1decNN: b and bx are read from the stream header.
unsigned char *p4d1dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
unsigned char *p4d1dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
unsigned char *p4d1dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4d1dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
unsigned char *p4d1dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4d1dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
// ZigZag decoding (counterparts of p4zencNN; b and bx from header for the p4zdecNN variants)
inline unsigned char *_p4zdec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
inline unsigned char *_p4zdec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
inline unsigned char *_p4zdec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
inline unsigned char *_p4zdec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
inline unsigned char *_p4zdec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
inline unsigned char *_p4zdec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
inline unsigned char *_p4zdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
unsigned char *p4zdec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
unsigned char *p4zdec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
unsigned char *p4zdec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4zdec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
unsigned char *p4zdec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4zdec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
unsigned char *p4zdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
//---------------- Direct Access functions to compressed TurboPFor array p4encx16/p4encx32 -------------------------------------------------------
#ifdef TURBOPFOR_DAC
#include "conf.h"
#define P4D_PAD8(_x_) ( (((_x_)+8-1)/8) )          // bits -> bytes, rounded up
#define P4D_B(_x_) ((_x_) & 0x7f)                  // base bit width from the 16-bit header
#define P4D_XB(_x_) (((_x_) & 0x80)?((_x_) >> 8):0)  // exception bit width (2nd byte), 0 if no exceptions
#define P4D_ININC(_in_, _x_) _in_ += 1+((_x_) >> 7)  // advance past the 1- or 2-byte header
// Read the 16-bit block header at in: store the exception bit width in *bx and
// return the base bit width.
static inline unsigned p4bits(unsigned char *__restrict in, int *bx) {
  unsigned hdr = ctou16(in);
  *bx = P4D_XB(hdr);
  return P4D_B(hdr);
}
// Direct-access block state, filled by p4ini() and consumed by p4getx/p4geqx.
struct p4 {
unsigned long long *xmap;   // exception bitmap, 1 bit per value (points into the block, or p4xmap when none)
unsigned char *ex;          // exception payload (bx-bit values)
unsigned isx,bx,cum[P4D_MAX/64+1];   // isx: exception flag; bx: exception bit width; cum[j]: #exceptions before 64-value group j
int oval,idx;               // sequential-scan state for p4geqx (last value / last index); -1 = fresh
};
// Shared all-zero bitmap used when a block carries no exceptions.
static unsigned long long p4xmap[P4D_MAX/64+1] = { 0 };
// prepare direct access usage
// Parse the 1..2 byte block header at *pin and initialize *p4 for p4getx/p4geqx:
//   *b         base bit width (low 7 bits of the first header byte)
//   p4->bx     exception bit width (second header byte, present when 0x80 is set)
//   p4->xmap   exception bitmap (1 bit per value); p4->ex = exception payload
//   p4->cum[j] number of exceptions before 64-value group j (rank table)
// On return *pin points past the header (and past bitmap + exceptions when present).
static inline void p4ini(struct p4 *p4, unsigned char **pin, unsigned n, unsigned *b) { unsigned char *in = *pin;
  unsigned p4i = ctou16(in);
  p4->isx = p4i&0x80;
  *b = P4D_B(p4i);
  p4->bx = P4D_XB(p4i);
  *pin = p4->ex = ++in;
  if(p4->isx) {
    unsigned num=0,j;                                 // num = running exception count
    ++in;                                             // skip the second header byte (bx)
    p4->xmap = (unsigned long long *)in;
    for(j=0; j < n/64; j++) { p4->cum[j] = num; num += popcnt64(ctou64(in+j*8)); }
    if(n & 0x3f) {                                    // partial tail group
      p4->cum[j] = num;                               // FIX: rank entry for the tail group was left uninitialized,
                                                      // but p4getx reads cum[idx>>6] for indices in that group
      num += popcnt64(ctou64(in+j*8) & ((1ull<<(n&0x3f))-1) );
    }
    p4->ex = in + (n+7)/8;                            // exception payload follows the n-bit bitmap
    *pin   = p4->ex + (((uint64_t)num*p4->bx+7)/8);   // skip num bx-bit exception values
  } else p4->xmap = p4xmap;                           // shared all-zero bitmap: no exceptions
  p4->oval = p4->idx = -1;                            // reset sequential state used by p4geqx
}
//---------- Get a single value with index "idx" from a "p4encx32" packed array
// Base b-bit part comes from in; if the bitmap marks idx as an exception, the
// high bx-bit part is read from the exception area at its rank and spliced in.
static ALWAYS_INLINE uint8_t p4getx8( struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) {
  unsigned grp = idx >> 6, pos = idx & 63;
  unsigned v   = bitgetx8(in, idx, b);
  if(p4->xmap[grp] & (1ull << pos)) {
    unsigned rank = p4->cum[grp] + popcnt64(p4->xmap[grp] & ~(~0ull << pos));
    v += bitgetx8(p4->ex, rank, p4->bx) << b;
  }
  return v;
}
// 16-bit variant of p4getx8: base bits from in, exception bits spliced from p4->ex.
static ALWAYS_INLINE uint16_t p4getx16(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) {
  unsigned grp = idx >> 6, pos = idx & 63;
  unsigned v   = bitgetx16(in, idx, b);
  if(p4->xmap[grp] & (1ull << pos)) {
    unsigned rank = p4->cum[grp] + popcnt64(p4->xmap[grp] & ~(~0ull << pos));
    v += bitgetx16(p4->ex, rank, p4->bx) << b;
  }
  return v;
}
// 32-bit variant of p4getx8: base bits from in, exception bits spliced from p4->ex.
static ALWAYS_INLINE uint32_t p4getx32(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) {
  unsigned grp = idx >> 6, pos = idx & 63;
  unsigned v   = bitgetx32(in, idx, b);
  if(p4->xmap[grp] & (1ull << pos)) {
    unsigned rank = p4->cum[grp] + popcnt64(p4->xmap[grp] & ~(~0ull << pos));
    v += bitgetx32(p4->ex, rank, p4->bx) << b;
  }
  return v;
}
// Get the next single value greater than or equal to val. Sequential scan over a
// delta+1 coded direct-access block; p4->idx / p4->oval carry the scan state across calls.
static ALWAYS_INLINE uint16_t p4geqx8( struct p4 *p4, unsigned char *in, unsigned b, uint8_t val) {
  for(;;) { p4->oval += p4getx8( p4, in, ++p4->idx, b)+1; if(p4->oval >= val) return p4->oval; }
}
static ALWAYS_INLINE uint16_t p4geqx16(struct p4 *p4, unsigned char *in, unsigned b, uint16_t val) {
  for(;;) { p4->oval += p4getx16(p4, in, ++p4->idx, b)+1; if(p4->oval >= val) return p4->oval; }
}
static ALWAYS_INLINE uint32_t p4geqx32(struct p4 *p4, unsigned char *in, unsigned b, uint32_t val) {
  for(;;) { p4->oval += p4getx32(p4, in, ++p4->idx, b)+1; if(p4->oval >= val) return p4->oval; }
}
/* DO NOT USE : like p4dec32 but using direct access. This is only a demo showing direct access usage. Use p4dec32 instead for decompressing entire blocks */
unsigned char *p4decx32( unsigned char *in, unsigned n, uint32_t *out); // unsorted
unsigned char *p4fdecx32( unsigned char *in, unsigned n, uint32_t *out, uint32_t start); // FOR increasing
unsigned char *p4f1decx32( unsigned char *in, unsigned n, uint32_t *out, uint32_t start); // FOR strictly increasing
#endif // TURBOPFOR_DAC
#ifdef __cplusplus
} // extern "C"
#endif
#endif // BITPACK_H_

View File

@ -1,47 +0,0 @@
/**
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- homepage : https://sites.google.com/site/powturbo/
- github : https://github.com/powturbo
- twitter : https://twitter.com/powturbo
- email : powturbo [_AT_] gmail [_DOT_] com
**/
// "Integer Compression" - variable-width simple coding ("SimpleV")
// SimpleV belongs to the "simple family" of integer codecs, like simple-9, simple-16
// and simple-8b. It compresses integers in groups into variable word sizes of 32, 40 and 64 bits, plus RLE (run-length encoding).
// SimpleV is faster than simple-16 and compresses better than either simple-16 or simple-8b.
#ifndef VSIMPLE_H_
#define VSIMPLE_H_

// These prototypes use size_t and uint64_t, so the header must pull in the
// corresponding standard headers itself (same MSVC<2010 fallback as bitpack.h).
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif
// vsencNN: compress array with n unsigned (NN bits in[n]) values to the buffer out. Return value = end of compressed output buffer out
unsigned char *vsenc8( unsigned char *__restrict in, size_t n, unsigned char *__restrict out);
unsigned char *vsenc16(unsigned short *__restrict in, size_t n, unsigned char *__restrict out);
unsigned char *vsenc32(unsigned *__restrict in, size_t n, unsigned char *__restrict out);
unsigned char *vsenc64(uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
// vsdecNN: decompress buffer into an array of n unsigned values. Return value = end of compressed input buffer in
unsigned char *vsdec8( unsigned char *__restrict in, size_t n, unsigned char *__restrict out);
unsigned char *vsdec16(unsigned char *__restrict in, size_t n, unsigned short *__restrict out);
unsigned char *vsdec32(unsigned char *__restrict in, size_t n, unsigned *__restrict out);
unsigned char *vsdec64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VSIMPLE_H_