.
This commit is contained in:
310
bitpack.h
310
bitpack.h
@ -1,310 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// bitpack.h - "Integer Compression" Binary Packing header file
|
||||
#ifndef BITPACK_H_
|
||||
#define BITPACK_H_
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//******************** Bit Packing High Level API - n unlimited ***************************************************
|
||||
size_t bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitndpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitnd1pack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnd1pack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitnzpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitnfpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnunpack128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitndunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitndunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitndunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitnd1unpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnd1unpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnd1unpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnd1unpack128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnd1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnd1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t bitnzunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitnzunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnzunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnzunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t bitnfunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t bitnfunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnfunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
//******** Bit Packing Low level API ****************************************************************
|
||||
// bipackNN: Pack array with n unsigned (NN bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
|
||||
unsigned char *bitpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b);
|
||||
|
||||
// delta bit packing
|
||||
unsigned char *bitdpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitdpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitdpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitdpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
unsigned char *bitd1pack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitd1pack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitd1pack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitd1pack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// FOR bit packing : sorted integer array
|
||||
unsigned char *bitfpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitfpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitfpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitfpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
unsigned char *bitf1pack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitf1pack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitf1pack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitf1pack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// zigzag : unsorted integer array
|
||||
unsigned char *bitzpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitzpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitzpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitzpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
//-------------------------------------- SIMD ------------------------------------------------------------------------------------------
|
||||
// Pack array of n unsigned (16/32/64 bits in[n]) values to the buffer out using nbits per value (128/256-bit SIMD lanes). Return value = end of compressed buffer out
|
||||
unsigned char *bitpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitdpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitfpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitzpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b);
|
||||
|
||||
unsigned char *bitpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitdpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitzpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
//unsigned char *bitpack256w32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitpack128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
|
||||
unsigned char *bitpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b);
|
||||
unsigned char *bitdpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitzpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
//********************************** Bit Packing : Unpack ****************************************************************
|
||||
|
||||
// ---------------- Unpack a b-bits packed integer array -------------------------------------------------------------------------------
|
||||
// unpack a bitpacked integer array. Return value = end of packed buffer in
|
||||
unsigned char *bitunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b);
|
||||
|
||||
// ---------------- Direct Access to a single packed integer array entry --------------------------------------------------------------
|
||||
#ifdef TURBOPFOR_DAC
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
|
||||
#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
|
||||
#else
|
||||
#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||
#define bzhi32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
|
||||
#endif
|
||||
|
||||
#include "conf.h"
|
||||
|
||||
// Direct access: fetch the b-bit value at position idx from a 32-bit bitpacked buffer.
// bidx is the absolute bit offset; a 64-bit window is read at the containing 32-bit word,
// shifted down to the in-word bit offset, then masked to b bits via bzhi64 (b <= 32).
// Assumes the buffer is padded so the unaligned 64-bit read is safe — TODO confirm callers.
// NOTE(review): bidx = b*idx is computed in 32 bits and overflows for very large b*idx.
static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
|
||||
//static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx;
|
||||
//return (ctou64((uint32_t *)in+(bidx>>5)) << 32+(bidx&0x1f)) >> (64-b);
|
||||
// return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
|
||||
// Same as bitgetx32 but takes the precomputed bit offset bidx directly as 64 bits
// (avoids the 32-bit b*idx multiply and its overflow limit).
static ALWAYS_INLINE unsigned _bitgetx32(const unsigned char *__restrict in, uint64_t bidx, unsigned b) { return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); }
|
||||
|
||||
// like bitgetx32 but for 8 and 16 bits integer arrays
|
||||
// Direct access for an 8-bit packed array: read a 16-bit window at the containing
// 16-bit word, shift to the in-word bit offset, mask to b bits.
// NOTE(review): when (bidx & 0xf) + b > 16 the value spans past the 16-bit window
// returned by ctou16 (e.g. b=7, bidx=15) — verify such combinations cannot occur,
// or the read window needs widening.
static ALWAYS_INLINE unsigned bitgetx8( const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou16((uint16_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
|
||||
// bitgetx8 variant taking the precomputed bit offset bidx directly
// (same 16-bit read-window caveat as bitgetx8 — see NOTE there).
static ALWAYS_INLINE unsigned _bitgetx8( const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou16((uint16_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
|
||||
static ALWAYS_INLINE unsigned bitgetx16(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou32((uint32_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
|
||||
static ALWAYS_INLINE unsigned _bitgetx16(const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou32((uint32_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); }
|
||||
|
||||
// Set a single value with index "idx"
|
||||
// Direct-access write: overwrite the b-bit slot at index idx of a 16-bit packed
// buffer with v. Caller must guarantee v < 2^b (v is OR-ed in unmasked).
// NOTE(review): the (unsigned *) cast strides 4 bytes per 16-bit word index and a
// full 32-bit word is read-modified-written — verify this matches the packer layout.
// NOTE(review): writes through a pointer declared const (const is cast away) and the
// store may be unaligned — UB on strict-alignment targets without ctou-style access.
static ALWAYS_INLINE void bitsetx16(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) { unsigned bidx = b*idx; unsigned *p = (unsigned *) in+(bidx>>4) ; *p = ( *p & ~(((1u <<b)-1) << (bidx& 0xf)) ) | v<<(bidx& 0xf);}
|
||||
// Direct-access write for a 32-bit packed buffer: read-modify-write the 64-bit word
// containing the b-bit slot at index idx — clear the slot, then OR in v.
// Caller must guarantee v < 2^b and b <= 32; the buffer needs tail padding for the
// 64-bit window.
// NOTE(review): writes through a pointer declared const (const is cast away) and the
// unaligned 64-bit store is UB on strict-alignment targets without ctou-style access.
static ALWAYS_INLINE void bitsetx32(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) { unsigned bidx = b*idx; unsigned long long *p = (unsigned long long *)((unsigned *)in+(bidx>>5)); *p = ( *p & ~(((1ull<<b)-1) << (bidx&0x1f)) ) | (unsigned long long)v<<(bidx&0x1f);}
|
||||
#endif
|
||||
// ---------------- DFOR : integrated bitpacking, for delta packed SORTED array (Ex. DocId in inverted index) -------------------------------
|
||||
// start <= out[0] <= out[1] <= ... <= out[n-2] <= out[n-1] <= (1<<N)-1 N=8,16,32 or 64
|
||||
// out[0] = start + in[0]; out[1] = out[0] + in[1]; ... ; out[i] = out[i-1] + in[i]
|
||||
unsigned char *bitdunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitdunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitdunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitdunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// start < out[0] < out[1] < ... < out[n-2] < out[n-1] < (1<<N)-1, N=8,16,32 or 64
|
||||
// out[0] = start + in[0] + 1; out[1] = out[0] + in[1] + 1; ... ; out[i] = out[i-1] + in[i] + 1
|
||||
unsigned char *bitd1unpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitd1unpack16(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitd1unpack32(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitd1unpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// ---------------- ZigZag : integrated bitpacking, for zigzag packed unsorted
|
||||
unsigned char *bitzunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitzunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitzunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitzunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// ---------------- For : Direct Access for packed SORTED array --------------------------------------------
|
||||
// out[i] = start + in[i] + i
|
||||
unsigned char *bitfunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitfunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitfunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitfunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// out[i] = start + in[i] + i + 1
|
||||
unsigned char *bitf1unpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b);
|
||||
unsigned char *bitf1unpack16(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b);
|
||||
unsigned char *bitf1unpack32(const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b);
|
||||
unsigned char *bitf1unpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b);
|
||||
|
||||
// ---------------- SIMD : unpack a SIMD bit packed integer array -------------------------------------------------------------------------------
|
||||
// SIMD unpack a 128/256 bitpacked integer array. Return value = end of packed buffer in
|
||||
unsigned char *bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitd1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
unsigned char *bitf1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b);
|
||||
|
||||
unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
unsigned char *bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b);
|
||||
|
||||
unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitf1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
unsigned char *bitunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b);
|
||||
unsigned char *bitzunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitdunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
unsigned char *bitd1unpack128h32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b);
|
||||
|
||||
// internal TurboPFor functions: masked unpack
|
||||
unsigned char *_bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack128v16(const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb);
|
||||
|
||||
unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack128v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
|
||||
unsigned char *_bitunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack128h32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack128h32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
|
||||
//unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned char *bb);
|
||||
|
||||
unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
547
bitutil.h
547
bitutil.h
@ -1,547 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression: max.bits, delta, zigzag, xor"
|
||||
|
||||
#ifdef BITUTIL_IN
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
#elif defined(__AVX__)
|
||||
#include <immintrin.h>
|
||||
#elif defined(__SSE4_1__)
|
||||
#include <smmintrin.h>
|
||||
#elif defined(__SSSE3__)
|
||||
#ifdef __powerpc64__
|
||||
#define __SSE__ 1
|
||||
#define __SSE2__ 1
|
||||
#define __SSE3__ 1
|
||||
#define NO_WARN_X86_INTRINSICS 1
|
||||
#endif
|
||||
#include <tmmintrin.h>
|
||||
#elif defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#elif defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#include "sse_neon.h"
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
#define PREFETCH(_ip_,_rw_)
|
||||
#else
|
||||
#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
|
||||
#endif
|
||||
//------------------------ zigzag encoding -------------------------------------------------------------
|
||||
// Zigzag-encode an 8-bit signed value: maps 0,-1,1,-2,... to 0,1,2,3,... so small
// magnitudes become small unsigned codes (good for bit packing).
// Fix: compute on the unsigned type — left-shifting a negative signed value is
// undefined behavior in C (C11 6.5.7). Result is bit-identical to the old form.
static inline unsigned char zigzagenc8( signed char x) { unsigned char u = (unsigned char)x; return (unsigned char)((u << 1) ^ (x < 0 ? 0xFFu : 0u)); }
|
||||
// Zigzag-decode an 8-bit code back to signed: 0,1,2,3,... -> 0,-1,1,-2,...
// The low bit selects the sign; XOR with an all-ones mask when it is set.
static inline char zigzagdec8( unsigned char x) { unsigned char magnitude = x >> 1; unsigned char sign = (unsigned char)(0u - (x & 1u)); return (char)(magnitude ^ sign); }
|
||||
|
||||
// Zigzag-encode a 16-bit signed value: 0,-1,1,-2,... -> 0,1,2,3,...
// Fix: compute on the unsigned type — left-shifting a negative signed value is
// undefined behavior in C (C11 6.5.7). Result is bit-identical to the old form.
static inline unsigned short zigzagenc16(short x) { unsigned short u = (unsigned short)x; return (unsigned short)((u << 1) ^ (x < 0 ? 0xFFFFu : 0u)); }
|
||||
// Zigzag-decode a 16-bit code back to signed: 0,1,2,3,... -> 0,-1,1,-2,...
// The low bit selects the sign; XOR with an all-ones mask when it is set.
static inline short zigzagdec16(unsigned short x) { unsigned short magnitude = x >> 1; unsigned short sign = (unsigned short)(0u - (x & 1u)); return (short)(magnitude ^ sign); }
|
||||
|
||||
static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; }
|
||||
static inline int zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); }
|
||||
|
||||
static inline uint64_t zigzagenc64(int64_t x) { return x << 1 ^ x >> 63; }
|
||||
static inline int64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); }
|
||||
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16(v,15)); }
|
||||
static ALWAYS_INLINE __m128i mm_zzage_epi32(__m128i v) { return _mm_xor_si128( mm_slli_epi32(v,1), mm_srai_epi32(v,31)); }
|
||||
//static ALWAYS_INLINE __m128i mm_zzage_epi64(__m128i v) { return _mm_xor_si128( mm_slli_epi64(v,1), _mm_srai_epi64(v,63)); }
|
||||
|
||||
static ALWAYS_INLINE __m128i mm_zzagd_epi16(__m128i v) { return _mm_xor_si128( mm_srli_epi16(v,1), mm_srai_epi16( mm_slli_epi16(v,15),15) ); }
|
||||
static ALWAYS_INLINE __m128i mm_zzagd_epi32(__m128i v) { return _mm_xor_si128( mm_srli_epi32(v,1), mm_srai_epi32( mm_slli_epi32(v,31),31) ); }
|
||||
//static ALWAYS_INLINE __m128i mm_zzagd_epi64(__m128i v) { return _mm_xor_si128(mm_srli_epi64(v,1), _mm_srai_epi64( m_slli_epi64(v,63),63) ); }
|
||||
|
||||
#endif
|
||||
#ifdef __AVX2__
|
||||
static ALWAYS_INLINE __m256i mm256_zzage_epi32(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi32(v,1), _mm256_srai_epi32(v,31)); }
|
||||
static ALWAYS_INLINE __m256i mm256_zzagd_epi32(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi32(v,1), _mm256_srai_epi32(_mm256_slli_epi32(v,31),31) ); }
|
||||
#endif
|
||||
|
||||
//-------------- AVX2 delta + prefix sum (scan) / xor encode/decode ---------------------------------------------------------------------------------------
|
||||
#ifdef __AVX2__
|
||||
static ALWAYS_INLINE __m256i mm256_delta_epi32(__m256i v, __m256i sv) { return _mm256_sub_epi32(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
|
||||
static ALWAYS_INLINE __m256i mm256_delta_epi64(__m256i v, __m256i sv) { return _mm256_sub_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); }
|
||||
static ALWAYS_INLINE __m256i mm256_xore_epi32( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); }
|
||||
static ALWAYS_INLINE __m256i mm256_xore_epi64( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); }
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_scan_epi32(__m256i v, __m256i sv) {
|
||||
v = _mm256_add_epi32(v, _mm256_slli_si256(v, 4));
|
||||
v = _mm256_add_epi32(v, _mm256_slli_si256(v, 8));
|
||||
return _mm256_add_epi32( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
|
||||
_mm256_add_epi32(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));
|
||||
}
|
||||
static ALWAYS_INLINE __m256i mm256_xord_epi32(__m256i v, __m256i sv) {
|
||||
v = _mm256_xor_si256(v, _mm256_slli_si256(v, 4));
|
||||
v = _mm256_xor_si256(v, _mm256_slli_si256(v, 8));
|
||||
return _mm256_xor_si256( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11),
|
||||
_mm256_xor_si256(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_scan_epi64(__m256i v, __m256i sv) {
|
||||
v = _mm256_add_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
|
||||
return _mm256_add_epi64(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_add_epi64(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
|
||||
}
|
||||
static ALWAYS_INLINE __m256i mm256_xord_epi64(__m256i v, __m256i sv) {
|
||||
v = _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8));
|
||||
return _mm256_xor_si256(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_xor_si256(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) );
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_scani_epi32(__m256i v, __m256i sv, __m256i vi) { return _mm256_add_epi32(mm256_scan_epi32(v, sv), vi); }
|
||||
#endif
|
||||
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_alignr_epi8(v, sv, 14)); }
|
||||
static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_alignr_epi8(v, sv, 12)); }
|
||||
static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 14)); }
|
||||
static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 12)); }
|
||||
|
||||
#define MM_HDEC_EPI32(_v_,_sv_,_hop_) { _v_ = _hop_(_v_, _mm_slli_si128(_v_, 4)); _v_ = _hop_(mm_shuffle_nnnn_epi32(_sv_, 3), _hop_(_mm_slli_si128(_v_, 8), _v_)); }
|
||||
static ALWAYS_INLINE __m128i mm_scan_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_add_epi32); return v; }
|
||||
static ALWAYS_INLINE __m128i mm_xord_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_xor_si128); return v; }
|
||||
|
||||
#define MM_HDEC_EPI16(_v_,_sv_,_hop_) {\
|
||||
_v_ = _hop_( _v_, _mm_slli_si128(_v_, 2));\
|
||||
_v_ = _hop_( _v_, _mm_slli_si128(_v_, 4));\
|
||||
_v_ = _hop_(_hop_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi16(0x0f0e)));\
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m128i mm_scan_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_add_epi16); return v; }
|
||||
static ALWAYS_INLINE __m128i mm_xord_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_xor_si128); return v; }
|
||||
//-------- scan with vi delta > 0 -----------------------------
|
||||
static ALWAYS_INLINE __m128i mm_scani_epi16(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi16(mm_scan_epi16(v, sv), vi); }
|
||||
static ALWAYS_INLINE __m128i mm_scani_epi32(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi32(mm_scan_epi32(v, sv), vi); }
|
||||
|
||||
#elif defined(__SSE2__)
|
||||
static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
|
||||
static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); }
|
||||
static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
|
||||
static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); }
|
||||
#endif
|
||||
|
||||
#if !defined(_M_X64) && !defined(__x86_64__) && defined(__AVX__)
|
||||
#define _mm256_extract_epi64(v, index) ((__int64)((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2) | (((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2 + 1)) << 32)))
|
||||
#endif
|
||||
|
||||
//------------------ Horizontal OR -----------------------------------------------
|
||||
#ifdef __AVX2__
|
||||
static ALWAYS_INLINE unsigned mm256_hor_epi32(__m256i v) {
|
||||
v = _mm256_or_si256(v, _mm256_srli_si256(v, 8));
|
||||
v = _mm256_or_si256(v, _mm256_srli_si256(v, 4));
|
||||
return _mm256_extract_epi32(v,0) | _mm256_extract_epi32(v, 4);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
|
||||
v = _mm256_or_si256(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(2, 0, 0, 1)));
|
||||
return _mm256_extract_epi64(v, 1) | _mm256_extract_epi64(v,0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define MM_HOZ_EPI16(v,_hop_) {\
|
||||
v = _hop_(v, _mm_srli_si128(v, 8));\
|
||||
v = _hop_(v, _mm_srli_si128(v, 6));\
|
||||
v = _hop_(v, _mm_srli_si128(v, 4));\
|
||||
v = _hop_(v, _mm_srli_si128(v, 2));\
|
||||
}
|
||||
|
||||
#define MM_HOZ_EPI32(v,_hop_) {\
|
||||
v = _hop_(v, _mm_srli_si128(v, 8));\
|
||||
v = _hop_(v, _mm_srli_si128(v, 4));\
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint16_t mm_hor_epi16( __m128i v) { MM_HOZ_EPI16(v,_mm_or_si128); return (unsigned short)_mm_cvtsi128_si32(v); }
|
||||
static ALWAYS_INLINE uint32_t mm_hor_epi32( __m128i v) { MM_HOZ_EPI32(v,_mm_or_si128); return (unsigned )_mm_cvtsi128_si32(v); }
|
||||
static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _mm_srli_si128(v, 8)); return (uint64_t )_mm_cvtsi128_si64(v); }
|
||||
#endif
|
||||
|
||||
//----------------- sub / add ----------------------------------------------------------
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON)
|
||||
#define SUBI16x8(_v_, _sv_) _mm_sub_epi16(_v_, _sv_)
|
||||
#define SUBI32x4(_v_, _sv_) _mm_sub_epi32(_v_, _sv_)
|
||||
#define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
|
||||
#define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
|
||||
|
||||
//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
|
||||
static ALWAYS_INLINE uint8_t mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
|
||||
static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
|
||||
#endif
|
||||
|
||||
//--------- memset -----------------------------------------
|
||||
#define BITFORSET_(_out_, _n_, _start_, _mindelta_) do { unsigned _i;\
|
||||
for(_i = 0; _i != (_n_&~3); _i+=4) { \
|
||||
_out_[_i+0] = _start_+(_i )*_mindelta_; \
|
||||
_out_[_i+1] = _start_+(_i+1)*_mindelta_; \
|
||||
_out_[_i+2] = _start_+(_i+2)*_mindelta_; \
|
||||
_out_[_i+3] = _start_+(_i+3)*_mindelta_; \
|
||||
} \
|
||||
while(_i != _n_) \
|
||||
_out_[_i] = _start_+_i*_mindelta_, ++_i; \
|
||||
} while(0)
|
||||
|
||||
//--------- SIMD zero -----------------------------------------
|
||||
#ifdef __AVX2__
|
||||
#define BITZERO32(_out_, _n_, _start_) do {\
|
||||
__m256i _sv_ = _mm256_set1_epi32(_start_), *_ov = (__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\
|
||||
do _mm256_storeu_si256(_ov++, _sv_); while(_ov < _ove);\
|
||||
} while(0)
|
||||
|
||||
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) do {\
|
||||
__m256i _sv = _mm256_set1_epi32(_start_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_), _cv = _mm256_set_epi32(7+_mindelta_,6+_mindelta_,5+_mindelta_,4+_mindelta_,3*_mindelta_,2*_mindelta_,1*_mindelta_,0); \
|
||||
_sv = _mm256_add_epi32(_sv, _cv);\
|
||||
_cv = _mm256_set1_epi32(4);\
|
||||
do { _mm256_storeu_si256(_ov++, _sv); _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\
|
||||
} while(0)
|
||||
|
||||
#define BITDIZERO32(_out_, _n_, _start_, _mindelta_) do { __m256i _sv = _mm256_set1_epi32(_start_), _cv = _mm256_set_epi32(7+_mindelta_,6+_mindelta_,5+_mindelta_,4+_mindelta_,3+_mindelta_,2+_mindelta_,1+_mindelta_,_mindelta_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\
|
||||
_sv = _mm256_add_epi32(_sv, _cv); _cv = _mm256_set1_epi32(4*_mindelta_); do { _mm256_storeu_si256(_ov++, _sv), _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\
|
||||
} while(0)
|
||||
|
||||
#elif defined(__SSE2__) || defined(__ARM_NEON) // -------------
|
||||
// SIMD set value (memset)
|
||||
#define BITZERO32(_out_, _n_, _v_) do {\
|
||||
__m128i _sv_ = _mm_set1_epi32(_v_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
|
||||
do _mm_storeu_si128(_ov++, _sv_); while(_ov < _ove); \
|
||||
} while(0)
|
||||
|
||||
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) do {\
|
||||
__m128i _sv = _mm_set1_epi32(_start_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_), _cv = _mm_set_epi32(3*_mindelta_,2*_mindelta_,1*_mindelta_,0); \
|
||||
_sv = _mm_add_epi32(_sv, _cv);\
|
||||
_cv = _mm_set1_epi32(4);\
|
||||
do { _mm_storeu_si128(_ov++, _sv); _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
|
||||
} while(0)
|
||||
|
||||
#define BITDIZERO32(_out_, _n_, _start_, _mindelta_) do { __m128i _sv = _mm_set1_epi32(_start_), _cv = _mm_set_epi32(3+_mindelta_,2+_mindelta_,1+_mindelta_,_mindelta_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
|
||||
_sv = _mm_add_epi32(_sv, _cv); _cv = _mm_set1_epi32(4*_mindelta_); do { _mm_storeu_si128(_ov++, _sv), _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\
|
||||
} while(0)
|
||||
#else
|
||||
#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) BITFORSET_(_out_, _n_, _start_, _mindelta_)
|
||||
#define BITZERO32( _out_, _n_, _start_) BITFORSET_(_out_, _n_, _start_, 0)
|
||||
#endif
|
||||
|
||||
#define DELTR( _in_, _n_, _start_, _mindelta_, _out_) { unsigned _v; for( _v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_); }
|
||||
#define DELTRB(_in_, _n_, _start_, _mindelta_, _b_, _out_) { unsigned _v; for(_b_=0,_v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_), _b_ |= _out_[_v]; _b_ = bsr32(_b_); }
|
||||
|
||||
//----------------------------------------- bitreverse scalar + SIMD -------------------------------------------
|
||||
#if __clang__ && defined __has_builtin
|
||||
#if __has_builtin(__builtin_bitreverse64)
|
||||
#define BUILTIN_BITREVERSE
|
||||
#else
|
||||
#define BUILTIN_BITREVERSE
|
||||
#endif
|
||||
#endif
|
||||
#ifdef BUILTIN_BITREVERSE
|
||||
#define rbit8(x) __builtin_bitreverse8( x)
|
||||
#define rbit16(x) __builtin_bitreverse16(x)
|
||||
#define rbit32(x) __builtin_bitreverse32(x)
|
||||
#define rbit64(x) __builtin_bitreverse64(x)
|
||||
#else
|
||||
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
static ALWAYS_INLINE uint32_t _rbit_(uint32_t x) { uint32_t rc; __asm volatile ("rbit %0, %1" : "=r" (rc) : "r" (x) ); }
|
||||
#endif
|
||||
static ALWAYS_INLINE uint8_t rbit8(uint8_t x) {
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
return _rbit_(x) >> 24;
|
||||
#elif 0
|
||||
x = (x & 0xaa) >> 1 | (x & 0x55) << 1;
|
||||
x = (x & 0xcc) >> 2 | (x & 0x33) << 2;
|
||||
return x << 4 | x >> 4;
|
||||
#else
|
||||
return (x * 0x0202020202ull & 0x010884422010ull) % 1023;
|
||||
#endif
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint16_t rbit16(uint16_t x) {
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
return _rbit_(x) >> 16;
|
||||
#else
|
||||
x = (x & 0xaaaa) >> 1 | (x & 0x5555) << 1;
|
||||
x = (x & 0xcccc) >> 2 | (x & 0x3333) << 2;
|
||||
x = (x & 0xf0f0) >> 4 | (x & 0x0f0f) << 4;
|
||||
return x << 8 | x >> 8;
|
||||
#endif
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint32_t rbit32(uint32_t x) {
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
return _rbit_(x);
|
||||
#else
|
||||
x = ((x & 0xaaaaaaaa) >> 1 | (x & 0x55555555) << 1);
|
||||
x = ((x & 0xcccccccc) >> 2 | (x & 0x33333333) << 2);
|
||||
x = ((x & 0xf0f0f0f0) >> 4 | (x & 0x0f0f0f0f) << 4);
|
||||
x = ((x & 0xff00ff00) >> 8 | (x & 0x00ff00ff) << 8);
|
||||
return x << 16 | x >> 16;
|
||||
#endif
|
||||
}
|
||||
static ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
|
||||
#if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u)
|
||||
return (uint64_t)_rbit_(x) << 32 | _rbit_(x >> 32);
|
||||
#else
|
||||
x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1;
|
||||
x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2;
|
||||
x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4;
|
||||
x = (x & 0xff00ff00ff00ff00) >> 8 | (x & 0x00ff00ff00ff00ff) << 8;
|
||||
x = (x & 0xffff0000ffff0000) >> 16 | (x & 0x0000ffff0000ffff) << 16;
|
||||
return x << 32 | x >> 32;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSSE3__) || defined(__ARM_NEON)
|
||||
static ALWAYS_INLINE __m128i mm_rbit_epi16(__m128i v) { return mm_rbit_epi8(mm_rev_epi16(v)); }
|
||||
static ALWAYS_INLINE __m128i mm_rbit_epi32(__m128i v) { return mm_rbit_epi8(mm_rev_epi32(v)); }
|
||||
static ALWAYS_INLINE __m128i mm_rbit_epi64(__m128i v) { return mm_rbit_epi8(mm_rev_epi64(v)); }
|
||||
//static ALWAYS_INLINE __m128i mm_rbit_si128(__m128i v) { return mm_rbit_epi8(mm_rev_si128(v)); }
|
||||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_epi8(__m256i v) {
|
||||
__m256i fv = _mm256_setr_epi8(0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15, 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15), cv0f_8 = _mm256_set1_epi8(0xf);
|
||||
__m256i lv = _mm256_shuffle_epi8(fv,_mm256_and_si256( v, cv0f_8));
|
||||
__m256i hv = _mm256_shuffle_epi8(fv,_mm256_and_si256(_mm256_srli_epi64(v, 4), cv0f_8));
|
||||
return _mm256_or_si256(_mm256_slli_epi64(lv,4), hv);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_rev_epi16(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rev_epi32(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, 11,10, 9, 8,15,14,13,12, 3, 2, 1, 0, 7, 6, 5, 4, 11,10, 9, 8,15,14,13,12)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rev_epi64(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rev_si128(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); }
|
||||
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_epi16(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi16(v)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_epi32(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi32(v)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_epi64(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi64(v)); }
|
||||
static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi8(mm256_rev_si128(v)); }
|
||||
#endif
|
||||
|
||||
// ------------------ bitio genaral macros ---------------------------
|
||||
#ifdef __AVX2__
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#define bzhi_u32(_u_, _b_) _bzhi_u32(_u_, _b_)
|
||||
|
||||
#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
|
||||
#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||
#else
|
||||
#define bzhi_u64(_u_, _b_) _bzhi_u64(_u_, _b_)
|
||||
#endif
|
||||
#else
|
||||
#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||
#define bzhi_u32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
|
||||
#endif
|
||||
|
||||
#define BZHI64(_u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1)))
|
||||
#define BZHI32(_u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1)))
|
||||
|
||||
#define bitdef( _bw_,_br_) uint64_t _bw_=0; unsigned _br_=0
|
||||
#define bitini( _bw_,_br_) _bw_=_br_=0
|
||||
//-- bitput ---------
|
||||
#define bitput( _bw_,_br_,_nb_,_x_) (_bw_) += (uint64_t)(_x_) << (_br_), (_br_) += (_nb_)
|
||||
#define bitenorm( _bw_,_br_,_op_) ctou64(_op_) = _bw_; _op_ += ((_br_)>>3), (_bw_) >>=((_br_)&~7), (_br_) &= 7
|
||||
#define bitflush( _bw_,_br_,_op_) ctou64(_op_) = _bw_, _op_ += ((_br_)+7)>>3, _bw_=_br_=0
|
||||
//-- bitget ---------
|
||||
#define bitbw( _bw_,_br_) ((_bw_)>>(_br_))
|
||||
#define bitrmv( _bw_,_br_,_nb_) (_br_) += _nb_
|
||||
|
||||
#define bitdnorm( _bw_,_br_,_ip_) _bw_ = ctou64((_ip_) += ((_br_)>>3)), (_br_) &= 7
|
||||
#define bitalign( _bw_,_br_,_ip_) ((_ip_) += ((_br_)+7)>>3)
|
||||
|
||||
#define BITPEEK32( _bw_,_br_,_nb_) BZHI32(bitbw(_bw_,_br_), _nb_)
|
||||
#define BITGET32( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK32(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
|
||||
#define BITPEEK64( _bw_,_br_,_nb_) BZHI64(bitbw(_bw_,_br_), _nb_)
|
||||
#define BITGET64( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK64(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
|
||||
|
||||
#define bitpeek57( _bw_,_br_,_nb_) bzhi_u64(bitbw(_bw_,_br_), _nb_)
|
||||
#define bitget57( _bw_,_br_,_nb_,_x_) _x_ = bitpeek57(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
|
||||
#define bitpeek31( _bw_,_br_,_nb_) bzhi_u32(bitbw(_bw_,_br_), _nb_)
|
||||
#define bitget31( _bw_,_br_,_nb_,_x_) _x_ = bitpeek31(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_)
|
||||
//------------------ templates -----------------------------------
|
||||
#define bitput8( _bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
|
||||
#define bitput16(_bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
|
||||
#define bitput32(_bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_)
|
||||
#define bitput64(_bw_,_br_,_b_,_x_,_op_) if((_b_)>45) { bitput(_bw_,_br_,(_b_)-32, (_x_)>>32); bitenorm(_bw_,_br_,_op_); bitput(_bw_,_br_,32,(unsigned)(_x_)); } else bitput(_bw_,_br_,_b_,_x_)
|
||||
|
||||
#define bitget8( _bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_)
|
||||
#define bitget16(_bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_)
|
||||
#define bitget32(_bw_,_br_,_b_,_x_,_ip_) bitget57(_bw_,_br_,_b_,_x_)
|
||||
#define bitget64(_bw_,_br_,_b_,_x_,_ip_) if((_b_)>45) { unsigned _v; bitget57(_bw_,_br_,(_b_)-32,_x_); bitdnorm(_bw_,_br_,_ip_); BITGET64(_bw_,_br_,32,_v); _x_ = _x_<<32|_v; } else bitget57(_bw_,_br_,_b_,_x_)
|
||||
#endif
|
||||
|
||||
//---------- max. bit length + transform for sorted/unsorted arrays, delta,delta 1, delta > 1, zigzag, zigzag of delta, xor, FOR,----------------
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
//------ ORed array, used to determine the maximum bit length of the elements in an unsorted integer array ---------------------
|
||||
uint8_t bit8( uint8_t *in, unsigned n, uint8_t *px);
|
||||
uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px);
|
||||
uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px);
|
||||
uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px);
|
||||
|
||||
//-------------- delta = 0: Sorted integer array w/ mindelta = 0 ----------------------------------------------
|
||||
//-- ORed array, maximum bit length of the non decreasing integer array. out[i] = in[i] - in[i-1]
|
||||
uint8_t bitd8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//-- in-place reverse delta 0
|
||||
void bitddec8( uint8_t *p, unsigned n, uint8_t start); // non decreasing (out[i] = in[i] - in[i-1])
|
||||
void bitddec16( uint16_t *p, unsigned n, uint16_t start);
|
||||
void bitddec32( uint32_t *p, unsigned n, uint32_t start);
|
||||
void bitddec64( uint64_t *p, unsigned n, uint64_t start);
|
||||
|
||||
//-- vectorized fast delta4 one: out[0] = in[4]-in[0], out[1]=in[5]-in[1], out[2]=in[6]-in[2], out[3]=in[7]-in[3],...
|
||||
uint16_t bits128v16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bits128v32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
|
||||
//------------- delta = 1: Sorted integer array w/ mindelta = 1 ---------------------------------------------
|
||||
//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1
|
||||
uint8_t bitd18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//-- in-place reverse delta one
|
||||
void bitd1dec8( uint8_t *p, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
|
||||
void bitd1dec16( uint16_t *p, unsigned n, uint16_t start);
|
||||
void bitd1dec32( uint32_t *p, unsigned n, uint32_t start);
|
||||
void bitd1dec64( uint64_t *p, unsigned n, uint64_t start);
|
||||
|
||||
//------------- delta > 1: Sorted integer array w/ mindelta > 1 ---------------------------------------------
|
||||
//-- ORed array, for max. bit length get min. delta ()
|
||||
uint8_t bitdi8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitdi16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitdi32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitdi64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
//-- transform sorted integer array to delta array: out[i] = in[i] - in[i-1] - mindelta
|
||||
uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
|
||||
uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
|
||||
uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
|
||||
uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
|
||||
//-- in-place reverse delta
|
||||
void bitdidec8( uint8_t *in, unsigned n, uint8_t start, uint8_t mindelta);
|
||||
void bitdidec16(uint16_t *in, unsigned n, uint16_t start, uint16_t mindelta);
|
||||
void bitdidec32(uint32_t *in, unsigned n, uint32_t start, uint32_t mindelta);
|
||||
void bitdidec64(uint64_t *in, unsigned n, uint64_t start, uint64_t mindelta);
|
||||
|
||||
//------------- FOR : array bit length: ---------------------------------------------------------------------
|
||||
//------ ORed array, for max. bit length of the non decreasing integer array. out[i] = in[i] - start
|
||||
uint8_t bitf8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitf16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitf32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitf64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//------ ORed array, for max. bit length of the non strictly decreasing integer array out[i] = in[i] - 1 - start
|
||||
uint8_t bitf18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitf116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitf132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitf164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//------ ORed array, for max. bit length for usorted array
|
||||
uint8_t bitfm8( uint8_t *in, unsigned n, uint8_t *px, uint8_t *pmin); // unsorted
|
||||
uint16_t bitfm16(uint16_t *in, unsigned n, uint16_t *px, uint16_t *pmin);
|
||||
uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t *px, uint32_t *pmin);
|
||||
uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t *px, uint64_t *pmin);
|
||||
|
||||
//------------- Zigzag encoding for unsorted integer lists: out[i] = in[i] - in[i-1] ------------------------
|
||||
//-- ORed array, to get maximum zigzag bit length integer array
|
||||
uint8_t bitz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
//-- Zigzag transform
|
||||
uint8_t bitzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
|
||||
uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
|
||||
uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
|
||||
uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
|
||||
//-- in-place zigzag reverse transform
|
||||
void bitzdec8( uint8_t *in, unsigned n, uint8_t start);
|
||||
void bitzdec16( uint16_t *in, unsigned n, uint16_t start);
|
||||
void bitzdec32( uint32_t *in, unsigned n, uint32_t start);
|
||||
void bitzdec64( uint64_t *in, unsigned n, uint64_t start);
|
||||
|
||||
//------------- Zigzag of zigzag/delta : unsorted/sorted integer array ----------------------------------------------------
|
||||
//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1
|
||||
uint8_t bitzz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitzz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitzz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitzz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
uint8_t bitzzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta);
|
||||
uint16_t bitzzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta);
|
||||
uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta);
|
||||
uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta);
|
||||
|
||||
//-- in-place reverse zigzag of delta (encoded w/ bitdiencNN and parameter mindelta = 1)
|
||||
void bitzzdec8( uint8_t *in, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1)
|
||||
void bitzzdec16( uint16_t *in, unsigned n, uint16_t start);
|
||||
void bitzzdec32( uint32_t *in, unsigned n, uint32_t start);
|
||||
void bitzzdec64( uint64_t *in, unsigned n, uint64_t start);
|
||||
|
||||
//------------- XOR encoding for unsorted integer lists: out[i] = in[i] - in[i-1] -------------
|
||||
//-- ORed array, to get maximum zigzag bit length integer array
|
||||
uint8_t bitx8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start);
|
||||
uint16_t bitx16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start);
|
||||
uint32_t bitx32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start);
|
||||
uint64_t bitx64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start);
|
||||
|
||||
//-- XOR transform
|
||||
uint8_t bitxenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start);
|
||||
uint16_t bitxenc16( uint16_t *in, unsigned n, uint16_t *out, uint16_t start);
|
||||
uint32_t bitxenc32( uint32_t *in, unsigned n, uint32_t *out, uint32_t start);
|
||||
uint64_t bitxenc64( uint64_t *in, unsigned n, uint64_t *out, uint64_t start);
|
||||
|
||||
//-- XOR in-place reverse transform
|
||||
void bitxdec8( uint8_t *p, unsigned n, uint8_t start);
|
||||
void bitxdec16( uint16_t *p, unsigned n, uint16_t start);
|
||||
void bitxdec32( uint32_t *p, unsigned n, uint32_t start);
|
||||
void bitxdec64( uint64_t *p, unsigned n, uint64_t start);
|
||||
|
||||
//------- Lossy floating point transform: pad the trailing mantissa bits with zeros according to the error e (ex. e=0.00001)
|
||||
#ifdef USE_FLOAT16
|
||||
void fppad16(_Float16 *in, size_t n, _Float16 *out, float e);
|
||||
#endif
|
||||
void fppad32(float *in, size_t n, float *out, float e);
|
||||
void fppad64(double *in, size_t n, double *out, double e);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//---- Floating point to Integer decomposition ---------------------------------
|
||||
// seeeeeeee21098765432109876543210 (s:sign, e:exponent, 0-9:mantissa)
|
||||
#ifdef BITUTIL_IN
|
||||
#define MANTF32 23
|
||||
#define MANTF64 52
|
||||
|
||||
#define BITFENC(_u_, _sgn_, _expo_, _mant_, _mantbits_, _one_) _sgn_ = _u_ >> (sizeof(_u_)*8-1); _expo_ = ((_u_ >> (_mantbits_)) & ( (_one_<<(sizeof(_u_)*8 - 1 - _mantbits_)) -1)); _mant_ = _u_ & ((_one_<<_mantbits_)-1);
|
||||
#define BITFDEC( _sgn_, _expo_, _mant_, _u_, _mantbits_) _u_ = (_sgn_) << (sizeof(_u_)*8-1) | (_expo_) << _mantbits_ | (_mant_)
|
||||
#endif
|
||||
282
conf.h
282
conf.h
@ -1,282 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
|
||||
// conf.h - config & common
|
||||
#ifndef CONF_H
|
||||
#define CONF_H
|
||||
//------------------------- Compiler ------------------------------------------
|
||||
#if defined(__GNUC__)
|
||||
#include <stdint.h>
|
||||
#define ALIGNED(t,v,n) t v __attribute__ ((aligned (n)))
|
||||
#define ALWAYS_INLINE inline __attribute__((always_inline))
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#define _PACKED __attribute__ ((packed))
|
||||
#define likely(x) __builtin_expect((x),1)
|
||||
#define unlikely(x) __builtin_expect((x),0)
|
||||
|
||||
#define popcnt32(_x_) __builtin_popcount(_x_)
|
||||
#define popcnt64(_x_) __builtin_popcountll(_x_)
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
//x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
|
||||
// x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
|
||||
static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
|
||||
static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
|
||||
static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; }
|
||||
static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
|
||||
|
||||
static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
|
||||
static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
|
||||
static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
|
||||
static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
|
||||
#else
|
||||
static inline int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); }
|
||||
static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; }
|
||||
static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
|
||||
|
||||
static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); }
|
||||
static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); }
|
||||
static inline unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); }
|
||||
static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); }
|
||||
#endif
|
||||
|
||||
#define ctz64(_x_) __builtin_ctzll(_x_)
|
||||
#define ctz32(_x_) __builtin_ctz(_x_) // 0:32 ctz32(1<<a) = a (a=1..31)
|
||||
#define clz64(_x_) __builtin_clzll(_x_)
|
||||
#define clz32(_x_) __builtin_clz(_x_) // 00000000 00000000 00000000 01000000 = 25
|
||||
|
||||
//#define bswap8(x) (x)
|
||||
#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
|
||||
#define bswap16(x) __builtin_bswap16(x)
|
||||
#else
|
||||
static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); }
|
||||
#endif
|
||||
#define bswap32(x) __builtin_bswap32(x)
|
||||
#define bswap64(x) __builtin_bswap64(x)
|
||||
|
||||
#elif _MSC_VER //----------------------------------------------------
|
||||
#include <windows.h>
|
||||
#include <intrin.h>
|
||||
#if _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#define __builtin_prefetch(x,a)
|
||||
#define inline __inline
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA)
|
||||
#endif
|
||||
|
||||
#define ALIGNED(t,v,n) __declspec(align(n)) t v
|
||||
#define ALWAYS_INLINE __forceinline
|
||||
#define NOINLINE __declspec(noinline)
|
||||
#define THREADLOCAL __declspec(thread)
|
||||
#define likely(x) (x)
|
||||
#define unlikely(x) (x)
|
||||
|
||||
static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; }
|
||||
static inline int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; }
|
||||
static inline int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; }
|
||||
static inline int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; }
|
||||
#if !defined(_M_ARM64) && !defined(_M_X64)
|
||||
static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
|
||||
unsigned long x0 = (unsigned long)x, top, bottom; _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0);
|
||||
*ret = x0 ? bottom : 32 + top; return x != 0;
|
||||
}
|
||||
static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
|
||||
unsigned long x1 = (unsigned long)(x >> 32), top, bottom; _BitScanReverse(&top, x1); _BitScanReverse(&bottom, (unsigned long)x);
|
||||
*ret = x1 ? top + 32 : bottom; return x != 0;
|
||||
}
|
||||
#endif
|
||||
static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; }
|
||||
static inline int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; }
|
||||
static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; }
|
||||
|
||||
#define rol32(x,s) _lrotl(x, s)
|
||||
#define ror32(x,s) _lrotr(x, s)
|
||||
|
||||
#define bswap16(x) _byteswap_ushort(x)
|
||||
#define bswap32(x) _byteswap_ulong(x)
|
||||
#define bswap64(x) _byteswap_uint64(x)
|
||||
|
||||
#define popcnt32(x) __popcnt(x)
|
||||
#ifdef _WIN64
|
||||
#define popcnt64(x) __popcnt64(x)
|
||||
#else
|
||||
#define popcnt64(x) (popcnt32(x) + popcnt32(x>>32))
|
||||
#endif
|
||||
|
||||
#define sleep(x) Sleep(x/1000)
|
||||
#define fseeko _fseeki64
|
||||
#define ftello _ftelli64
|
||||
#define strcasecmp _stricmp
|
||||
#define strncasecmp _strnicmp
|
||||
#define strtoull _strtoui64
|
||||
static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
|
||||
#endif
|
||||
|
||||
#define __bsr8(_x_) __bsr32(_x_)
|
||||
#define __bsr16(_x_) __bsr32(_x_)
|
||||
#define bsr8(_x_) bsr32(_x_)
|
||||
#define bsr16(_x_) bsr32(_x_)
|
||||
#define ctz8(_x_) ctz32(_x_)
|
||||
#define ctz16(_x_) ctz32(_x_)
|
||||
#define clz8(_x_) (clz32(_x_)-24)
|
||||
#define clz16(_x_) (clz32(_x_)-16)
|
||||
|
||||
#define popcnt8(x) popcnt32(x)
|
||||
#define popcnt16(x) popcnt32(x)
|
||||
|
||||
//--------------- Unaligned memory access -------------------------------------
|
||||
#ifdef UA_MEMCPY
|
||||
#include <string.h>
|
||||
static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
static inline double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; }
|
||||
|
||||
static inline void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); }
|
||||
static inline void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); }
|
||||
#elif defined(__i386__) || defined(__x86_64__) || \
|
||||
defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\
|
||||
defined(__powerpc__) || defined(__s390__) ||\
|
||||
defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\
|
||||
defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \
|
||||
defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \
|
||||
defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
|
||||
#define ctou16(_cp_) (*(unsigned short *)(_cp_))
|
||||
#define ctou32(_cp_) (*(unsigned *)(_cp_))
|
||||
#define ctof32(_cp_) (*(float *)(_cp_))
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER)
|
||||
#define ctou64(_cp_) (*(uint64_t *)(_cp_))
|
||||
#define ctof64(_cp_) (*(double *)(_cp_))
|
||||
#elif defined(__ARM_FEATURE_UNALIGNED)
|
||||
struct _PACKED longu { uint64_t l; };
|
||||
struct _PACKED doubleu { double d; };
|
||||
#define ctou64(_cp_) ((struct longu *)(_cp_))->l
|
||||
#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
|
||||
#endif
|
||||
|
||||
#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__)
|
||||
struct _PACKED shortu { unsigned short s; };
|
||||
struct _PACKED unsignedu { unsigned u; };
|
||||
struct _PACKED longu { uint64_t l; };
|
||||
struct _PACKED floatu { float f; };
|
||||
struct _PACKED doubleu { double d; };
|
||||
|
||||
#define ctou16(_cp_) ((struct shortu *)(_cp_))->s
|
||||
#define ctou32(_cp_) ((struct unsignedu *)(_cp_))->u
|
||||
#define ctou64(_cp_) ((struct longu *)(_cp_))->l
|
||||
#define ctof32(_cp_) ((struct floatu *)(_cp_))->f
|
||||
#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d
|
||||
#else
|
||||
#error "unknown cpu"
|
||||
#endif
|
||||
|
||||
#define ctou24(_cp_) (ctou32(_cp_) & 0xffffff)
|
||||
#define ctou48(_cp_) (ctou64(_cp_) & 0xffffffffffffull)
|
||||
#define ctou8(_cp_) (*(_cp_))
|
||||
//--------------------- wordsize ----------------------------------------------
|
||||
#if defined(__64BIT__) || defined(_LP64) || defined(__LP64__) || defined(_WIN64) ||\
|
||||
defined(__x86_64__) || defined(_M_X64) ||\
|
||||
defined(__ia64) || defined(_M_IA64) ||\
|
||||
defined(__aarch64__) ||\
|
||||
defined(__mips64) ||\
|
||||
defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\
|
||||
defined(__s390x__)
|
||||
#define __WORDSIZE 64
|
||||
#else
|
||||
#define __WORDSIZE 32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//---------------------misc ---------------------------------------------------
|
||||
#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // _b_ < 64
|
||||
#define BZHI32F(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) // _b_ < 32
|
||||
#define BZHI64( _u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1))) // Constant
|
||||
#define BZHI32( _u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1)))
|
||||
#define BZHI16( _u_, _b_) BZHI32(_u_, _b_)
|
||||
#define BZHI8( _u_, _b_) BZHI32(_u_, _b_)
|
||||
|
||||
#ifdef __AVX2__
|
||||
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
|
||||
|
||||
#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
|
||||
#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||
#else
|
||||
#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
|
||||
#endif
|
||||
#else
|
||||
#define bzhi_u64(_u_, _b_) BZHI64(_u_, _b_)
|
||||
#define bzhi_u32(_u_, _b_) BZHI32(_u_, _b_)
|
||||
#endif
|
||||
|
||||
#define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1))
|
||||
#define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1)))
|
||||
|
||||
#define TEMPLATE2_(_x_, _y_) _x_##_y_
|
||||
#define TEMPLATE2(_x_, _y_) TEMPLATE2_(_x_,_y_)
|
||||
|
||||
#define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_
|
||||
#define TEMPLATE3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_)
|
||||
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
|
||||
|
||||
#define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_)))
|
||||
|
||||
//--- NDEBUG -------
|
||||
#include <stdio.h>
|
||||
#ifdef _MSC_VER
|
||||
#ifdef NDEBUG
|
||||
#define AS(expr, fmt, ...)
|
||||
#define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
|
||||
#define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
|
||||
#else
|
||||
#define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
|
||||
#define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0)
|
||||
#define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0)
|
||||
#endif
|
||||
#else
|
||||
#ifdef NDEBUG
|
||||
#define AS(expr, fmt,args...)
|
||||
#define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
|
||||
#define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
|
||||
#else
|
||||
#define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
|
||||
#define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0)
|
||||
#define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
|
||||
#endif
|
||||
#endif
|
||||
61
eliasfano.h
61
eliasfano.h
@ -1,61 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// eliasfano.h - "Integer Compression" Elias Fano c/c++ header
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
// compress/decompress integer array with n values to the buffer out. Return value = end of output/input buffer
|
||||
unsigned char *efanoenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efanoenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *efanodec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *efanodec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *efano1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efano1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *efano1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *efano1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *efanoenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efanodec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *efano1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efano1dec128v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *efanoenc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efanodec256v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *efano1enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *efano1dec256v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
125
fp.h
125
fp.h
@ -1,125 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Floating Point + Integer Compression"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
// ---------- TurboPFor Zigzag of delta (=delta of delta + zigzag encoding) (TurboPFor)
|
||||
size_t p4nzzenc128v8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t p4nzzdec128v8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t p4nzzenc128v16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t p4nzzdec128v16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t p4nzzenc128v32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t p4nzzdec128v32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t p4nzzenc128v64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t p4nzzdec128v64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- Zigzag (bit/io) -------------------------------------------------------
|
||||
size_t bvzenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t bvzdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t bvzenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t bvzdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t bvzenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t bvzdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t bvzenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t bvzdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
//----------- Zigzag of delta (bit/io) ---------------------------------------------
|
||||
size_t bvzzenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t bvzzdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t bvzzenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t bvzzdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t bvzzenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t bvzzdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t bvzzenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t bvzzdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboGorilla : Improved gorilla style + RLE (bit/io) ------------------
|
||||
size_t fpgenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fpgdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fpgenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fpgdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fpgenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fpgdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fpgenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fpgdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboFloat XOR : Last value predictor (TurboPFor) ---------------------
|
||||
size_t fpxenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fpxdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fpxenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fpxdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fpxenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fpxdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fpxenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fpxdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboFloat FCM: Finite Context Method Predictor (TurboPFor) -----------
|
||||
size_t fpfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fpfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fpfcmenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fpfcmdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fpfcmenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fpfcmdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fpfcmenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fpfcmdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboFloat DFCM: Differential Finite Context Method Predictor (TurboPFor)
|
||||
size_t fpdfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fpdfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fpdfcmenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fpdfcmdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fpdfcmenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fpdfcmdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fpdfcmenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fpdfcmdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
//----------- TurboFloat 2D DFCM: Differential Finite Context Method Predictor -----
|
||||
size_t fp2dfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fp2dfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fp2dfcmenc16(uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fp2dfcmdec16(unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fp2dfcmenc32(uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fp2dfcmdec32(unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fp2dfcmenc64(uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fp2dfcmdec64(unsigned char *in, size_t n, uint64_t *out, uint64_t start);
|
||||
|
||||
/*/-------------- delta (=zigzag). Same as p4zenc ------------------------------------
|
||||
size_t fppenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start);
|
||||
size_t fppdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start);
|
||||
size_t fppenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start);
|
||||
size_t fppdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start);
|
||||
size_t fppenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start);
|
||||
size_t fppdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start);
|
||||
size_t fppenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start);
|
||||
size_t fppdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
355
sse_neon.h
355
sse_neon.h
@ -1,355 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2021
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// Intel SSE to ARM NEON optimized for maximum speed (and compatibility gcc/clang) with possible minor changes to the source code
|
||||
|
||||
#ifndef _SSE_NEON_H_
|
||||
#define _SSE_NEON_H_
|
||||
#include "conf.h"
|
||||
|
||||
#ifdef __ARM_NEON //------------------------------------------------------------------------------------------------------------------
|
||||
#include <arm_neon.h>
|
||||
#define __m128i uint32x4_t // int32x4_t can also be used
|
||||
#define __m128 float32x4_t
|
||||
|
||||
//#define USE_MACROS
|
||||
#define uint8x16_to_8x8x2(_u_) ((uint8x8x2_t) { vget_low_u8(_u_), vget_high_u8(_u_) })
|
||||
|
||||
#ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ----------------------------------------------------------
|
||||
#define _mm_set_epi8(u15,u14,u13,u12,\
|
||||
u11,u10, u9, u8,\
|
||||
u7,u6,u5,u4,\
|
||||
u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (uint32x4_t)vld1q_u8( _u);})
|
||||
#define _mm_set_epi16( u7,u6,u5,u4,\
|
||||
u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (uint32x4_t)vld1q_u16(_u);})
|
||||
//#define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; vld1q_u32(_u);})
|
||||
//#define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (uint32x4_t)vld1q_u64(_u);})
|
||||
#define _mm_set_epi32(u3, u2, u1, u0) vcombine_u32(vcreate_u32((uint64_t)u1 << 32 | u0), vcreate_u32((uint64_t)u3 << 32 | u2))
|
||||
#define _mm_set_epi64x(u1, u0) (__m128i)vcombine_u64(vcreate_u64(u0), vcreate_u64(u1))
|
||||
|
||||
#else
|
||||
static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8,
|
||||
uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4,
|
||||
uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) {
|
||||
uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (uint32x4_t)vld1q_u8( u); }
|
||||
static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4,
|
||||
uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (uint32x4_t)vld1q_u16(u); }
|
||||
static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return vld1q_u32(u); }
|
||||
static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (uint32x4_t)vld1q_u64(u); }
|
||||
#endif
|
||||
|
||||
#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16( u0,u1,u2,u3,u4,u5,u6,u7)
|
||||
#define _mm_setr_epi32(u3,u2,u1,u0) _mm_set_epi32( u0,u1,u2,u3)
|
||||
#define _mm_setr_epi64x(u1,u0) _mm_set_epi64x(u0,u0)
|
||||
|
||||
#define _mm_set1_epi8( _u8_ ) (__m128i)vdupq_n_u8( _u8_ )
|
||||
#define _mm_set1_epi16( _u16_) (__m128i)vdupq_n_u16(_u16_)
|
||||
#define _mm_set1_epi32( _u32_) vdupq_n_u32(_u32_)
|
||||
#define _mm_set1_epi64x(_u64_) (__m128i)vdupq_n_u64(_u64_)
|
||||
#define _mm_setzero_si128() vdupq_n_u32( 0 )
|
||||
|
||||
#define _mm_cvtss_f32(_u_) vgetq_lane_f32((float32x4_t)(_u_), 0)
|
||||
#define _mm_setzero_ps() (__m128)vdupq_n_f32(0)
|
||||
#define _mm_set1_ps(_f32_) (__m128)vdupq_n_f32(_f32_)
|
||||
//---------------------------------------------- Arithmetic -----------------------------------------------------------------------
|
||||
#define _mm_add_epi8( _u_,_v_) (__m128i)vaddq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
|
||||
#define _mm_add_epi16( _u_,_v_) (__m128i)vaddq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
|
||||
#define _mm_add_epi32( _u_,_v_) vaddq_u32( _u_, _v_ )
|
||||
#define _mm_sub_epi8( _u_,_v_) (__m128i)vsubq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
|
||||
#define _mm_sub_epi16( _u_,_v_) (__m128i)vsubq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
|
||||
#define _mm_sub_epi32( _u_,_v_) (__m128i)vsubq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_))
|
||||
#define _mm_subs_epu8( _u_,_v_) (__m128i)vqsubq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
|
||||
|
||||
#define _mm_mullo_epi16(_u_,_v_) (__m128i)vmulq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
|
||||
#define _mm_mullo_epi32(_u_,_v_) (__m128i)vmulq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
|
||||
#define mm_mullo_epu32(_u_,_v_) vmulq_u32(_u_,_v_)
|
||||
|
||||
#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)vqdmulhq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) //only for small values??
|
||||
// High 16 bits of each signed 16x16->32 lane product (SSE2 _mm_mulhi_epi16 semantics).
static ALWAYS_INLINE __m128i _mm_mulhi_epi16(__m128i u, __m128i v) {
  int32x4_t prod_lo = vmull_s16(vget_low_s16( (int16x8_t)(u)), vget_low_s16( (int16x8_t)(v)));
  int32x4_t prod_hi = vmull_s16(vget_high_s16((int16x8_t)(u)), vget_high_s16((int16x8_t)(v)));
  // De-interleave the 32-bit products as 16-bit halves: val[1] holds the high halves.
  uint16x8x2_t parts = vuzpq_u16((uint16x8_t)(prod_lo), (uint16x8_t)(prod_hi));
  return (__m128i)(vreinterpretq_s32_u16(parts.val[1]));
}
|
||||
#define _mm_mul_epu32( _u_,_v_) (__m128i)vmull_u32(vget_low_u32(_u_),vget_low_u32(_v_))
|
||||
#define _mm_adds_epu16( _u_,_v_) (__m128i)vqaddq_u16((uint16x8_t)(_u_),(uint16x8_t)(_v_))
|
||||
// Multiply signed 16-bit lanes and horizontally add adjacent 32-bit products
// (SSE2 _mm_madd_epi16 semantics).
static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) {
  int32x4_t prod_lo = vmull_s16(vget_low_s16( (int16x8_t)u), vget_low_s16( (int16x8_t)v));
  int32x4_t prod_hi = vmull_s16(vget_high_s16((int16x8_t)u), vget_high_s16((int16x8_t)v));
  int32x2_t sum_lo  = vpadd_s32(vget_low_s32(prod_lo), vget_high_s32(prod_lo)); // pairwise add
  int32x2_t sum_hi  = vpadd_s32(vget_low_s32(prod_hi), vget_high_s32(prod_hi));
  return (__m128i)vcombine_s32(sum_lo, sum_hi);
}
|
||||
//---------------------------------------------- Special math functions -----------------------------------------------------------
|
||||
#define _mm_min_epu8( _u_,_v_) (__m128i)vminq_u8( (uint8x16_t)(_u_), (uint8x16_t)(_v_))
|
||||
#define _mm_min_epu16( _u_,_v_) (__m128i)vminq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
|
||||
#define _mm_min_epi16( _u_,_v_) (__m128i)vminq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
|
||||
//---------------------------------------------- Logical --------------------------------------------------------------------------
|
||||
#define mm_testnz_epu32(_u_) vmaxvq_u32(_u_) //vaddvq_u32(_u_)
|
||||
#define mm_testnz_epu8( _u_) vmaxv_u8(_u_)
|
||||
#define _mm_or_si128( _u_,_v_) (__m128i)vorrq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
|
||||
#define _mm_and_si128( _u_,_v_) (__m128i)vandq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
|
||||
#define _mm_xor_si128( _u_,_v_) (__m128i)veorq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
|
||||
//---------------------------------------------- Shift ----------------------------------------------------------------------------
|
||||
#define mm_slli_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)> 7?vdupq_n_u8( 0):vshlq_n_u8( (uint8x16_t)(_u_), (_c_)))) // parameter c MUST be a constant / vshlq_n_u8: __constrange(0-(N-1))
|
||||
#define mm_slli_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u16(0):vshlq_n_u16((uint16x8_t)(_u_), (_c_))))
|
||||
#define mm_slli_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>31?vdupq_n_u32(0):vshlq_n_u32((uint32x4_t)(_u_), (_c_))))
|
||||
#define mm_slli_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>63?vdupq_n_u64(0):vshlq_n_u64((uint64x2_t)(_u_), (_c_))))
|
||||
#define _mm_slli_si128( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u8( 0):vextq_u8(vdupq_n_u8(0), (uint8x16_t)(_u_), 16-(_c_) )) ) // vextq_u8: __constrange(0-15)
|
||||
|
||||
// Logical shift right by a compile-time constant (vshrq_n_*: __constrange(1-N); c MUST be a constant).
// c<1 returns the input unchanged; c beyond the lane width yields zero.
#define mm_srli_epi8(  _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)> 7?vdupq_n_u8( 0):vshrq_n_u8( (uint8x16_t)(_u_), (_c_))))
#define mm_srli_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u16(0):vshrq_n_u16((uint16x8_t)(_u_), (_c_))))
#define mm_srli_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>31?vdupq_n_u32(0):vshrq_n_u32((uint32x4_t)(_u_), (_c_))))
#define mm_srli_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>63?vdupq_n_u64(0):vshrq_n_u64((uint64x2_t)(_u_), (_c_)))) // fix: was vshlq_n_u64 (a LEFT shift) — srli must shift right
#define _mm_srli_si128(_u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u8(0):vextq_u8((uint8x16_t)(_u_), vdupq_n_u8(0), (_c_) )) ) // whole-register byte shift; vextq_u8: __constrange(0-15)
|
||||
|
||||
#define mm_srai_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s8( (int8x16_t)(_u_), (_c_))) // c <= 8 (vshrq_n:1-N)
|
||||
#define mm_srai_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s16((int16x8_t)(_u_), (_c_))) // c <= 16
|
||||
#define mm_srai_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s32((int32x4_t)(_u_), (_c_))) // c <= 32
|
||||
#define mm_srai_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s64((int64x2_t)(_u_), (_c_))) // c <= 64
|
||||
|
||||
#define _mm_slli_epi8( _u_,_m_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vdupq_n_s8( (_m_))) // parameter c integer constant/variable
|
||||
#define _mm_slli_epi16( _u_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vdupq_n_s16( (_m_)))
|
||||
#define _mm_slli_epi32( _u_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vdupq_n_s32( (_m_)))
|
||||
#define _mm_slli_epi64( _u_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vdupq_n_s64( (_m_)))
|
||||
|
||||
#define _mm_srli_epi8( _u_,_m_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vdupq_n_s8( -(_m_)))
|
||||
#define _mm_srli_epi16( _u_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vdupq_n_s16(-(_m_)))
|
||||
#define _mm_srli_epi32( _u_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vdupq_n_s32(-(_m_)))
|
||||
#define _mm_srli_epi64( _u_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vdupq_n_s64(-(_m_)))
|
||||
|
||||
#define _mm_srai_epi8( _u_,_m_) (__m128i)vshlq_s8( (int8x16_t)(_u_), vdupq_n_s8( -(_m_)))
|
||||
#define _mm_srai_epi16( _u_,_m_) (__m128i)vshlq_s16((int16x8_t)(_u_), vdupq_n_s16(-(_m_)))
|
||||
#define _mm_srai_epi32( _u_,_m_) (__m128i)vshlq_s32((int32x4_t)(_u_), vdupq_n_s32(-(_m_)))
|
||||
#define _mm_srai_epi64( _u_,_m_) (__m128i)vshlq_s64((int64x2_t)(_u_), vdupq_n_s64(-(_m_)))
|
||||
|
||||
#define _mm_sll_epi8( _u_,_v_) (__m128i)vshlq_s8( (int8x16_t)(_u_), (int8x16_t)(_v_)) //_v_:all lanes equal
|
||||
#define _mm_sll_epi16( _u_,_v_) (__m128i)vshlq_s16( (int16x8_t)(_u_), (int16x8_t)(_v_))
|
||||
#define _mm_sll_epi32( _u_,_v_) (__m128i)vshlq_s32( (int32x4_t)(_u_), (int32x4_t)(_v_))
|
||||
#define _mm_sll_epi64( _u_,_v_) (__m128i)vshlq_s64( (int64x2_t)(_u_), (int64x2_t)(_v_))
|
||||
|
||||
#define _mm_srl_epi8( _u_,_v_) (__m128i)vshrq_s8( (int8x16_t)(_u_), (int8x16_t)(_v_))
|
||||
#define _mm_srl_epi16( _u_,_v_) (__m128i)vshrq_s16( (int16x8_t)(_u_), (int16x8_t)(_v_))
|
||||
#define _mm_srl_epi32( _u_,_v_) (__m128i)vshrq_s32( (int32x4_t)(_u_), (int32x4_t)(_v_))
|
||||
#define _mm_srl_epi64( _u_,_v_) (__m128i)vshrq_s64( (int64x2_t)(_u_), (int64x2_t)(_v_))
|
||||
|
||||
#define _mm_sllv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_)) //variable shift
|
||||
#define _mm_srlv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vnegq_s32((int32x4_t)(_v_)))
|
||||
//---------------------------------------------- Compare --------- true/false->1/0 (all bits set) ---------------------------------
|
||||
#define _mm_cmpeq_epi8( _u_,_v_) (__m128i)vceqq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
|
||||
#define _mm_cmpeq_epi16( _u_,_v_) (__m128i)vceqq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
|
||||
#define _mm_cmpeq_epi32( _u_,_v_) (__m128i)vceqq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
|
||||
|
||||
#define _mm_cmpgt_epi8( _u_,_v_) (__m128i)vcgtq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
|
||||
#define _mm_cmpgt_epi16( _u_,_v_) (__m128i)vcgtq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
|
||||
#define _mm_cmpgt_epi32( _u_,_v_) (__m128i)vcgtq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
|
||||
|
||||
#define _mm_cmpgt_epu16( _u_,_v_) (__m128i)vcgtq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
|
||||
#define mm_cmpgt_epu32( _u_,_v_) (__m128i)vcgtq_u32( _u_, _v_)
|
||||
//---------------------------------------------- Load -----------------------------------------------------------------------------
|
||||
#define _mm_loadl_epi64( _u64p_) (__m128i)vcombine_s32(vld1_s32((int32_t const *)(_u64p_)), vcreate_s32(0))
|
||||
#define mm_loadu_epi64p(_u64p_,_u_) (__m128i)vld1q_lane_u64((uint64_t *)(_u64p_), (uint64x2_t)(_u_), 0)
|
||||
#define _mm_loadu_si128( _ip_) vld1q_u32(_ip_)
|
||||
#define _mm_load_si128( _ip_) vld1q_u32(_ip_)
|
||||
|
||||
// Float loads. NEON vld1q has no alignment requirement, so aligned/unaligned variants are identical.
#define _mm_load_ps(   _ip_)   (__m128)vld1q_f32((float32_t *)(_ip_))
#define _mm_loadu_ps(  _ip_)   (__m128)vld1q_f32((float32_t *)(_ip_))
#define _mm_load1_ps(  _ip_)   (__m128)vld1q_dup_f32((float32_t *)(_ip_)) // fix: body referenced undefined _p_ instead of the parameter _ip_
#define _mm_loadl_pi(_u_,_ip_) (__m128)vcombine_f32((float32x2_t)vld1_f32((float32_t *)(_ip_)), (float32x2_t)vget_high_f32(_u_)) // fix: body referenced undefined _ip instead of _ip_
#define _mm_loadh_pi(_u_,_ip_) (__m128)vcombine_f32((float32x2_t)vget_low_f32(_u_), (float32x2_t)vld1_f32((const float *)(_ip_)))
|
||||
//---------------------------------------------- Store ----------------------------------------------------------------------------
|
||||
#define _mm_storel_epi64(_ip_,_u_) vst1q_lane_u64((uint64_t *)(_ip_), (uint64x2_t)(_u_), 0)
|
||||
#define _mm_storeu_si128(_ip_,_u_) vst1q_u32((__m128i *)(_ip_), _u_)
|
||||
|
||||
#define _mm_store_ps( _ip_,_u_) vst1q_f32( (float32_t *)(_ip_), (float32x4_t)(_u_))
|
||||
#define _mm_storeu_ps( _ip_,_u_) vst1q_f32( (float32_t *)(_ip_), (float32x4_t)(_u_))
|
||||
#define _mm_store_ss( _ip_,_u_) vst1q_lane_f32((float32_t *)(_ip_), (float32x4_t)(_u_), 0)
|
||||
//---------------------------------------------- Convert --------------------------------------------------------------------------
|
||||
#define mm_cvtsi64_si128p(_u64p_,_u_) mm_loadu_epi64p(_u64p_,_u_)
|
||||
#define _mm_cvtsi64_si128(_u_) (__m128i)vdupq_n_u64(_u_) //vld1q_s64(_u_)
|
||||
//---------------------------------------------- Reverse bits/bytes ---------------------------------------------------------------
|
||||
#define mm_rbit_epi8(_v_) (__m128i)vrbitq_u8( (uint8x16_t)(_v_)) // reverse bits
|
||||
#define mm_rev_epi16(_v_) vrev16q_u8((uint8x16_t)(_v_)) // reverse bytes
|
||||
#define mm_rev_epi32(_v_) vrev32q_u8((uint8x16_t)(_v_))
|
||||
#define mm_rev_epi64(_v_) vrev64q_u8((uint8x16_t)(_v_))
|
||||
//--------------------------------------------- Insert/extract --------------------------------------------------------------------
|
||||
#define mm_extract_epi32x(_u_,_u32_,_id_) vst1q_lane_u32((uint32_t *)&(_u32_), _u_, _id_)
|
||||
#define _mm_extract_epi64x(_u_,_u64_,_id_) vst1q_lane_u64((uint64_t *)&(_u64_), (uint64x2_t)(_u_), _id_)
|
||||
|
||||
#define _mm_extract_epi8( _u_, _id_) vgetq_lane_u8( (uint8x16_t)(_u_), _id_)
|
||||
#define _mm_extract_epi16(_u_, _id_) vgetq_lane_u16(_u_, _id_)
|
||||
#define _mm_extract_epi32(_u_, _id_) vgetq_lane_u32(_u_, _id_)
|
||||
#define mm_extract_epu32(_u_, _id_) vgetq_lane_u32(_u_, _id_)
|
||||
#define _mm_cvtsi128_si32(_u_) vgetq_lane_u32((uint32x4_t)(_u_),0)
|
||||
#define _mm_cvtsi128_si64(_u_) vgetq_lane_u64((uint64x2_t)(_u_),0)
|
||||
|
||||
#define _mm_insert_epu32p(_u_,_u32p_,_id_) vsetq_lane_u32(_u32p_, _u_, _id_)
|
||||
#define mm_insert_epi32p(_u_,_u32p_,_id_) vld1q_lane_u32(_u32p_, (uint32x4_t)(_u_), _id_)
|
||||
#define _mm_cvtsi32_si128(_x_) (__m128i)vsetq_lane_s32(_x_, vdupq_n_s32(0), 0)
|
||||
|
||||
#define _mm_blendv_epi8(_u_,_v_,_m_) vbslq_u32(_m_,_v_,_u_)
|
||||
//---------------------------------------------- Miscellaneous --------------------------------------------------------------------
|
||||
#define _mm_alignr_epi8(_u_,_v_,_m_) (__m128i)vextq_u8( (uint8x16_t)(_v_), (uint8x16_t)(_u_), _m_)
|
||||
#define _mm_packs_epi16( _u_,_v_) (__m128i)vcombine_s8( vqmovn_s16((int16x8_t)(_u_)), vqmovn_s16((int16x8_t)(_v_)))
|
||||
#define _mm_packs_epi32( _u_,_v_) (__m128i)vcombine_s16(vqmovn_s32((int32x4_t)(_u_)), vqmovn_s32((int32x4_t)(_v_)))
|
||||
|
||||
// Pack two 16-bit-lane vectors into one 8-bit-lane vector.
// fix: _mm_packs_epu16 passed uint16x8_t operands straight to vcombine_u8 (type mismatch);
// each half must first be narrowed with vqmovn_u16 (unsigned saturating narrow).
#define _mm_packs_epu16( _u_,_v_)  (__m128i)vcombine_u8(vqmovn_u16((uint16x8_t)(_u_)),  vqmovn_u16((uint16x8_t)(_v_)))
#define _mm_packus_epi16(_u_,_v_)  (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)(_u_)), vqmovun_s16((int16x8_t)(_v_))) // signed->unsigned saturating pack (SSE2 semantics)
|
||||
|
||||
// SSE2 _mm_movemask_epi8: gather the sign bit of each of the 16 bytes into a 16-bit mask.
static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
  // Per-byte bit weight, repeated for each 8-byte half.
  const uint8x16_t __attribute__ ((aligned (16))) bitsel = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7};
  uint8x16_t signs  = vandq_u8(vcltq_s8((int8x16_t)v, vdupq_n_s8(0)), bitsel);   // negative byte -> its bit weight, else 0
  uint8x16_t folded = (uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(signs)));   // horizontal add within each half
  return vgetq_lane_u8(folded, 8) << 8 | vgetq_lane_u8(folded, 0);               // combine the two half-masks
}
|
||||
//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff)
|
||||
#ifdef __aarch64__
|
||||
static ALWAYS_INLINE uint8_t mm_movemask_epi8s(uint8x8_t sv) { const uint8x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddv_u8( vand_u8( sv, m)); } // short only ARM
|
||||
//static ALWAYS_INLINE uint16_t mm_movemask_epu16(uint32x4_t v) { const uint16x8_t m = { 1, 1<<2, 1<<4, 1<<6, 1<<8, 1<<10, 1<<12, 1<<14}; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
|
||||
static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { const uint16x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
|
||||
static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { const uint32x4_t m = { 1, 1<<1, 1<<2, 1<<3 }; return vaddvq_u32(vandq_u32((uint32x4_t)v, m)); }
|
||||
static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { const uint64x2_t m = { 1, 1<<1 }; return vaddvq_u64(vandq_u64((uint64x2_t)v, m)); }
|
||||
#else
|
||||
static ALWAYS_INLINE uint32_t mm_movemask_epu32(uint32x4_t v) { const uint32x4_t mask = {1,2,4,8}, av = vandq_u32(v, mask), xv = vextq_u32(av, av, 2), ov = vorrq_u32(av, xv); return vgetq_lane_u32(vorrq_u32(ov, vextq_u32(ov, ov, 3)), 0); }
|
||||
#endif
|
||||
// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
|
||||
#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_))
|
||||
|
||||
#define _mm_shuffle_epi8(_u_, _v_) (__m128i)vqtbl1q_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
|
||||
// Broadcast 32-bit lane _m_ to all four lanes.
#if defined(__aarch64__)
#define mm_shuffle_nnnn_epi32(_u_,_m_) (__m128i)vdupq_laneq_u32(_u_, _m_)
#else
#define mm_shuffle_nnnn_epi32(_u_,_m_) (__m128i)vdupq_n_u32(vgetq_lane_u32(_u_, _m_)) // fix: closing parenthesis was missing, breaking every expansion
#endif
|
||||
|
||||
#ifdef USE_MACROS
|
||||
#define mm_shuffle_2031_epi32(_u_) ({ uint32x4_t _zv = (uint32x4_t)vrev64q_u32(_u_); uint32x2x2_t _zv = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zv.val[0], _zv.val[1]);})
|
||||
#define mm_shuffle_3120_epi32(_u_) ({ uint32x4_t _zv = _u_; _zv = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zv.val[0], _zv.val[1]);})
|
||||
#else
|
||||
static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) { uint32x4_t a = (uint32x4_t)vrev64q_u32(v); uint32x2x2_t z = vtrn_u32(vget_low_u32(a), vget_high_u32(a)); return vcombine_u32(z.val[0], z.val[1]);}
|
||||
static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) { uint32x2x2_t z = vtrn_u32(vget_low_u32(v), vget_high_u32(v)); return vcombine_u32(z.val[0], z.val[1]);}
|
||||
#endif
|
||||
|
||||
#if defined(USE_MACROS) || defined(__clang__)
|
||||
#define _mm_shuffle_epi32(_u_, _m_) ({ const uint32x4_t _av =_u_;\
|
||||
uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));\
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);\
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);\
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); _v;\
|
||||
})
|
||||
#define _mm_shuffle_epi32s(_u_, _m_) _mm_set_epi32(vgetq_lane_u32(_u_, ((_m_) ) & 0x3),\
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 2) & 0x3),\
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 4) & 0x3),\
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 6) & 0x3))
|
||||
#else
|
||||
static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _u_, const unsigned _m_) { const uint32x4_t _av =_u_;
|
||||
uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);
|
||||
_v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3);
|
||||
return _v;
|
||||
}
|
||||
static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _u_, const unsigned _m_) {
|
||||
return _mm_set_epi32(vgetq_lane_u32(_u_, ((_m_) ) & 0x3),
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 2) & 0x3),
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 4) & 0x3),
|
||||
vgetq_lane_u32(_u_, ((_m_) >> 6) & 0x3));
|
||||
}
|
||||
#endif
|
||||
#ifdef USE_MACROS
|
||||
#define _mm_unpacklo_epi8( _u_,_v_) ({ uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpacklo_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpacklo_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpacklo_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_)))
|
||||
|
||||
#define _mm_unpackhi_epi8( _u_,_v_) ({ uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpackhi_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpackhi_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
|
||||
#define _mm_unpackhi_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_)))
|
||||
#else
|
||||
static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}
|
||||
static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}
|
||||
static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]);}
|
||||
static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_))); }
|
||||
|
||||
static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]); }
|
||||
static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]); }
|
||||
static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]); }
|
||||
static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))); }
|
||||
#endif
|
||||
|
||||
#else //----------------- intel SSE2/SSSE3 ( wraper functions compatible with intel/arm; permits to have one source code version for arm+intel) --------------
|
||||
#define mm_movemask_epu32(_u_) _mm_movemask_ps(_mm_castsi128_ps(_u_))
|
||||
#define mm_movemask_epu16(_u_) _mm_movemask_epi8(_u_)
|
||||
#define mm_loadu_epi64p( _u64p_,_u_) _u_ = _mm_cvtsi64_si128(ctou64(_u64p_))
|
||||
|
||||
#define mm_extract_epu32( _u_, _id_) _mm_extract_epi32(_u_, _id_)
|
||||
#define mm_extract_epi32x(_u_,_u32_, _id_) _u32_ = _mm_extract_epi32(_u_, _id_)
|
||||
#define mm_extract_epi64x(_u_,_u64_, _id_) _u64_ = _mm_extract_epi64(_u_, _id_)
|
||||
#define mm_insert_epi32p( _u_,_u32p_,_c_) _mm_insert_epi32( _u_,ctou32(_u32p_),_c_)
|
||||
|
||||
#define mm_mullo_epu32( _u_,_v_) _mm_mullo_epi32(_u_,_v_)
|
||||
#define mm_cvtsi64_si128p(_u64p_,_u_) _u_ = _mm_cvtsi64_si128(ctou64(_u64p_))
|
||||
|
||||
#define mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000)) //__m128i cv80000000 = _mm_set1_epi32(0x80000000); must be declared
|
||||
#define mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000))
|
||||
#define _mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000)))
|
||||
#define _mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000)))
|
||||
|
||||
#define mm_shuffle_nnnn_epi32(_u_, _n_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(_n_,_n_,_n_,_n_))
|
||||
#define mm_shuffle_2031_epi32(_u_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(2,0,3,1))
|
||||
#define mm_shuffle_3120_epi32(_u_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(3,1,2,0))
|
||||
|
||||
#define _mm_slli_epi8(_u_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff << _m_), _mm_slli_epi32(_u_, _m_ ))
|
||||
#define _mm_srli_epi8(_u_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff >> _m_), _mm_srli_epi32(_u_, _m_ ))
|
||||
|
||||
#define mm_slli_epi8( _u_,_c_) _mm_slli_epi8( _u_,_c_) // parameter c MUST be a constant for compatibilty with the arm functions above
|
||||
#define mm_slli_epi16( _u_,_c_) _mm_slli_epi16(_u_,_c_)
|
||||
#define mm_slli_epi32( _u_,_c_) _mm_slli_epi32(_u_,_c_)
|
||||
#define mm_slli_epi64( _u_,_c_) _mm_slli_epi64(_u_,_c_)
|
||||
|
||||
#define mm_srli_epi8( _u_,_c_) _mm_srli_epi8( _u_,_c_)
|
||||
#define mm_srli_epi16( _u_,_c_) _mm_srli_epi16(_u_,_c_)
|
||||
#define mm_srli_epi32( _u_,_c_) _mm_srli_epi32(_u_,_c_)
|
||||
#define mm_srli_epi64( _u_,_c_) _mm_srli_epi64(_u_,_c_)
|
||||
|
||||
#define mm_srai_epi8( _u_,_c_) _mm_srai_epi8( _u_,_c_)
|
||||
#define mm_srai_epi16( _u_,_c_) _mm_srai_epi16(_u_,_c_)
|
||||
#define mm_srai_epi32( _u_,_c_) _mm_srai_epi32(_u_,_c_)
|
||||
#define mm_srai_epi64( _u_,_c_) _mm_srai_epi64(_u_,_c_)
|
||||
|
||||
#ifdef __SSSE3__
|
||||
static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) { // reverse bits in bytes
|
||||
__m128i fv = _mm_set_epi8(15, 7,11, 3,13, 5, 9, 1,14, 6,10, 2,12, 4, 8, 0), cv0f_8 = _mm_set1_epi8(0xf);
|
||||
__m128i lv = _mm_shuffle_epi8(fv,_mm_and_si128( v, cv0f_8));
|
||||
__m128i hv = _mm_shuffle_epi8(fv,_mm_and_si128( mm_srli_epi64(v, 4), cv0f_8));
|
||||
return _mm_or_si128( mm_slli_epi64(lv,4), hv);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE __m128i mm_rev_epi16(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // reverse vector bytes in uint??_t
|
||||
static ALWAYS_INLINE __m128i mm_rev_epi32(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3)); }
|
||||
static ALWAYS_INLINE __m128i mm_rev_epi64(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7)); }
|
||||
static ALWAYS_INLINE __m128i mm_rev_si128(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15)); }
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//==================================================================================================
// time_.h : parameter free high precision time/benchmark functions
//==================================================================================================
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// time_.h : parameter free high precision time/benchmark functions
|
||||
#include <time.h>
|
||||
#include <float.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#ifndef sleep
|
||||
#define sleep(n) Sleep((n) * 1000)
|
||||
#endif
|
||||
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef unsigned __int64 tm_t;
|
||||
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#include <unistd.h>
|
||||
#define Sleep(ms) usleep((ms) * 1000)
|
||||
|
||||
typedef struct timespec tm_t;
|
||||
#endif
|
||||
|
||||
#if defined (__i386__) || defined( __x86_64__ )
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h> // __rdtsc
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __corei7__
|
||||
#define RDTSC_INI(_c_) do { unsigned _cl, _ch; \
|
||||
__asm volatile ("cpuid\n\t" \
|
||||
"rdtsc\n\t" \
|
||||
"mov %%edx, %0\n" \
|
||||
"mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \
|
||||
"%rax", "%rbx", "%rcx", "%rdx"); \
|
||||
_c_ = (uint64_t)_ch << 32 | _cl; \
|
||||
} while(0)
|
||||
|
||||
#define RDTSC(_c_) do { unsigned _cl, _ch; \
|
||||
__asm volatile("rdtscp\n" \
|
||||
"mov %%edx, %0\n" \
|
||||
"mov %%eax, %1\n" \
|
||||
"cpuid\n\t": "=r" (_ch), "=r" (_cl):: "%rax",\
|
||||
"%rbx", "%rcx", "%rdx");\
|
||||
_c_ = (uint64_t)_ch << 32 | _cl;\
|
||||
} while(0)
|
||||
#else
|
||||
#define RDTSC(_c_) do { unsigned _cl, _ch;\
|
||||
__asm volatile ("cpuid \n"\
|
||||
"rdtsc"\
|
||||
: "=a"(_cl), "=d"(_ch)\
|
||||
: "a"(0)\
|
||||
: "%ebx", "%ecx");\
|
||||
_c_ = (uint64_t)_ch << 32 | _cl;\
|
||||
} while(0)
|
||||
#define RDTSC_INI(_c_) RDTSC(_c_)
|
||||
#endif
|
||||
#else
|
||||
#define RDTSC_INI(_c_)
|
||||
#define RDTSC(_c_)
|
||||
#endif
|
||||
|
||||
#define tmrdtscini() ({ uint64_t _c; __asm volatile("" ::: "memory"); RDTSC_INI(_c); _c; })
|
||||
#define tmrdtsc() ({ uint64_t _c; RDTSC(_c); _c; })
|
||||
|
||||
#ifndef TM_F
|
||||
#define TM_F 1.0 // TM_F=4 -> MI/s
|
||||
#endif
|
||||
|
||||
#ifdef RDTSC_ON
|
||||
#define tminit() tmrdtscini()
|
||||
#define tmtime() tmrdtsc()
|
||||
#define TM_T CLOCKS_PER_SEC
|
||||
static double TMBS(unsigned l, double t) { double dt = t, dl = l; return t/l; }
|
||||
#define TM_C 1000
|
||||
|
||||
#else
|
||||
#define TM_C 1
|
||||
static double TMBS(unsigned l, double t) { return (l/t)/1000000.0; }
|
||||
|
||||
#ifdef _WIN32
|
||||
static LARGE_INTEGER tps;
|
||||
static tm_t tmtime(void) {
|
||||
LARGE_INTEGER tm;
|
||||
tm_t t;
|
||||
QueryPerformanceCounter(&tm);
|
||||
return tm.QuadPart;
|
||||
}
|
||||
|
||||
static tm_t tminit() { tm_t t0,ts; QueryPerformanceFrequency(&tps); t0 = tmtime(); while((ts = tmtime())==t0) {}; return ts; }
|
||||
static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start)/tps.QuadPart; }
|
||||
static int tmiszero(tm_t t) { return !t; }
|
||||
#else
|
||||
#ifdef __APPLE__
|
||||
#include <AvailabilityMacros.h>
|
||||
#ifndef MAC_OS_X_VERSION_10_12
|
||||
#define MAC_OS_X_VERSION_10_12 101200
|
||||
#endif
|
||||
#define CIVETWEB_APPLE_HAVE_CLOCK_GETTIME (defined(__APPLE__) && defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12)
|
||||
#if !(CIVETWEB_APPLE_HAVE_CLOCK_GETTIME)
|
||||
#include <sys/time.h>
|
||||
#define CLOCK_REALTIME 0
|
||||
#define CLOCK_MONOTONIC 0
|
||||
// Fallback for macOS < 10.12, which lacks clock_gettime: emulate it with gettimeofday
// (microsecond resolution only). clk_id is ignored — CLOCK_REALTIME/CLOCK_MONOTONIC are
// both defined as 0 above.
int clock_gettime(int clk_id, struct timespec* t) { // fix: parameter must be named — an unnamed parameter in a definition is invalid C before C23
  struct timeval now;
  int rv = gettimeofday(&now, NULL);
  if (rv) return rv;                    // propagate the gettimeofday error code
  t->tv_sec  = now.tv_sec;
  t->tv_nsec = now.tv_usec * 1000;      // microseconds -> nanoseconds
  return 0;
}
|
||||
#endif
|
||||
#endif
|
||||
static tm_t tmtime() { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return tm; }
|
||||
static double tmdiff(tm_t start, tm_t stop) { return (stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1e9f; }
|
||||
static tm_t tminit() { tm_t t0 = tmtime(),t; while(!tmdiff(t = tmtime(),t0)) {}; return t; }
|
||||
static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//---------------------------------------- bench ----------------------------------------------------------------------
|
||||
// for each a function call is repeated until exceeding tm_tx seconds.
|
||||
// A run duration is always tm_tx seconds
|
||||
// The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision)
|
||||
|
||||
// sleep after each 8 runs to avoid cpu throttling.
|
||||
#define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0)
|
||||
|
||||
// benchmark loop
|
||||
#define TMBEG(_tm_Reps_) { unsigned _tm_r,_tm_c = 0,_tm_R,_tm_Rx = _tm_Reps_,_tm_Rn = _tm_Reps_; double _tm_t;\
|
||||
for(tm_rm = tm_rep, tm_tm = DBL_MAX, _tm_R = 0; _tm_R < _tm_Rn; _tm_R++) { tm_t _tm_t0 = tminit(); /*for each run*/\
|
||||
for(_tm_r = 0;_tm_r < tm_rm;) { /*repeat tm_rm times */
|
||||
|
||||
#define TMEND(_len_) \
|
||||
_tm_r++; if(tm_tm == DBL_MAX && (_tm_t = tmdiff(_tm_t0, tmtime())) > tm_tx) break;\
|
||||
}\
|
||||
/*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\
|
||||
/*other runs: break the loop only after 'tm_rm' repeats */ \
|
||||
_tm_t = tmdiff(_tm_t0, tmtime());\
|
||||
/*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\
|
||||
if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("[%d,%d] ", tm_rm, _tm_Rn);*/ } tm_tm = _tm_t; _tm_c++; }\
|
||||
else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\
|
||||
if(tm_verbose) { printf("%8.2f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\
|
||||
if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\
|
||||
}\
|
||||
}
|
||||
|
||||
static unsigned tm_rep = 1<<30, tm_Rep = 3, tm_Rep2 = 3, tm_rm, tm_RepMin = 1, tm_slp = 20, tm_verbose = 2;
|
||||
static tm_t tm_0, tm_T;
|
||||
static double tm_tm, tm_tx = 1, tm_TX = 60;
|
||||
|
||||
static void tm_init(int _tm_Rep, int _tm_verbose) { tm_verbose = _tm_verbose; if(_tm_Rep) tm_Rep = _tm_Rep; }
|
||||
|
||||
// TMBENCH: time _func_ and print throughput for _len_ bytes.
// Prints the name first when tm_verbose>1; a NULL name falls back to the
// stringized function call (#_func_). TMBS and TM_C are defined elsewhere —
// presumably throughput formatting and a scale constant; TODO confirm.
#define TMBENCH(_name_, _func_, _len_) do { if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\
TMBEG(tm_Rep) _func_; TMEND(_len_); \
double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_, dm*TM_C/dr) );\
} while(0)

// second TMBENCH. Example: use TMBENCH for encoding and TMBENCH2 for decoding
// Uses tm_Rep2 runs and prints the name AFTER the result.
#define TMBENCH2(_name_, _func_, _len_) do { \
TMBEG(tm_Rep2) _func_; TMEND(_len_);\
double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_, dm*TM_C/dr) );\
if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\
} while(0)

// Check: like TMBENCH but also verifies each call returns _res_;
// on mismatch prints both values and exits the process.
#define TMBENCHT(_name_,_func_, _len_, _res_) do { \
TMBEG(tm_Rep) \
if(_func_ != _res_) { printf("ERROR: %lld != %lld", (long long)_func_, (long long)_res_ ); exit(0); };\
TMEND(_len_);\
if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_,(double)tm_tm*TM_C/(double)tm_rm) );\
if(tm_verbose) printf("%s ", _name_?_name_:#_func_ );\
} while(0)
|
||||
//----------------------------------------------------------------------------------------------------------------------------------
|
||||
#define Kb (1u<<10)
#define Mb (1u<<20)
#define Gb (1u<<30)
#define KB 1000
#define MB 1000000
#define GB 1000000000

// Parse an unsigned integer command-line argument with an optional suffix.
//   K/M/G : decimal multipliers (1000 / 1000000 / 1000000000)
//   k/m/g : binary  multipliers (2^10 / 2^20 / 2^30)
//   B     : bytes — return n unscaled
//   b     : bits  — return 1<<n (clamped to 0xffffffff for n >= 32,
//                   avoiding an undefined over-wide shift)
//   none  : return n*def when def != 0, otherwise treat n as a bit count
// NOTE: result and intermediates are 32-bit; large suffixed values wrap.
static unsigned argtoi(char *s, unsigned def) {
  char *p;
  unsigned n = strtol(s, &p, 10), f = 1;
  switch(*p) {
    case 'K': f = KB; break;
    case 'M': f = MB; break;
    case 'G': f = GB; break;
    case 'k': f = Kb; break;
    case 'm': f = Mb; break;
    case 'g': f = Gb; break;
    case 'B': return n;                 // unscaled (dead 'break' after 'return' removed)
    case 'b': def = 0;                  /* fallthrough: force power-of-two */
    default : if(!def) return n >= 32 ? 0xffffffffu : (1u << n);
              f = def;
  }
  return n*f;
}
|
||||
static uint64_t argtol(char *s) {
|
||||
char *p;
|
||||
uint64_t n = strtol(s, &p, 10),f=1;
|
||||
switch(*p) {
|
||||
case 'K': f = KB; break;
|
||||
case 'M': f = MB; break;
|
||||
case 'G': f = GB; break;
|
||||
case 'k': f = Kb; break;
|
||||
case 'm': f = Mb; break;
|
||||
case 'g': f = Gb; break;
|
||||
case 'B': return n; break;
|
||||
case 'b': return 1u << n;
|
||||
default: f = MB;
|
||||
}
|
||||
return n*f;
|
||||
}
|
||||
|
||||
// Parse a time argument and return it in milliseconds.
// Suffixes: 'h' hours, 'm' minutes, 's' seconds, 'M' milliseconds.
// A missing or unrecognized suffix is interpreted as seconds.
static uint64_t argtot(char *s) {
  char *end;
  uint64_t num   = strtol(s, &end, 10);
  uint64_t scale = 1000;                     // default: seconds -> ms
  if     (*end == 'h') scale = 3600000;      // hours
  else if(*end == 'm') scale = 60000;        // minutes
  else if(*end == 'M') scale = 1;            // already milliseconds
  else if(*end == 's') scale = 1000;         // seconds (explicit)
  return num * scale;
}
|
||||
|
||||
// Copy n bytes from in to out, bitwise-complementing each byte (out[i] = ~in[i]).
// NOTE: despite the name this is a complement copy, not a reverse copy.
// Loop index changed from 'int' to 'unsigned' — the old signed/unsigned
// comparison against n misbehaved for n > INT_MAX.
static void memrcpy(unsigned char *out, unsigned char *in, unsigned n) { unsigned i; for(i = 0; i < n; i++) out[i] = ~in[i]; }
|
||||
|
||||
113
transpose.h
113
transpose.h
@ -1,113 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// transpose.h - Byte/Nibble transpose for further compressing with lz77 or other compressors
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
// Syntax
|
||||
// in : Input buffer
|
||||
// n : Total number of bytes in input buffer
|
||||
// out : output buffer
|
||||
// esize : element size in bytes (ex. 2, 4, 8,... )
|
||||
|
||||
//---------- High level functions with dynamic cpu detection and JIT scalar/sse/avx2 switching
|
||||
void tpenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // transpose
|
||||
void tpdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // reverse transpose
|
||||
|
||||
void tp2denc(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize); //2D transpose
|
||||
void tp2ddec(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize);
|
||||
void tp3denc(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); //3D transpose
|
||||
void tp3ddec(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize);
|
||||
void tp4denc(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); //4D transpose
|
||||
void tp4ddec(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize);
|
||||
|
||||
// Nibble transpose SIMD (SSE2,AVX2, ARM Neon)
|
||||
void tp4enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
|
||||
void tp4dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
|
||||
|
||||
// bit transpose
|
||||
//void tp1enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
|
||||
//void tp1dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
|
||||
|
||||
//---------- Low level functions ------------------------------------
|
||||
void tpenc2( unsigned char *in, unsigned n, unsigned char *out); // scalar
|
||||
void tpenc3( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc16( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tpdec2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec3( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec16( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tpenc128v2( unsigned char *in, unsigned n, unsigned char *out); // sse2
|
||||
void tpdec128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tp4enc128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4enc128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4enc128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tp1enc128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1dec128v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1enc128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1dec128v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1enc128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp1dec128v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tpenc256v2( unsigned char *in, unsigned n, unsigned char *out); // avx2
|
||||
void tpdec256v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc256v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec256v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpenc256v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tpdec256v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
void tp4enc256v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec256v2( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4enc256v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec256v4( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4enc256v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
void tp4dec256v8( unsigned char *in, unsigned n, unsigned char *out);
|
||||
|
||||
//------- CPU instruction set
|
||||
// cpuiset = 0: return current simd set,
|
||||
// cpuiset != 0: set simd set 0:scalar, 20:sse2, 52:avx2
|
||||
unsigned cpuini(unsigned cpuiset);
|
||||
|
||||
// convert simd set to string: "sse2", "sse3", "sse4.1" or "avx2"
|
||||
// Ex.: printf("current cpu set=%s\n", cpustr(cpuini(0)) );
|
||||
char *cpustr(unsigned cpuisa);
|
||||
|
||||
unsigned cpuisa(void);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
72
trle.h
72
trle.h
@ -1,72 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2015-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- email : powturbo [AT] gmail.com
|
||||
- github : https://github.com/powturbo
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- twitter : https://twitter.com/powturbo
|
||||
|
||||
TurboRLE - "Most efficient and fastest Run Length Encoding"
|
||||
**/
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
// RLE with specified escape char
|
||||
unsigned _srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
|
||||
unsigned _srled8( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint8_t e);
|
||||
|
||||
unsigned _srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
|
||||
unsigned _srled16(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint16_t e);
|
||||
|
||||
unsigned _srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
|
||||
unsigned _srled32(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint32_t e);
|
||||
|
||||
unsigned _srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
|
||||
unsigned _srled64(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint64_t e);
|
||||
|
||||
// functions w/ overflow handling
|
||||
unsigned srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
|
||||
unsigned srled8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint8_t e);
|
||||
|
||||
unsigned srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
|
||||
unsigned srled16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint16_t e);
|
||||
|
||||
unsigned srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
|
||||
unsigned srled32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint32_t e);
|
||||
|
||||
unsigned srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
|
||||
unsigned srled64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint64_t e);
|
||||
|
||||
// RLE w. automatic escape char determination
|
||||
unsigned srlec( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
|
||||
unsigned _srled( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
|
||||
unsigned srled( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
|
||||
|
||||
// Turbo RLE
|
||||
unsigned trlec( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
|
||||
unsigned _trled( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
|
||||
unsigned trled( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
401
vint.h
401
vint.h
@ -1,401 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" variable byte include header (scalar TurboVByte+ SIMD TurboByte)
|
||||
#ifndef _VINT_H_
|
||||
#define _VINT_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef VINT_IN
|
||||
#include "conf.h"
|
||||
//----------------------------------- Variable byte: single value macros (low level) -----------------------------------------------
|
||||
//------------- 32 bits -------------
|
||||
// "vbx" extended variable byte, 32-bit flavor.
// Tag scheme: the number of leading 1 bits in the FIRST byte gives the number
// of extra payload bytes (0xxxxxxx:0, 10xxxxxx:1, 110xxxxx:2, 1110xxxx:3,
// 1111xxxx:4) — see the branch thresholds in _vbxput32/_vbxget32 below.
// ctou16/ctou32 and bswap16/bswap32 come from conf.h — presumably unaligned
// load/store helpers and byte swaps; TODO confirm.
// _vtab32_ maps the first byte's high nibble to the total encoded length.
extern unsigned char _vtab32_[];
// length of an encoded value, derived from its first byte
#define _vbxvlen32(_x_) _vtab32_[(unsigned char)(_x_)>>4] // (clz32((_x_) ^ 0xff) - 23) //
// encoded length of value _x_: one byte per started 7-bit group (|1 guards x==0)
#define _vbxlen32(_x_) ((bsr32(_x_|1)+6)/7)

// Encode _x_ at _op_ (pointer advanced past the encoding), then run _act_.
#define _vbxput32(_op_, _x_, _act_) {\
if(likely((_x_) < (1<< 7))) { *_op_++ = _x_; _act_;}\
else if(likely((_x_) < (1<<14))) { ctou16(_op_) = bswap16((_x_) | 0x8000u); _op_ += 2; _act_;}\
else if(likely((_x_) < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0u; ctou16(_op_) = _x_; _op_ += 2; _act_;}\
else if(likely((_x_) < (1<<28))) { ctou32(_op_) = bswap32((_x_) | 0xe0000000u); _op_ += 4; _act_;}\
else { *_op_++ = (unsigned long long)(_x_) >> 32 | 0xf0u; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
}

// Decode one value at _ip_ into _x_ (pointer advanced), then run _act_.
// Each branch tests one more tag bit; the masks strip the tag from the payload.
#define _vbxget32(_ip_, _x_, _act_) do { _x_ = (unsigned)(*_ip_++);\
if(!(_x_ & 0x80u)) { _act_;}\
else if(!(_x_ & 0x40u)) { _x_ = bswap16(ctou16(_ip_ - 1) & 0xff3fu); _ip_++; _act_;}\
else if(!(_x_ & 0x20u)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\
else if(!(_x_ & 0x10u)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0fu); _ip_ += 3; _act_;}\
else { _x_ = (unsigned long long)((_x_) & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\
} while(0)
|
||||
|
||||
//------------- 64 bits -----------
|
||||
// "vbx" 64-bit flavor: same unary-prefix tag scheme extended up to 9 bytes
// (a first byte of 0xff means 8 raw payload bytes follow).
#define _vbxlen64(_x_) ((bsr64(_x_)+6)/7)
// length of an encoded value from its first byte
#define _vbxvlen64(_x_) ((_x_)==0xff?9:clz32((_x_) ^ 0xff) - 23)

// Encode 64-bit _x_ at _op_ (pointer advanced), then run _act_.
// Branch thresholds: 7/14/21/28/35/42/49/56 value bits -> 1..9 encoded bytes.
#define _vbxput64(_op_, _x_, _act_) {\
if(likely(_x_ < (1<< 7))) { *_op_++ = _x_; _act_;}\
else if(likely(_x_ < (1<<14))) { ctou16(_op_) = bswap16(_x_| 0x8000); _op_ += 2; _act_;}\
else if(likely(_x_ < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0; ctou16(_op_) = _x_; _op_ += 2; _act_;}\
else if(likely(_x_ < (1<<28))) { ctou32(_op_) = bswap32(_x_| 0xe0000000); _op_ += 4; _act_;}\
else if( _x_ < 1ull<<35) { *_op_++ = _x_ >> 32 | 0xf0; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<42) { ctou16(_op_) = bswap16(_x_ >> 32 | 0xf800); _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<49) { *_op_++ = _x_ >> 48 | 0xfc; ctou16(_op_) = _x_ >> 32; _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\
else if( _x_ < 1ull<<56) { ctou64(_op_) = bswap64(_x_ | 0xfe00000000000000ull); _op_ += 8; _act_;}\
else { *_op_++ = 0xff; ctou64(_op_) = _x_; _op_ += 8; _act_;}\
}

// Decode one 64-bit value at _ip_ into _x_ (pointer advanced), then run _act_.
// Each branch tests the next tag bit of the first byte; masks strip the tag.
#define _vbxget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(!(_x_ & 0x80)) { _act_;}\
else if(!(_x_ & 0x40)) { _x_ = bswap16(ctou16(_ip_++-1) & 0xff3f); _act_;}\
else if(!(_x_ & 0x20)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\
else if(!(_x_ & 0x10)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0f); _ip_ += 3; _act_;}\
else if(!(_x_ & 0x08)) { _x_ = (_x_ & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\
else if(!(_x_ & 0x04)) { _x_ = (unsigned long long)(bswap16(ctou16(_ip_-1)) & 0x7ff) << 32 | ctou32(_ip_+1); _ip_ += 5; _act_;}\
else if(!(_x_ & 0x02)) { _x_ = (_x_ & 0x03)<<48 | (unsigned long long)ctou16(_ip_) << 32 | ctou32(_ip_+2); _ip_ += 6; _act_;}\
else if(!(_x_ & 0x01)) { _x_ = bswap64(ctou64(_ip_-1)) & 0x01ffffffffffffffull; _ip_ += 7; _act_;}\
else { _x_ = ctou64(_ip_); _ip_ += 8; _act_;}\
} while(0)
|
||||
|
||||
// Convenience wrappers: encode/decode a single "vbx" value and advance the
// pointer. The put macros copy _x_ into a local first so the argument is
// evaluated only once; 16-bit values reuse the 32-bit codec, 8-bit values
// are stored verbatim.
#define vbxput64(_op_, _x_) { unsigned long long _x = _x_; _vbxput64(_op_, _x, ;); }
#define vbxput32(_op_, _x_) { register unsigned _x = _x_; _vbxput32(_op_, _x, ;); }
#define vbxput16(_op_, _x_) vbxput32(_op_, _x_)
#define vbxput8( _op_, _x_) (*_op_++ = _x_)

#define vbxget64(_ip_, _x_) _vbxget64(_ip_, _x_, ;)
#define vbxget32(_ip_, _x_) _vbxget32(_ip_, _x_, ;)
#define vbxget16(_ip_, _x_) vbxget32(_ip_,_x_)
#define vbxget8(_ip_, _x_) (_x_ = *_ip_++)
|
||||
//---------------------------------------------------------------------------
|
||||
// TurboVByte layout constants. First-byte tag ranges (computed values):
//   VB_BA3  = 249    : first bytes 249..254 introduce 3..8 raw payload bytes
//   VB_BA2  = 241    : first bytes 241..248 introduce a 3-byte encoding
//   VB_OFS1 = 177    : values below this fit in a single byte
//   VB_OFS2 = 16561  : upper bound of the 2-byte range
//   VB_OFS3 = 540849 : upper bound of the 3-byte range
#define VB_SIZE 64
#define VB_MAX 254
#define VB_B2 6
#define VB_B3 3
#define VB_BA3 (VB_MAX - (VB_SIZE/8 - 3))
#define VB_BA2 (VB_BA3 - (1<<VB_B3))

#define VB_OFS1 (VB_BA2 - (1<<VB_B2))
#define VB_OFS2 (VB_OFS1 + (1 << (8+VB_B2)))
#define VB_OFS3 (VB_OFS2 + (1 << (16+VB_B3)))
|
||||
|
||||
// TurboVByte 32-bit codec: a first byte below VB_OFS1 is the value itself;
// tags VB_OFS1..VB_BA2-1 and VB_BA2..VB_BA3-1 select biased 2-/3-byte
// encodings; tags >= VB_BA3 introduce a raw little-endian payload of
// (tag - VB_BA3) + 3 bytes.
// encoded length of value _x_
#define _vblen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_OFS2?2:((_x_) < VB_OFS3)?3:(bsr32(_x_)+7)/8+1))
// length from the first compressed byte
// NOTE(review): the last branch yields (tag - VB_BA3), while _vbget32 consumes
// 4 + (tag - VB_BA3) bytes for such tags — confirm the intended semantics.
#define _vbvlen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_BA2?2:((_x_) < VB_BA3)?3:(_x_-VB_BA3)))

// Encode _x_ at _op_ (pointer advanced), then run _act_.
// WARNING: the 3-byte branch modifies _x_ in place ((_x_) -= VB_OFS2).
#define _vbput32(_op_, _x_, _act_) {\
if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\
else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\
else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\
else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\
}

// Decode one value at _ip_ into _x_ (pointer advanced), then run _act_.
// Branches undo the encoder's biases; the raw branch masks the payload to
// (tag - VB_BA3) + 3 bytes.
#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(likely(_x_ < VB_OFS1)) { _act_ ;}\
else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \
else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\
else { unsigned _b = _x_-VB_BA3; _x_ = ctou32(_ip_) & ((1u << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\
} while(0)
|
||||
|
||||
// TurboVByte 64-bit codec: same tag layout as the 32-bit version; only the
// raw-payload branch widens to 64-bit loads/stores.
#define _vblen64(_x_) _vblen32(_x_)
#define _vbvlen64(_x_) _vbvlen32(_x_)
// Encode 64-bit _x_ at _op_ (pointer advanced), then run _act_.
// WARNING: the 3-byte branch modifies _x_ in place ((_x_) -= VB_OFS2).
#define _vbput64(_op_, _x_, _act_) {\
if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\
else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\
else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\
else { unsigned _b = (bsr64((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou64(_op_) = (_x_); _op_ += _b; _act_;}\
}

// Decode one 64-bit value at _ip_ into _x_ (pointer advanced), then run _act_.
#define _vbget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(likely(_x_ < VB_OFS1)) { _act_ ;}\
else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \
else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\
else { unsigned _b = _x_-VB_BA3; _x_ = ctou64(_ip_) & ((1ull << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\
} while(0)
|
||||
|
||||
// Unlocked stdio shims. Both branches currently map to the plain (locking)
// fputc/fgetc — the true unlocked variants are commented out. NOTE(review):
// on glibc these #defines shadow the library's own fputc_unlocked /
// fgetc_unlocked functions; confirm this is intentional.
#ifdef _WIN32
//#define fgetc_unlocked(_f_) _fgetc_nolock(_f_)
#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_)
#define fgetc_unlocked(_f_) fgetc(_f_)
#else
#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_) //_IO_putc_unlocked(_c_,_f_)
#define fgetc_unlocked(_f_) fgetc(_f_) //_IO_getc_unlocked(_f_)
#endif
|
||||
|
||||
// leb128put: base-128 encoding, 7 bits per byte, least-significant group first.
// NOTE: the continuation convention is INVERTED vs. standard LEB128 — the
// TERMINATING byte has bit 7 set; continuation bytes have it clear.
#define leb128put(_op_, _x_) { uint64_t _x = _x_; while(_x > 0x7f) { *_op_++ = _x & 0x7f; _x >>= 7; } *_op_++ = _x | 0x80; }
// same encoding written to a FILE* (GNU statement expression)
#define vbfput32(_f_, _x_) ({ uint64_t _x = _x_; while(_x > 0x7f) { fputc_unlocked(_x & 0x7f, _f_); _x >>= 7; } fputc_unlocked(_x | 0x80, _f_); })

// _leb128get: decode the encoding above into _x_; _act_ runs once the
// terminator (bit 7 set) is consumed. The 'unsigned' shift limits reliable
// decoding to 32-bit payloads — TODO confirm for 64-bit targets.
#define _leb128get(_ip_, _x_, _act_) { unsigned _sft=0; for(_x_=0;;_sft += 7) { unsigned _c = *_ip_++; _x_ += (_c & 0x7f) << _sft; if(_c >= 0x80) { _act_; break; } } }
// FIX: previously expanded to vbgetax(), which is defined nowhere — any use of
// leb128get failed to compile. Map it to the actual decoder _leb128get.
#define leb128get(_ip_, _x_) _leb128get(_ip_, _x_, ;)
// decode from a FILE*; returns EOF on end of stream (GNU statement expression)
#define vbfget32(_f_ ) ({ unsigned _sft=0,_x=0; for(;;_sft += 7) { unsigned _c = fgetc_unlocked(_f_); if(_c != EOF) { _x += (_c & 0x7f) << _sft; if(_c & 0x80) break; } else { _x = EOF; break; } } _x; })
|
||||
|
||||
//------------- 16 bits -----------
|
||||
// 16-bit values reuse the 32-bit TurboVByte codec unchanged.
#define _vblen16(_x_) _vblen32(_x_)
#define _vbvlen16(_x_) _vbvlen32(_x_)

#define _vbput16(_op_, _x_, _act_) _vbput32(_op_, _x_, _act_)
#define _vbget16(_ip_, _x_, _act_) _vbget32(_ip_, _x_, _act_)

// 8-bit values are stored verbatim: always exactly one byte.
#define _vblen8(_x_) 1
#define _vbvlen8(_x_) 1
#define _vbput8(_op_, _x_, _act_) { *_op_++ = _x_; _act_; }
#define _vbget8(_ip_, _x_, _act_) { _x_ = *_ip_++; _act_; }
|
||||
//----------------------------------- Variable byte: single value functions -----------------------------------------------
|
||||
// ---- Variable byte length after compression
|
||||
// vblenNN(x): number of bytes value x will occupy once TurboVByte-encoded.
static inline unsigned vblen16(unsigned short x) { return _vblen16(x); }
static inline unsigned vblen32(unsigned x) { return _vblen32(x); }
static inline unsigned vblen64(uint64_t x) { return _vblen64(x); }

// vbvlenNN(b): length of a compressed value given its FIRST byte b,
// e.g. vbvlen32(in[0]).
static inline unsigned vbvlen16(unsigned x) { return _vbvlen32(x); }
static inline unsigned vbvlen32(unsigned x) { return _vbvlen32(x); }
static inline unsigned vbvlen64(unsigned x) { return _vbvlen64(x); }
||||
|
||||
//----- encode/decode 16/32/64 single value and advance output/input pointer
|
||||
// Single-value TurboVByte wrappers: encode/decode one value and advance the
// pointer. The put macros copy _x_ into a local so the argument is evaluated
// only once (and because _vbput32/64 modify their value argument in place);
// 16-bit values reuse the 32-bit codec, 8-bit values are stored verbatim.
#define vbput64(_op_, _x_) { unsigned long long _x = _x_; _vbput64(_op_, _x, ;); }
#define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); }
#define vbput16(_op_, _x_) vbput32(_op_, _x_)
#define vbput8(_op_, _x_) (*_op_++ = _x_)

#define vbget64(_ip_, _x_) _vbget64(_ip_, _x_, ;)
#define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;)
#define vbget16(_ip_, _x_) vbget32(_ip_,_x_)
#define vbget8(_ip_, _x_) (_x_ = *_ip_++)
|
||||
#endif
|
||||
//----------------------------- TurboVByte 'vb':Variable byte + SIMD TurboByte 'v8': array functions ----------------------------------------
|
||||
// Encoding/DEcoding: Return value = end of compressed output/input buffer out/in
|
||||
|
||||
//----------------------- Encoding/Decoding unsorted array with n integer values --------------------------
|
||||
unsigned char *vbenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); //TurboVByte
|
||||
unsigned char *vbenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *vbenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
//-- Decode
|
||||
unsigned char *vbdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out);
|
||||
unsigned char *vbdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out);
|
||||
unsigned char *vbdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
|
||||
|
||||
//-- Get value stored at index idx (idx:0...n-1)
|
||||
unsigned short vbgetx16( unsigned char *__restrict in, unsigned idx);
|
||||
unsigned vbgetx32( unsigned char *__restrict in, unsigned idx);
|
||||
uint64_t vbgetx64( unsigned char *__restrict in, unsigned idx);
|
||||
|
||||
//-- Search and return index of next value equal to key or n when no key value found
|
||||
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vbgeteq32(&ip, n, idx, 4321))>=n) break; printf("found at %u ", idx); }
|
||||
unsigned vbgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key);
|
||||
unsigned vbgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key);
|
||||
unsigned vbgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key);
|
||||
|
||||
//---------------------- Delta encoding/decoding sorted array ---------------------------------------------
|
||||
//-- Increasing integer array. out[i] = out[i-1] + in[i]
|
||||
unsigned char *vbdenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbdenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbdenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//-- Get value stored at index idx (idx:0...n-1)
|
||||
unsigned short vbdgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
|
||||
unsigned vbdgetx32( unsigned char *__restrict in, unsigned idx, unsigned start);
|
||||
uint64_t vbdgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
|
||||
|
||||
//-- Search and return index of next value equal to key or n when no key value found
|
||||
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); }
|
||||
unsigned vbdgetgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start);
|
||||
unsigned vbdgetgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start);
|
||||
unsigned vbdgetgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start);
|
||||
|
||||
//-- Strictly increasing (never remaining constant or decreasing) integer array. out[i] = out[i-1] + in[i] + 1
|
||||
unsigned char *vbd1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbd1enc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbd1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbd1dec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
|
||||
//-- Get value stored at index idx (idx:0...n-1)
|
||||
unsigned short vbd1getx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
|
||||
unsigned vbd1getx32( unsigned char *__restrict in, unsigned idx, unsigned start);
|
||||
uint64_t vbd1getx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
|
||||
|
||||
//-- Search and return index of next value equal to key or n when no key value found
|
||||
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); }
|
||||
unsigned vbd1getgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start);
|
||||
unsigned vbd1getgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start);
|
||||
unsigned vbd1getgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start);
|
||||
|
||||
//---------------------- Zigzag encoding/decoding for unsorted integer lists.
|
||||
unsigned char *vbzenc8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
|
||||
unsigned char *vbzenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbzenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbzenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbzdec8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
|
||||
unsigned char *vbzdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbzdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbzdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//---------------------- XOR encoding/decoding for unsorted integer lists.
|
||||
unsigned char *vbxenc8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
|
||||
unsigned char *vbxenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbxenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbxenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbxdec8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start);
|
||||
unsigned char *vbxdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbxdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbxdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//---------------------- Delta of delta encoding/decoding for unsorted integer lists.
|
||||
unsigned char *vbddenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *vbddenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
unsigned char *vbddenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *vbdddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *vbdddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
unsigned char *vbdddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//-- Get value stored at index idx (idx:0...n-1)
|
||||
unsigned short vbzgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start);
|
||||
unsigned vbzgetx32( unsigned char *__restrict in, unsigned idx, unsigned start);
|
||||
uint64_t vbzgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start);
|
||||
|
||||
//-- Search and return index of next value equal to key or n when no key value found
|
||||
// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); }
|
||||
/*unsigned vbzgeteq15( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start);
|
||||
unsigned vbzgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start);
|
||||
unsigned vbzgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key, unsigned start);
|
||||
unsigned vbzgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key, unsigned start);*/
|
||||
|
||||
//-------------------------- TurboByte (SIMD Group varint) --------------------------------------------------------------
|
||||
unsigned char *v8enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); //TurboByte
|
||||
unsigned char *v8enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *v8dec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out);
|
||||
unsigned char *v8dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out);
|
||||
|
||||
//------ delta ---------
|
||||
unsigned char *v8denc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *v8denc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *v8ddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *v8ddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
//------ delta 1 -------
|
||||
unsigned char *v8d1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *v8d1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *v8d1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *v8d1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
//------- zigzag -------
|
||||
unsigned char *v8zenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *v8zenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *v8zdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *v8zdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
|
||||
//------- xor ----------
|
||||
unsigned char *v8xenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start);
|
||||
unsigned char *v8xenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start);
|
||||
|
||||
unsigned char *v8xdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start);
|
||||
unsigned char *v8xdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start);
|
||||
//-------------------------- TurboByte Hybrid (SIMD Group varint) + Bitpacking -------------------------------------------
|
||||
size_t v8nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8nd1enc16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nd1enc32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8nzenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nzenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nd1dec16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nd1dec32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nzdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nxdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nxdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
//-------------
|
||||
size_t v8nenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8nd1enc128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8nzenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nzenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nddec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
size_t v8nxdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t v8nxdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
//-------------
|
||||
size_t v8nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nzenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t v8nxenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t v8ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t v8nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t v8nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t v8nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t v8nxdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
355
vp4.h
355
vp4.h
@ -1,355 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "TurboPFor: Integer Compression" PFor/PForDelta + Direct access
|
||||
#ifndef VP4_H_
|
||||
#define VP4_H_
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1600
|
||||
#include "vs/stdint.h"
|
||||
#else
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
//************************************************ High level API - n unlimited ****************************************************
|
||||
// Compress integer array with n values to the buffer out.
|
||||
// Return value = number of bytes written to compressed buffer out
|
||||
size_t p4nenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); // SIMD (Vertical bitpacking)
|
||||
size_t p4nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc256w32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
|
||||
size_t p4ndenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4ndenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t p4nd1enc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nd1enc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
size_t p4nzenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
size_t p4nzenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
// Decompress the compressed n values in input buffer in to the integer array out.
|
||||
// Return value = number of bytes read from the ompressed buffer in
|
||||
size_t p4ndec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t p4ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4ndec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t p4ndec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4ndec128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
size_t p4ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
|
||||
// Delta minimum = 0
|
||||
size_t p4nddec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t p4nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nddec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nddec256w32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nddec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
// Delta minimum = 1
|
||||
size_t p4nd1dec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t p4nd1dec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nd1dec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nd1dec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
//Zigzag
|
||||
size_t p4nzdec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out);
|
||||
size_t p4nzdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out);
|
||||
size_t p4nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out);
|
||||
size_t p4nzdec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
|
||||
//************** Low level API - n limited to 128/256 ***************************************
|
||||
#define P4D_MAX 256
|
||||
|
||||
// -------------- TurboPFor: Encode ------------
|
||||
//#include <assert.h>
|
||||
// Low level API: Single block n limited
|
||||
//compress integer array with n values to the buffer out. Return value = end of compressed buffer out
|
||||
unsigned char *p4enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out); // SSE (Vertical bitpacking)
|
||||
unsigned char *p4enc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4enc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out); // AVX2
|
||||
unsigned char *p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4enc256w32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4encx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out);// Direct access
|
||||
unsigned char *p4encx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4encx32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
unsigned char *p4encx64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out);
|
||||
|
||||
unsigned char *p4denc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
|
||||
unsigned char *p4denc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4denc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4denc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4denc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4denc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4denc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4denc256w32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
|
||||
unsigned char *p4dencx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); // Direct access
|
||||
unsigned char *p4dencx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4dencx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
|
||||
unsigned char *p4d1enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
|
||||
unsigned char *p4d1enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4d1enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1enc128v16(uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); // SIMD (Vertical bitpacking)
|
||||
unsigned char *p4d1enc128v32(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1enc256v32(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4d1encx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); // Direct access
|
||||
unsigned char *p4d1encx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4d1encx32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
|
||||
unsigned char *p4zenc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start);
|
||||
unsigned char *p4zenc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4zenc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4zenc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start);
|
||||
unsigned char *p4zenc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4zenc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start);
|
||||
unsigned char *p4zenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start);
|
||||
|
||||
unsigned char *p4senc16(uint16_t *in, unsigned n, unsigned char *out, uint16_t start);
|
||||
unsigned char *p4senc32(uint32_t *in, unsigned n, unsigned char *out, uint32_t start);
|
||||
unsigned char *p4senc64(uint64_t *in, unsigned n, unsigned char *out, uint64_t start);
|
||||
|
||||
unsigned char *p4sdec16(unsigned char *in, unsigned n, uint16_t *out, uint16_t start);
|
||||
unsigned char *p4sdec32(unsigned char *in, unsigned n, uint32_t *out, uint32_t start);
|
||||
unsigned char *p4sdec64(unsigned char *in, unsigned n, uint64_t *out, uint64_t start);
|
||||
|
||||
size_t p4nsenc16(uint16_t *in, size_t n, unsigned char *out);
|
||||
size_t p4nsenc32(uint32_t *in, size_t n, unsigned char *out);
|
||||
size_t p4nsenc64(uint64_t *in, size_t n, unsigned char *out);
|
||||
|
||||
size_t p4nsdec16(unsigned char *in, size_t n, uint16_t *out);
|
||||
size_t p4nsdec32(unsigned char *in, size_t n, uint32_t *out);
|
||||
size_t p4nsdec64(unsigned char *in, size_t n, uint64_t *out);
|
||||
|
||||
// same as p4enc, but with b and bx as parameters. Call after _p4bitsXX
|
||||
inline unsigned char *_p4enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4enc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
|
||||
inline unsigned char *_p4enc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
|
||||
inline unsigned char *_p4enc128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking)
|
||||
inline unsigned char *_p4enc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx);
|
||||
// calculate the best bit sizes b and bx, return b.
|
||||
unsigned _p4bits8( uint8_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bits16( uint16_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bits32( uint32_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bits64( uint64_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
|
||||
unsigned _p4bitsx8( uint8_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bitsx16( uint16_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bitsx32( uint32_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
unsigned _p4bitsx64( uint64_t *__restrict in, unsigned n, unsigned *pbx);
|
||||
|
||||
#define P4HVE(_out_, _b_, _bx_,_usize_) do { if(!_bx_) *_out_++ = _b_;else if(_bx_ <= _usize_) *_out_++ = 0x80|_b_, *_out_++ = _bx_; else *_out_++= (_bx_ == _usize_+1?0x40:0xc0)|_b_; } while(0)
|
||||
|
||||
#define P4HVE8( _out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_, 8)
|
||||
#define P4HVE16(_out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_,16)
|
||||
#define P4HVE32(_out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_,32)
|
||||
#define P4HVE64(_out_, _b_, _bx_) do { unsigned _c = _b_==64?64-1:_b_; P4HVE(_out_, _c, _bx_,64); } while(0)
|
||||
|
||||
//---------------------------- TurboPFor: Decode --------------------------------------------------------
|
||||
// decompress a previously (with p4enc32) bit packed array. Return value = end of packed buffer in
|
||||
//-- scalar. (see p4getx32 for direct access)
|
||||
// b and bx specified (not stored within the compressed stream header)
|
||||
inline unsigned char *_p4dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical BitPacking)
|
||||
inline unsigned char *_p4dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec128v64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx);
|
||||
|
||||
unsigned char *p4dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out);
|
||||
unsigned char *p4dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out);
|
||||
unsigned char *p4dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
|
||||
unsigned char *p4dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out); // SIMD (Vertical BitPacking)
|
||||
unsigned char *p4dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
|
||||
unsigned char *p4dec128v64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
|
||||
unsigned char *p4dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out);
|
||||
unsigned char *p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out);
|
||||
//------ Delta decoding --------------------------- Return value = end of packed input buffer in ---------------------------
|
||||
//-- Increasing integer lists. out[i] = out[i-1] + in[i]
|
||||
// b and bx specified
|
||||
unsigned char *_p4ddec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
|
||||
|
||||
unsigned char *p4ddec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
|
||||
unsigned char *p4ddec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
|
||||
unsigned char *p4ddec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4ddec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
|
||||
unsigned char *p4ddec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4ddec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//-- Strictly increasing (never remaining constant or decreasing) integer lists. out[i] = out[i-1] + in[i] + 1
|
||||
// b and bx specified (see idxcr.c/idxqry.c for an example)
|
||||
unsigned char *_p4d1dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec128v16(unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); // SIMD (Vertical BitPacking)
|
||||
unsigned char *_p4d1dec128v32(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec256v32(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
unsigned char *_p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
|
||||
|
||||
unsigned char *p4d1dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
|
||||
unsigned char *p4d1dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
|
||||
unsigned char *p4d1dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
|
||||
unsigned char *p4d1dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
// ZigZag encoding
|
||||
inline unsigned char *_p4zdec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx);
|
||||
inline unsigned char *_p4zdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx);
|
||||
|
||||
unsigned char *p4zdec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start);
|
||||
unsigned char *p4zdec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start);
|
||||
unsigned char *p4zdec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4zdec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking)
|
||||
unsigned char *p4zdec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4zdec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start);
|
||||
unsigned char *p4zdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start);
|
||||
|
||||
//---------------- Direct Access functions to compressed TurboPFor array p4encx16/p4encx32 -------------------------------------------------------
|
||||
#ifdef TURBOPFOR_DAC
|
||||
#include "conf.h"
|
||||
#define P4D_PAD8(_x_) ( (((_x_)+8-1)/8) )
|
||||
#define P4D_B(_x_) ((_x_) & 0x7f)
|
||||
#define P4D_XB(_x_) (((_x_) & 0x80)?((_x_) >> 8):0)
|
||||
#define P4D_ININC(_in_, _x_) _in_ += 1+((_x_) >> 7)
|
||||
|
||||
static inline unsigned p4bits(unsigned char *__restrict in, int *bx) { unsigned i = ctou16(in); *bx = P4D_XB(i); return P4D_B(i); }
|
||||
|
||||
struct p4 {
|
||||
unsigned long long *xmap;
|
||||
unsigned char *ex;
|
||||
unsigned isx,bx,cum[P4D_MAX/64+1];
|
||||
int oval,idx;
|
||||
};
|
||||
|
||||
static unsigned long long p4xmap[P4D_MAX/64+1] = { 0 };
|
||||
|
||||
// prepare direct access usage
|
||||
static inline void p4ini(struct p4 *p4, unsigned char **pin, unsigned n, unsigned *b) { unsigned char *in = *pin;
|
||||
unsigned p4i = ctou16(in);
|
||||
p4->isx = p4i&0x80;
|
||||
*b = P4D_B(p4i);
|
||||
p4->bx = P4D_XB(p4i); //printf("p4i=%x,b=%d,bx=%d ", p4->i, *b, p4->bx); //assert(n <= P4D_MAX);
|
||||
*pin = p4->ex = ++in;
|
||||
if(p4->isx) {
|
||||
unsigned num=0,j;
|
||||
unsigned char *p;
|
||||
++in;
|
||||
p4->xmap = (unsigned long long *)in;
|
||||
for(j=0; j < n/64; j++) { p4->cum[j] = num; num += popcnt64(ctou64(in+j*8)); }
|
||||
if(n & 0x3f) num += popcnt64(ctou64(in+j*8) & ((1ull<<(n&0x3f))-1) );
|
||||
p4->ex = p = in + (n+7)/8;
|
||||
*pin = p = p4->ex+(((uint64_t)num*p4->bx+7)/8);
|
||||
} else p4->xmap = p4xmap;
|
||||
p4->oval = p4->idx = -1;
|
||||
}
|
||||
|
||||
//---------- Get a single value with index "idx" from a "p4encx32" packed array
|
||||
static ALWAYS_INLINE uint8_t p4getx8( struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx8( in, idx, b);
|
||||
if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx8(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<<cl)), p4->bx) << b;
|
||||
return u;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE uint16_t p4getx16(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx16(in, idx, b);
|
||||
if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx16(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<<cl)), p4->bx) << b;
|
||||
return u;
|
||||
}
|
||||
static ALWAYS_INLINE uint32_t p4getx32(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx32(in, idx, b);
|
||||
if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx32(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<<cl)), p4->bx) << b;
|
||||
return u;
|
||||
}
|
||||
|
||||
// Get the next single value greater of equal to val
|
||||
static ALWAYS_INLINE uint16_t p4geqx8( struct p4 *p4, unsigned char *in, unsigned b, uint8_t val) { do p4->oval += p4getx8( p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; }
|
||||
static ALWAYS_INLINE uint16_t p4geqx16(struct p4 *p4, unsigned char *in, unsigned b, uint16_t val) { do p4->oval += p4getx16(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; }
|
||||
static ALWAYS_INLINE uint32_t p4geqx32(struct p4 *p4, unsigned char *in, unsigned b, uint32_t val) { do p4->oval += p4getx32(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; }
|
||||
|
||||
/* DO NOT USE : like p4dec32 but using direct access. This is only a demo showing direct access usage. Use p4dec32 instead for decompressing entire blocks */
|
||||
unsigned char *p4decx32( unsigned char *in, unsigned n, uint32_t *out); // unsorted
|
||||
unsigned char *p4fdecx32( unsigned char *in, unsigned n, uint32_t *out, uint32_t start); // FOR increasing
|
||||
unsigned char *p4f1decx32( unsigned char *in, unsigned n, uint32_t *out, uint32_t start); // FOR strictly increasing
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
47
vsimple.h
47
vsimple.h
@ -1,47 +0,0 @@
|
||||
/**
|
||||
Copyright (C) powturbo 2013-2019
|
||||
GPL v2 License
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
- homepage : https://sites.google.com/site/powturbo/
|
||||
- github : https://github.com/powturbo
|
||||
- twitter : https://twitter.com/powturbo
|
||||
- email : powturbo [_AT_] gmail [_DOT_] com
|
||||
**/
|
||||
// "Integer Compression" variable simple "SimpleV"
|
||||
// this belongs to the integer compression known as "simple family", like simple-9,simple-16
|
||||
// or simple-8b. SimpleV is compressing integers in groups into variable word size 32, 40 and 64 bits + RLE (run length encoding)
|
||||
// SimpleV is faster than simple-16 and compress better than simple-16 or simple-8b.
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// vsencNN: compress array with n unsigned (NN bits in[n]) values to the buffer out. Return value = end of compressed output buffer out
|
||||
unsigned char *vsenc8( unsigned char *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
unsigned char *vsenc16(unsigned short *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
unsigned char *vsenc32(unsigned *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
unsigned char *vsenc64(uint64_t *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
|
||||
// vsdecNN: decompress buffer into an array of n unsigned values. Return value = end of compressed input buffer in
|
||||
unsigned char *vsdec8( unsigned char *__restrict in, size_t n, unsigned char *__restrict out);
|
||||
unsigned char *vsdec16(unsigned char *__restrict in, size_t n, unsigned short *__restrict out);
|
||||
unsigned char *vsdec32(unsigned char *__restrict in, size_t n, unsigned *__restrict out);
|
||||
unsigned char *vsdec64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
Reference in New Issue
Block a user