From 59f99b9b3c0d45bc5235cd1d262aa11a129ed724 Mon Sep 17 00:00:00 2001 From: x Date: Fri, 10 Mar 2023 20:50:21 +0100 Subject: [PATCH] . --- bitpack.h | 310 ----------------------------- bitutil.h | 547 ---------------------------------------------------- conf.h | 282 --------------------------- eliasfano.h | 61 ------ fp.h | 125 ------------ sse_neon.h | 355 ---------------------------------- time_.h | 252 ------------------------ transpose.h | 113 ----------- trle.h | 72 ------- vint.h | 401 -------------------------------------- vp4.h | 355 ---------------------------------- vsimple.h | 47 ----- 12 files changed, 2920 deletions(-) delete mode 100644 bitpack.h delete mode 100644 bitutil.h delete mode 100644 conf.h delete mode 100644 eliasfano.h delete mode 100644 fp.h delete mode 100755 sse_neon.h delete mode 100644 time_.h delete mode 100644 transpose.h delete mode 100644 trle.h delete mode 100644 vint.h delete mode 100644 vp4.h delete mode 100644 vsimple.h diff --git a/bitpack.h b/bitpack.h deleted file mode 100644 index e4a2e68..0000000 --- a/bitpack.h +++ /dev/null @@ -1,310 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// bitpack.h - "Integer Compression" Binary Packing header file -#ifndef BITPACK_H_ -#define BITPACK_H_ -#if defined(_MSC_VER) && _MSC_VER < 1600 -#include "vs/stdint.h" -#else -#include -#endif -#include - -#ifdef __cplusplus -extern "C" { -#endif - -//******************** Bit Packing High Level API - n unlimited *************************************************** -size_t bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnpack128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t bitndpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitndpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitndpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitndpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitndpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitndpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t bitnd1pack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnd1pack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnd1pack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnd1pack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnd1pack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnd1pack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnd1pack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t bitnzpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnzpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnzpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnzpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t bitnfpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnfpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -size_t bitnunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitnunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitnunpack128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -size_t bitnunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -size_t bitndunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitndunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitndunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t bitnd1unpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitnd1unpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitnd1unpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -size_t bitnd1unpack128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitnd1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitnd1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t bitnzunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t bitnzunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitnzunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitnzunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t bitnfunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t bitnfunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitnfunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -//******** Bit Packing Low level API **************************************************************** -// bipackNN: Pack array with n unsigned (NN bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out -unsigned char *bitpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b); -unsigned char *bitpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b); -unsigned char *bitpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b); -unsigned char *bitpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b); - -// delta bit packing -unsigned char *bitdpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b); -unsigned char *bitdpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b); -unsigned char *bitdpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b); -unsigned char *bitdpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b); - -unsigned char *bitd1pack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b); -unsigned char *bitd1pack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b); -unsigned char *bitd1pack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b); -unsigned char *bitd1pack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b); - -// FOR bit packing : sorted integer array -unsigned char *bitfpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b); -unsigned char *bitfpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b); -unsigned char *bitfpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b); -unsigned char *bitfpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b); - -unsigned char *bitf1pack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b); -unsigned char *bitf1pack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b); -unsigned char *bitf1pack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b); -unsigned char *bitf1pack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b); - -// zigzag : unsorted integer array -unsigned char *bitzpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint8_t start, unsigned b); -unsigned char *bitzpack16( uint16_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint16_t start, unsigned b); -unsigned char *bitzpack32( uint32_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint32_t start, unsigned b); -unsigned char *bitzpack64( uint64_t *__restrict in, unsigned n, const unsigned char *__restrict out, uint64_t start, unsigned b); - -//-------------------------------------- SIMD ------------------------------------------------------------------------------------------ -// Pack array with 128 unsigned (32 bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out -unsigned char *bitpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); -unsigned char *bitdpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b); -unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b); -unsigned char *bitfpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b); -unsigned char *bitf1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b); -unsigned char *bitzpack128v16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b); - -unsigned char *bitpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); -unsigned char *bitdpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitfpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitf1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitzpack128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); - -//unsigned char *bitpack256w32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); -unsigned char *bitpack128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); - -unsigned char *bitpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); -unsigned char *bitdpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitfpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitzpack256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); - -//********************************** Bit Packing : Unpack **************************************************************** - -// ---------------- Unpack a b-bits packed integer array ------------------------------------------------------------------------------- -// unpack a bitpacked integer array. Return value = end of packed buffer in -unsigned char *bitunpack8( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, unsigned b); -unsigned char *bitunpack16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b); -unsigned char *bitunpack32( const unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b); -unsigned char *bitunpack64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b); - -// ---------------- Direct Access to a single packed integer array entry -------------------------------------------------------------- - #ifdef TURBOPFOR_DAC - #ifdef __AVX2__ -#include -#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_) -#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_) - #else -#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) -#define bzhi32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) - #endif - -#include "conf.h" - -static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); } -//static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; - //return (ctou64((uint32_t *)in+(bidx>>5)) << 32+(bidx&0x1f)) >> (64-b); -// return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); } -static ALWAYS_INLINE unsigned _bitgetx32(const unsigned char *__restrict in, uint64_t bidx, unsigned b) { return bzhi64( ctou64((uint32_t *)in+(bidx>>5)) >> (bidx&0x1f), b ); } - -// like bitgetx32 but for 16 bits integer array -static ALWAYS_INLINE unsigned bitgetx8( const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou16((uint16_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); } -static ALWAYS_INLINE unsigned _bitgetx8( const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou16((uint16_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); } -static ALWAYS_INLINE unsigned bitgetx16(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou32((uint32_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); } -static ALWAYS_INLINE unsigned _bitgetx16(const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou32((uint32_t *)in+(bidx>>4)) >> (bidx& 0xf), b ); } - -// Set a single value with index "idx" -static ALWAYS_INLINE void bitsetx16(const unsigned char *__restrict in, unsigned idx, unsigned v, unsigned b) { unsigned bidx = b*idx; unsigned *p = (unsigned *) in+(bidx>>4) ; *p = ( *p & ~(((1u <>5)); *p = ( *p & ~(((1ull< - #elif defined(__AVX__) -#include - #elif defined(__SSE4_1__) -#include - #elif defined(__SSSE3__) - #ifdef __powerpc64__ -#define __SSE__ 1 -#define __SSE2__ 1 -#define __SSE3__ 1 -#define NO_WARN_X86_INTRINSICS 1 - #endif -#include - #elif defined(__SSE2__) -#include - #elif defined(__ARM_NEON) -#include - #endif - #if defined(_MSC_VER) && _MSC_VER < 1600 -#include "vs/stdint.h" - #else -#include - #endif -#include "sse_neon.h" - - #ifdef __ARM_NEON -#define PREFETCH(_ip_,_rw_) - #else -#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_) - #endif -//------------------------ zigzag encoding ------------------------------------------------------------- -static inline unsigned char zigzagenc8( signed char x) { return x << 1 ^ x >> 7; } -static inline char zigzagdec8( unsigned char x) { return x >> 1 ^ -(x & 1); } - -static inline unsigned short zigzagenc16(short x) { return x << 1 ^ x >> 15; } -static inline short zigzagdec16(unsigned short x) { return x >> 1 ^ -(x & 1); } - -static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; } -static inline int zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); } - -static inline uint64_t zigzagenc64(int64_t x) { return x << 1 ^ x >> 63; } -static inline int64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); } - - #if defined(__SSE2__) || defined(__ARM_NEON) -static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16(v,15)); } -static ALWAYS_INLINE __m128i mm_zzage_epi32(__m128i v) { return _mm_xor_si128( mm_slli_epi32(v,1), mm_srai_epi32(v,31)); } -//static ALWAYS_INLINE __m128i mm_zzage_epi64(__m128i v) { return _mm_xor_si128( mm_slli_epi64(v,1), _mm_srai_epi64(v,63)); } - -static ALWAYS_INLINE __m128i mm_zzagd_epi16(__m128i v) { return _mm_xor_si128( mm_srli_epi16(v,1), mm_srai_epi16( mm_slli_epi16(v,15),15) ); } -static ALWAYS_INLINE __m128i mm_zzagd_epi32(__m128i v) { return _mm_xor_si128( mm_srli_epi32(v,1), mm_srai_epi32( mm_slli_epi32(v,31),31) ); } -//static ALWAYS_INLINE __m128i mm_zzagd_epi64(__m128i v) { return _mm_xor_si128(mm_srli_epi64(v,1), _mm_srai_epi64( m_slli_epi64(v,63),63) ); } - - #endif - #ifdef __AVX2__ -static ALWAYS_INLINE __m256i mm256_zzage_epi32(__m256i v) { return _mm256_xor_si256(_mm256_slli_epi32(v,1), _mm256_srai_epi32(v,31)); } -static ALWAYS_INLINE __m256i mm256_zzagd_epi32(__m256i v) { return _mm256_xor_si256(_mm256_srli_epi32(v,1), _mm256_srai_epi32(_mm256_slli_epi32(v,31),31) ); } - #endif - -//-------------- AVX2 delta + prefix sum (scan) / xor encode/decode --------------------------------------------------------------------------------------- - #ifdef __AVX2__ -static ALWAYS_INLINE __m256i mm256_delta_epi32(__m256i v, __m256i sv) { return _mm256_sub_epi32(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); } -static ALWAYS_INLINE __m256i mm256_delta_epi64(__m256i v, __m256i sv) { return _mm256_sub_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); } -static ALWAYS_INLINE __m256i mm256_xore_epi32( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 12)); } -static ALWAYS_INLINE __m256i mm256_xore_epi64( __m256i v, __m256i sv) { return _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2f128_si256(sv, v, _MM_SHUFFLE(0, 2, 0, 1)), 8)); } - -static ALWAYS_INLINE __m256i mm256_scan_epi32(__m256i v, __m256i sv) { - v = _mm256_add_epi32(v, _mm256_slli_si256(v, 4)); - v = _mm256_add_epi32(v, _mm256_slli_si256(v, 8)); - return _mm256_add_epi32( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11), - _mm256_add_epi32(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20))); -} -static ALWAYS_INLINE __m256i mm256_xord_epi32(__m256i v, __m256i sv) { - v = _mm256_xor_si256(v, _mm256_slli_si256(v, 4)); - v = _mm256_xor_si256(v, _mm256_slli_si256(v, 8)); - return _mm256_xor_si256( _mm256_permute2x128_si256( _mm256_shuffle_epi32(sv,_MM_SHUFFLE(3, 3, 3, 3)), sv, 0x11), - _mm256_xor_si256(v, _mm256_permute2x128_si256(_mm256_setzero_si256(),_mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3)), 0x20))); -} - -static ALWAYS_INLINE __m256i mm256_scan_epi64(__m256i v, __m256i sv) { - v = _mm256_add_epi64(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8)); - return _mm256_add_epi64(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_add_epi64(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) ); -} -static ALWAYS_INLINE __m256i mm256_xord_epi64(__m256i v, __m256i sv) { - v = _mm256_xor_si256(v, _mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), 8)); - return _mm256_xor_si256(_mm256_permute4x64_epi64(sv, _MM_SHUFFLE(3, 3, 3, 3)), _mm256_xor_si256(_mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0, 0, 2, 0)), v) ); -} - -static ALWAYS_INLINE __m256i mm256_scani_epi32(__m256i v, __m256i sv, __m256i vi) { return _mm256_add_epi32(mm256_scan_epi32(v, sv), vi); } - #endif - - #if defined(__SSSE3__) || defined(__ARM_NEON) -static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_alignr_epi8(v, sv, 14)); } -static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_alignr_epi8(v, sv, 12)); } -static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 14)); } -static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_alignr_epi8(v, sv, 12)); } - -#define MM_HDEC_EPI32(_v_,_sv_,_hop_) { _v_ = _hop_(_v_, _mm_slli_si128(_v_, 4)); _v_ = _hop_(mm_shuffle_nnnn_epi32(_sv_, 3), _hop_(_mm_slli_si128(_v_, 8), _v_)); } -static ALWAYS_INLINE __m128i mm_scan_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_add_epi32); return v; } -static ALWAYS_INLINE __m128i mm_xord_epi32(__m128i v, __m128i sv) { MM_HDEC_EPI32(v,sv,_mm_xor_si128); return v; } - -#define MM_HDEC_EPI16(_v_,_sv_,_hop_) {\ - _v_ = _hop_( _v_, _mm_slli_si128(_v_, 2));\ - _v_ = _hop_( _v_, _mm_slli_si128(_v_, 4));\ - _v_ = _hop_(_hop_(_v_, _mm_slli_si128(_v_, 8)), _mm_shuffle_epi8(_sv_, _mm_set1_epi16(0x0f0e)));\ -} - -static ALWAYS_INLINE __m128i mm_scan_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_add_epi16); return v; } -static ALWAYS_INLINE __m128i mm_xord_epi16(__m128i v, __m128i sv) { MM_HDEC_EPI16(v,sv,_mm_xor_si128); return v; } -//-------- scan with vi delta > 0 ----------------------------- -static ALWAYS_INLINE __m128i mm_scani_epi16(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi16(mm_scan_epi16(v, sv), vi); } -static ALWAYS_INLINE __m128i mm_scani_epi32(__m128i v, __m128i sv, __m128i vi) { return _mm_add_epi32(mm_scan_epi32(v, sv), vi); } - - #elif defined(__SSE2__) -static ALWAYS_INLINE __m128i mm_delta_epi16(__m128i v, __m128i sv) { return _mm_sub_epi16(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); } -static ALWAYS_INLINE __m128i mm_xore_epi16( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 14), _mm_slli_si128(v, 2))); } -static ALWAYS_INLINE __m128i mm_delta_epi32(__m128i v, __m128i sv) { return _mm_sub_epi32(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); } -static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_xor_si128(v, _mm_or_si128(_mm_srli_si128(sv, 12), _mm_slli_si128(v, 4))); } - #endif - -#if !defined(_M_X64) && !defined(__x86_64__) && defined(__AVX__) -#define _mm256_extract_epi64(v, index) ((__int64)((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2) | (((uint64_t)(uint32_t)_mm256_extract_epi32((v), (index) * 2 + 1)) << 32))) -#endif - -//------------------ Horizontal OR ----------------------------------------------- - #ifdef __AVX2__ -static ALWAYS_INLINE unsigned mm256_hor_epi32(__m256i v) { - v = _mm256_or_si256(v, _mm256_srli_si256(v, 8)); - v = _mm256_or_si256(v, _mm256_srli_si256(v, 4)); - return _mm256_extract_epi32(v,0) | _mm256_extract_epi32(v, 4); -} - -static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) { - v = _mm256_or_si256(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(2, 0, 0, 1))); - return _mm256_extract_epi64(v, 1) | _mm256_extract_epi64(v,0); -} - #endif - - #if defined(__SSE2__) || defined(__ARM_NEON) -#define MM_HOZ_EPI16(v,_hop_) {\ - v = _hop_(v, _mm_srli_si128(v, 8));\ - v = _hop_(v, _mm_srli_si128(v, 6));\ - v = _hop_(v, _mm_srli_si128(v, 4));\ - v = _hop_(v, _mm_srli_si128(v, 2));\ -} - -#define MM_HOZ_EPI32(v,_hop_) {\ - v = _hop_(v, _mm_srli_si128(v, 8));\ - v = _hop_(v, _mm_srli_si128(v, 4));\ -} - -static ALWAYS_INLINE uint16_t mm_hor_epi16( __m128i v) { MM_HOZ_EPI16(v,_mm_or_si128); return (unsigned short)_mm_cvtsi128_si32(v); } -static ALWAYS_INLINE uint32_t mm_hor_epi32( __m128i v) { MM_HOZ_EPI32(v,_mm_or_si128); return (unsigned )_mm_cvtsi128_si32(v); } -static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _mm_srli_si128(v, 8)); return (uint64_t )_mm_cvtsi128_si64(v); } - #endif - -//----------------- sub / add ---------------------------------------------------------- - #if defined(__SSE2__) || defined(__ARM_NEON) -#define SUBI16x8(_v_, _sv_) _mm_sub_epi16(_v_, _sv_) -#define SUBI32x4(_v_, _sv_) _mm_sub_epi32(_v_, _sv_) -#define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_) -#define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_) - -//---------------- Convert mm_cvtsi128_siXX ------------------------------------------- -static ALWAYS_INLINE uint8_t mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); } -static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); } - #endif - -//--------- memset ----------------------------------------- -#define BITFORSET_(_out_, _n_, _start_, _mindelta_) do { unsigned _i;\ - for(_i = 0; _i != (_n_&~3); _i+=4) { \ - _out_[_i+0] = _start_+(_i )*_mindelta_; \ - _out_[_i+1] = _start_+(_i+1)*_mindelta_; \ - _out_[_i+2] = _start_+(_i+2)*_mindelta_; \ - _out_[_i+3] = _start_+(_i+3)*_mindelta_; \ - } \ - while(_i != _n_) \ - _out_[_i] = _start_+_i*_mindelta_, ++_i; \ -} while(0) - -//--------- SIMD zero ----------------------------------------- - #ifdef __AVX2__ -#define BITZERO32(_out_, _n_, _start_) do {\ - __m256i _sv_ = _mm256_set1_epi32(_start_), *_ov = (__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\ - do _mm256_storeu_si256(_ov++, _sv_); while(_ov < _ove);\ -} while(0) - -#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) do {\ - __m256i _sv = _mm256_set1_epi32(_start_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_), _cv = _mm256_set_epi32(7+_mindelta_,6+_mindelta_,5+_mindelta_,4+_mindelta_,3*_mindelta_,2*_mindelta_,1*_mindelta_,0); \ - _sv = _mm256_add_epi32(_sv, _cv);\ - _cv = _mm256_set1_epi32(4);\ - do { _mm256_storeu_si256(_ov++, _sv); _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\ -} while(0) - -#define BITDIZERO32(_out_, _n_, _start_, _mindelta_) do { __m256i _sv = _mm256_set1_epi32(_start_), _cv = _mm256_set_epi32(7+_mindelta_,6+_mindelta_,5+_mindelta_,4+_mindelta_,3+_mindelta_,2+_mindelta_,1+_mindelta_,_mindelta_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\ - _sv = _mm256_add_epi32(_sv, _cv); _cv = _mm256_set1_epi32(4*_mindelta_); do { _mm256_storeu_si256(_ov++, _sv), _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\ -} while(0) - - #elif defined(__SSE2__) || defined(__ARM_NEON) // ------------- -// SIMD set value (memset) -#define BITZERO32(_out_, _n_, _v_) do {\ - __m128i _sv_ = _mm_set1_epi32(_v_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\ - do _mm_storeu_si128(_ov++, _sv_); while(_ov < _ove); \ -} while(0) - -#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) do {\ - __m128i _sv = _mm_set1_epi32(_start_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_), _cv = _mm_set_epi32(3*_mindelta_,2*_mindelta_,1*_mindelta_,0); \ - _sv = _mm_add_epi32(_sv, _cv);\ - _cv = _mm_set1_epi32(4);\ - do { _mm_storeu_si128(_ov++, _sv); _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\ -} while(0) - -#define BITDIZERO32(_out_, _n_, _start_, _mindelta_) do { __m128i _sv = _mm_set1_epi32(_start_), _cv = _mm_set_epi32(3+_mindelta_,2+_mindelta_,1+_mindelta_,_mindelta_), *_ov=(__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\ - _sv = _mm_add_epi32(_sv, _cv); _cv = _mm_set1_epi32(4*_mindelta_); do { _mm_storeu_si128(_ov++, _sv), _sv = _mm_add_epi32(_sv, _cv); } while(_ov < _ove);\ -} while(0) - #else -#define BITFORZERO32(_out_, _n_, _start_, _mindelta_) BITFORSET_(_out_, _n_, _start_, _mindelta_) -#define BITZERO32( _out_, _n_, _start_) BITFORSET_(_out_, _n_, _start_, 0) - #endif - -#define DELTR( _in_, _n_, _start_, _mindelta_, _out_) { unsigned _v; for( _v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_); } -#define DELTRB(_in_, _n_, _start_, _mindelta_, _b_, _out_) { unsigned _v; for(_b_=0,_v = 0; _v < _n_; _v++) _out_[_v] = _in_[_v] - (_start_) - _v*(_mindelta_) - (_mindelta_), _b_ |= _out_[_v]; _b_ = bsr32(_b_); } - -//----------------------------------------- bitreverse scalar + SIMD ------------------------------------------- - #if __clang__ && defined __has_builtin - #if __has_builtin(__builtin_bitreverse64) -#define BUILTIN_BITREVERSE - #else -#define BUILTIN_BITREVERSE - #endif - #endif - #ifdef BUILTIN_BITREVERSE -#define rbit8(x) __builtin_bitreverse8( x) -#define rbit16(x) __builtin_bitreverse16(x) -#define rbit32(x) __builtin_bitreverse32(x) -#define rbit64(x) __builtin_bitreverse64(x) - #else - - #if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u) -static ALWAYS_INLINE uint32_t _rbit_(uint32_t x) { uint32_t rc; __asm volatile ("rbit %0, %1" : "=r" (rc) : "r" (x) ); } - #endif -static ALWAYS_INLINE uint8_t rbit8(uint8_t x) { - #if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u) - return _rbit_(x) >> 24; - #elif 0 - x = (x & 0xaa) >> 1 | (x & 0x55) << 1; - x = (x & 0xcc) >> 2 | (x & 0x33) << 2; - return x << 4 | x >> 4; - #else - return (x * 0x0202020202ull & 0x010884422010ull) % 1023; - #endif -} - -static ALWAYS_INLINE uint16_t rbit16(uint16_t x) { - #if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u) - return _rbit_(x) >> 16; - #else - x = (x & 0xaaaa) >> 1 | (x & 0x5555) << 1; - x = (x & 0xcccc) >> 2 | (x & 0x3333) << 2; - x = (x & 0xf0f0) >> 4 | (x & 0x0f0f) << 4; - return x << 8 | x >> 8; - #endif -} - -static ALWAYS_INLINE uint32_t rbit32(uint32_t x) { - #if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u) - return _rbit_(x); - #else - x = ((x & 0xaaaaaaaa) >> 1 | (x & 0x55555555) << 1); - x = ((x & 0xcccccccc) >> 2 | (x & 0x33333333) << 2); - x = ((x & 0xf0f0f0f0) >> 4 | (x & 0x0f0f0f0f) << 4); - x = ((x & 0xff00ff00) >> 8 | (x & 0x00ff00ff) << 8); - return x << 16 | x >> 16; - #endif -} -static ALWAYS_INLINE uint64_t rbit64(uint64_t x) { - #if (__CORTEX_M >= 0x03u) || (__CORTEX_SC >= 300u) - return (uint64_t)_rbit_(x) << 32 | _rbit_(x >> 32); - #else - x = (x & 0xaaaaaaaaaaaaaaaa) >> 1 | (x & 0x5555555555555555) << 1; - x = (x & 0xcccccccccccccccc) >> 2 | (x & 0x3333333333333333) << 2; - x = (x & 0xf0f0f0f0f0f0f0f0) >> 4 | (x & 0x0f0f0f0f0f0f0f0f) << 4; - x = (x & 0xff00ff00ff00ff00) >> 8 | (x & 0x00ff00ff00ff00ff) << 8; - x = (x & 0xffff0000ffff0000) >> 16 | (x & 0x0000ffff0000ffff) << 16; - return x << 32 | x >> 32; - #endif -} - #endif - - #if defined(__SSSE3__) || defined(__ARM_NEON) -static ALWAYS_INLINE __m128i mm_rbit_epi16(__m128i v) { return mm_rbit_epi8(mm_rev_epi16(v)); } -static ALWAYS_INLINE __m128i mm_rbit_epi32(__m128i v) { return mm_rbit_epi8(mm_rev_epi32(v)); } -static ALWAYS_INLINE __m128i mm_rbit_epi64(__m128i v) { return mm_rbit_epi8(mm_rev_epi64(v)); } -//static ALWAYS_INLINE __m128i mm_rbit_si128(__m128i v) { return mm_rbit_epi8(mm_rev_si128(v)); } - #endif - - #ifdef __AVX2__ -static ALWAYS_INLINE __m256i mm256_rbit_epi8(__m256i v) { - __m256i fv = _mm256_setr_epi8(0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15, 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15), cv0f_8 = _mm256_set1_epi8(0xf); - __m256i lv = _mm256_shuffle_epi8(fv,_mm256_and_si256( v, cv0f_8)); - __m256i hv = _mm256_shuffle_epi8(fv,_mm256_and_si256(_mm256_srli_epi64(v, 4), cv0f_8)); - return _mm256_or_si256(_mm256_slli_epi64(lv,4), hv); -} - -static ALWAYS_INLINE __m256i mm256_rev_epi16(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14)); } -static ALWAYS_INLINE __m256i mm256_rev_epi32(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 3, 2, 1, 0, 7, 6, 5, 4, 11,10, 9, 8,15,14,13,12, 3, 2, 1, 0, 7, 6, 5, 4, 11,10, 9, 8,15,14,13,12)); } -static ALWAYS_INLINE __m256i mm256_rev_epi64(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8( 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8)); } -static ALWAYS_INLINE __m256i mm256_rev_si128(__m256i v) { return _mm256_shuffle_epi8(v, _mm256_setr_epi8(15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); } - -static ALWAYS_INLINE __m256i mm256_rbit_epi16(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi16(v)); } -static ALWAYS_INLINE __m256i mm256_rbit_epi32(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi32(v)); } -static ALWAYS_INLINE __m256i mm256_rbit_epi64(__m256i v) { return mm256_rbit_epi8(mm256_rev_epi64(v)); } -static ALWAYS_INLINE __m256i mm256_rbit_si128(__m256i v) { return mm256_rbit_epi8(mm256_rev_si128(v)); } - #endif - -// ------------------ bitio genaral macros --------------------------- - #ifdef __AVX2__ - #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) -#include - #else -#include - #endif -#define bzhi_u32(_u_, _b_) _bzhi_u32(_u_, _b_) - - #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86)) -#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) - #else -#define bzhi_u64(_u_, _b_) _bzhi_u64(_u_, _b_) - #endif - #else -#define bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) -#define bzhi_u32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) - #endif - -#define BZHI64(_u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1))) -#define BZHI32(_u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1))) - -#define bitdef( _bw_,_br_) uint64_t _bw_=0; unsigned _br_=0 -#define bitini( _bw_,_br_) _bw_=_br_=0 -//-- bitput --------- -#define bitput( _bw_,_br_,_nb_,_x_) (_bw_) += (uint64_t)(_x_) << (_br_), (_br_) += (_nb_) -#define bitenorm( _bw_,_br_,_op_) ctou64(_op_) = _bw_; _op_ += ((_br_)>>3), (_bw_) >>=((_br_)&~7), (_br_) &= 7 -#define bitflush( _bw_,_br_,_op_) ctou64(_op_) = _bw_, _op_ += ((_br_)+7)>>3, _bw_=_br_=0 -//-- bitget --------- -#define bitbw( _bw_,_br_) ((_bw_)>>(_br_)) -#define bitrmv( _bw_,_br_,_nb_) (_br_) += _nb_ - -#define bitdnorm( _bw_,_br_,_ip_) _bw_ = ctou64((_ip_) += ((_br_)>>3)), (_br_) &= 7 -#define bitalign( _bw_,_br_,_ip_) ((_ip_) += ((_br_)+7)>>3) - -#define BITPEEK32( _bw_,_br_,_nb_) BZHI32(bitbw(_bw_,_br_), _nb_) -#define BITGET32( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK32(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_) -#define BITPEEK64( _bw_,_br_,_nb_) BZHI64(bitbw(_bw_,_br_), _nb_) -#define BITGET64( _bw_,_br_,_nb_,_x_) _x_ = BITPEEK64(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_) - -#define bitpeek57( _bw_,_br_,_nb_) bzhi_u64(bitbw(_bw_,_br_), _nb_) -#define bitget57( _bw_,_br_,_nb_,_x_) _x_ = bitpeek57(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_) -#define bitpeek31( _bw_,_br_,_nb_) bzhi_u32(bitbw(_bw_,_br_), _nb_) -#define bitget31( _bw_,_br_,_nb_,_x_) _x_ = bitpeek31(_bw_, _br_, _nb_), bitrmv(_bw_, _br_, _nb_) -//------------------ templates ----------------------------------- -#define bitput8( _bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_) -#define bitput16(_bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_) -#define bitput32(_bw_,_br_,_b_,_x_,_op_) bitput(_bw_,_br_,_b_,_x_) -#define bitput64(_bw_,_br_,_b_,_x_,_op_) if((_b_)>45) { bitput(_bw_,_br_,(_b_)-32, (_x_)>>32); bitenorm(_bw_,_br_,_op_); bitput(_bw_,_br_,32,(unsigned)(_x_)); } else bitput(_bw_,_br_,_b_,_x_) - -#define bitget8( _bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_) -#define bitget16(_bw_,_br_,_b_,_x_,_ip_) bitget31(_bw_,_br_,_b_,_x_) -#define bitget32(_bw_,_br_,_b_,_x_,_ip_) bitget57(_bw_,_br_,_b_,_x_) -#define bitget64(_bw_,_br_,_b_,_x_,_ip_) if((_b_)>45) { unsigned _v; bitget57(_bw_,_br_,(_b_)-32,_x_); bitdnorm(_bw_,_br_,_ip_); BITGET64(_bw_,_br_,32,_v); _x_ = _x_<<32|_v; } else bitget57(_bw_,_br_,_b_,_x_) -#endif - -//---------- max. bit length + transform for sorted/unsorted arrays, delta,delta 1, delta > 1, zigzag, zigzag of delta, xor, FOR,---------------- -#ifdef __cplusplus -extern "C" { -#endif -//------ ORed array, used to determine the maximum bit length of the elements in an unsorted integer array --------------------- -uint8_t bit8( uint8_t *in, unsigned n, uint8_t *px); -uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px); -uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px); -uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px); - -//-------------- delta = 0: Sorted integer array w/ mindelta = 0 ---------------------------------------------- -//-- ORed array, maximum bit length of the non decreasing integer array. out[i] = in[i] - in[i-1] -uint8_t bitd8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start); -uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start); -uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start); - -//-- in-place reverse delta 0 -void bitddec8( uint8_t *p, unsigned n, uint8_t start); // non decreasing (out[i] = in[i] - in[i-1]) -void bitddec16( uint16_t *p, unsigned n, uint16_t start); -void bitddec32( uint32_t *p, unsigned n, uint32_t start); -void bitddec64( uint64_t *p, unsigned n, uint64_t start); - -//-- vectorized fast delta4 one: out[0] = in[4]-in[0], out[1]=in[5]-in[1], out[2]=in[6]-in[2], out[3]=in[7]-in[3],... -uint16_t bits128v16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bits128v32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start); - -//------------- delta = 1: Sorted integer array w/ mindelta = 1 --------------------------------------------- -//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1 -uint8_t bitd18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start); -uint16_t bitd116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start); -uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start); - -//-- in-place reverse delta one -void bitd1dec8( uint8_t *p, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1) -void bitd1dec16( uint16_t *p, unsigned n, uint16_t start); -void bitd1dec32( uint32_t *p, unsigned n, uint32_t start); -void bitd1dec64( uint64_t *p, unsigned n, uint64_t start); - -//------------- delta > 1: Sorted integer array w/ mindelta > 1 --------------------------------------------- -//-- ORed array, for max. bit length get min. delta () -uint8_t bitdi8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start); -uint16_t bitdi16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bitdi32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start); -uint64_t bitdi64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start); -//-- transform sorted integer array to delta array: out[i] = in[i] - in[i-1] - mindelta -uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta); -uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta); -uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta); -uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta); -//-- in-place reverse delta -void bitdidec8( uint8_t *in, unsigned n, uint8_t start, uint8_t mindelta); -void bitdidec16(uint16_t *in, unsigned n, uint16_t start, uint16_t mindelta); -void bitdidec32(uint32_t *in, unsigned n, uint32_t start, uint32_t mindelta); -void bitdidec64(uint64_t *in, unsigned n, uint64_t start, uint64_t mindelta); - -//------------- FOR : array bit length: --------------------------------------------------------------------- -//------ ORed array, for max. bit length of the non decreasing integer array. out[i] = in[i] - start -uint8_t bitf8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start); -uint16_t bitf16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bitf32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start); -uint64_t bitf64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start); - -//------ ORed array, for max. bit length of the non strictly decreasing integer array out[i] = in[i] - 1 - start -uint8_t bitf18( uint8_t *in, unsigned n, uint8_t *px, uint8_t start); -uint16_t bitf116(uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bitf132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start); -uint64_t bitf164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start); - -//------ ORed array, for max. bit length for usorted array -uint8_t bitfm8( uint8_t *in, unsigned n, uint8_t *px, uint8_t *pmin); // unsorted -uint16_t bitfm16(uint16_t *in, unsigned n, uint16_t *px, uint16_t *pmin); -uint32_t bitfm32(uint32_t *in, unsigned n, uint32_t *px, uint32_t *pmin); -uint64_t bitfm64(uint64_t *in, unsigned n, uint64_t *px, uint64_t *pmin); - -//------------- Zigzag encoding for unsorted integer lists: out[i] = in[i] - in[i-1] ------------------------ -//-- ORed array, to get maximum zigzag bit length integer array -uint8_t bitz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start); -uint16_t bitz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bitz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start); -uint64_t bitz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start); -//-- Zigzag transform -uint8_t bitzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta); -uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta); -uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta); -uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta); -//-- in-place zigzag reverse transform -void bitzdec8( uint8_t *in, unsigned n, uint8_t start); -void bitzdec16( uint16_t *in, unsigned n, uint16_t start); -void bitzdec32( uint32_t *in, unsigned n, uint32_t start); -void bitzdec64( uint64_t *in, unsigned n, uint64_t start); - -//------------- Zigzag of zigzag/delta : unsorted/sorted integer array ---------------------------------------------------- -//-- get delta maximum bit length of the non strictly decreasing integer array. out[i] = in[i] - in[i-1] - 1 -uint8_t bitzz8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start); -uint16_t bitzz16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bitzz32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start); -uint64_t bitzz64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start); - -uint8_t bitzzenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uint8_t mindelta); -uint16_t bitzzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta); -uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta); -uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta); - -//-- in-place reverse zigzag of delta (encoded w/ bitdiencNN and parameter mindelta = 1) -void bitzzdec8( uint8_t *in, unsigned n, uint8_t start); // non strictly decreasing (out[i] = in[i] - in[i-1] - 1) -void bitzzdec16( uint16_t *in, unsigned n, uint16_t start); -void bitzzdec32( uint32_t *in, unsigned n, uint32_t start); -void bitzzdec64( uint64_t *in, unsigned n, uint64_t start); - -//------------- XOR encoding for unsorted integer lists: out[i] = in[i] - in[i-1] ------------- -//-- ORed array, to get maximum zigzag bit length integer array -uint8_t bitx8( uint8_t *in, unsigned n, uint8_t *px, uint8_t start); -uint16_t bitx16( uint16_t *in, unsigned n, uint16_t *px, uint16_t start); -uint32_t bitx32( uint32_t *in, unsigned n, uint32_t *px, uint32_t start); -uint64_t bitx64( uint64_t *in, unsigned n, uint64_t *px, uint64_t start); - -//-- XOR transform -uint8_t bitxenc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start); -uint16_t bitxenc16( uint16_t *in, unsigned n, uint16_t *out, uint16_t start); -uint32_t bitxenc32( uint32_t *in, unsigned n, uint32_t *out, uint32_t start); -uint64_t bitxenc64( uint64_t *in, unsigned n, uint64_t *out, uint64_t start); - -//-- XOR in-place reverse transform -void bitxdec8( uint8_t *p, unsigned n, uint8_t start); -void bitxdec16( uint16_t *p, unsigned n, uint16_t start); -void bitxdec32( uint32_t *p, unsigned n, uint32_t start); -void bitxdec64( uint64_t *p, unsigned n, uint64_t start); - -//------- Lossy floating point transform: pad the trailing mantissa bits with zeros according to the error e (ex. e=0.00001) - #ifdef USE_FLOAT16 -void fppad16(_Float16 *in, size_t n, _Float16 *out, float e); - #endif -void fppad32(float *in, size_t n, float *out, float e); -void fppad64(double *in, size_t n, double *out, double e); - -#ifdef __cplusplus -} -#endif - -//---- Floating point to Integer decomposition --------------------------------- -// seeeeeeee21098765432109876543210 (s:sign, e:exponent, 0-9:mantissa) - #ifdef BITUTIL_IN -#define MANTF32 23 -#define MANTF64 52 - -#define BITFENC(_u_, _sgn_, _expo_, _mant_, _mantbits_, _one_) _sgn_ = _u_ >> (sizeof(_u_)*8-1); _expo_ = ((_u_ >> (_mantbits_)) & ( (_one_<<(sizeof(_u_)*8 - 1 - _mantbits_)) -1)); _mant_ = _u_ & ((_one_<<_mantbits_)-1); -#define BITFDEC( _sgn_, _expo_, _mant_, _u_, _mantbits_) _u_ = (_sgn_) << (sizeof(_u_)*8-1) | (_expo_) << _mantbits_ | (_mant_) - #endif diff --git a/conf.h b/conf.h deleted file mode 100644 index be6face..0000000 --- a/conf.h +++ /dev/null @@ -1,282 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ - -// conf.h - config & common -#ifndef CONF_H -#define CONF_H -//------------------------- Compiler ------------------------------------------ - #if defined(__GNUC__) -#include -#define ALIGNED(t,v,n) t v __attribute__ ((aligned (n))) -#define ALWAYS_INLINE inline __attribute__((always_inline)) -#define NOINLINE __attribute__((noinline)) -#define _PACKED __attribute__ ((packed)) -#define likely(x) __builtin_expect((x),1) -#define unlikely(x) __builtin_expect((x),0) - -#define popcnt32(_x_) __builtin_popcount(_x_) -#define popcnt64(_x_) __builtin_popcountll(_x_) - - #if defined(__i386__) || defined(__x86_64__) -//x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5 -// x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6, -static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; } -static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; } -static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; } -static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); } - -static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } -static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } -static inline uint64_t rol64(uint64_t x, int s) { asm ("rolq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } -static inline uint64_t ror64(uint64_t x, int s) { asm ("rorq %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } - #else -static inline int __bsr32(unsigned x ) { return 31 - __builtin_clz( x); } -static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; } -static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; } - -static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); } -static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); } -static inline unsigned rol64(unsigned x, int s) { return x << s | x >> (64 - s); } -static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s); } - #endif - -#define ctz64(_x_) __builtin_ctzll(_x_) -#define ctz32(_x_) __builtin_ctz(_x_) // 0:32 ctz32(1< 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8 -#define bswap16(x) __builtin_bswap16(x) - #else -static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32(x << 16); } - #endif -#define bswap32(x) __builtin_bswap32(x) -#define bswap64(x) __builtin_bswap64(x) - - #elif _MSC_VER //---------------------------------------------------- -#include -#include - #if _MSC_VER < 1600 -#include "vs/stdint.h" -#define __builtin_prefetch(x,a) -#define inline __inline - #else -#include -#define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA) - #endif - -#define ALIGNED(t,v,n) __declspec(align(n)) t v -#define ALWAYS_INLINE __forceinline -#define NOINLINE __declspec(noinline) -#define THREADLOCAL __declspec(thread) -#define likely(x) (x) -#define unlikely(x) (x) - -static inline int __bsr32(unsigned x) { unsigned long z=0; _BitScanReverse(&z, x); return z; } -static inline int bsr32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?z+1:0; } -static inline int ctz32( unsigned x) { unsigned long z; _BitScanForward(&z, x); return x?z:32; } -static inline int clz32( unsigned x) { unsigned long z; _BitScanReverse(&z, x); return x?31-z:32; } - #if !defined(_M_ARM64) && !defined(_M_X64) -static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) { - unsigned long x0 = (unsigned long)x, top, bottom; _BitScanForward(&top, (unsigned long)(x >> 32)); _BitScanForward(&bottom, x0); - *ret = x0 ? bottom : 32 + top; return x != 0; -} -static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) { - unsigned long x1 = (unsigned long)(x >> 32), top, bottom; _BitScanReverse(&top, x1); _BitScanReverse(&bottom, (unsigned long)x); - *ret = x1 ? top + 32 : bottom; return x != 0; -} - #endif -static inline int bsr64(uint64_t x) { unsigned long z=0; _BitScanReverse64(&z, x); return x?z+1:0; } -static inline int ctz64(uint64_t x) { unsigned long z; _BitScanForward64(&z, x); return x?z:64; } -static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x); return x?63-z:64; } - -#define rol32(x,s) _lrotl(x, s) -#define ror32(x,s) _lrotr(x, s) - -#define bswap16(x) _byteswap_ushort(x) -#define bswap32(x) _byteswap_ulong(x) -#define bswap64(x) _byteswap_uint64(x) - -#define popcnt32(x) __popcnt(x) - #ifdef _WIN64 -#define popcnt64(x) __popcnt64(x) - #else -#define popcnt64(x) (popcnt32(x) + popcnt32(x>>32)) - #endif - -#define sleep(x) Sleep(x/1000) -#define fseeko _fseeki64 -#define ftello _ftelli64 -#define strcasecmp _stricmp -#define strncasecmp _strnicmp -#define strtoull _strtoui64 -static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); } - #endif - -#define __bsr8(_x_) __bsr32(_x_) -#define __bsr16(_x_) __bsr32(_x_) -#define bsr8(_x_) bsr32(_x_) -#define bsr16(_x_) bsr32(_x_) -#define ctz8(_x_) ctz32(_x_) -#define ctz16(_x_) ctz32(_x_) -#define clz8(_x_) (clz32(_x_)-24) -#define clz16(_x_) (clz32(_x_)-16) - -#define popcnt8(x) popcnt32(x) -#define popcnt16(x) popcnt32(x) - -//--------------- Unaligned memory access ------------------------------------- - #ifdef UA_MEMCPY -#include -static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; } -static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; } -static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; } -static inline size_t ctousz(const void *cp) { size_t x; memcpy(&x, cp, sizeof(x)); return x; } -static inline float ctof32(const void *cp) { float x; memcpy(&x, cp, sizeof(x)); return x; } -static inline double ctof64(const void *cp) { double x; memcpy(&x, cp, sizeof(x)); return x; } - -static inline void stou16( void *cp, unsigned short x) { memcpy(cp, &x, sizeof(x)); } -static inline void stou32( void *cp, unsigned x) { memcpy(cp, &x, sizeof(x)); } -static inline void stou64( void *cp, unsigned long long x) { memcpy(cp, &x, sizeof(x)); } -static inline void stousz( void *cp, size_t x) { memcpy(cp, &x, sizeof(x)); } -static inline void stof32( void *cp, float x) { memcpy(cp, &x, sizeof(x)); } -static inline void stof64( void *cp, double x) { memcpy(cp, &x, sizeof(x)); } - #elif defined(__i386__) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || _MSC_VER ||\ - defined(__powerpc__) || defined(__s390__) ||\ - defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\ - defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \ - defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \ - defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) -#define ctou16(_cp_) (*(unsigned short *)(_cp_)) -#define ctou32(_cp_) (*(unsigned *)(_cp_)) -#define ctof32(_cp_) (*(float *)(_cp_)) - - #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__s390__) || defined(_MSC_VER) -#define ctou64(_cp_) (*(uint64_t *)(_cp_)) -#define ctof64(_cp_) (*(double *)(_cp_)) - #elif defined(__ARM_FEATURE_UNALIGNED) -struct _PACKED longu { uint64_t l; }; -struct _PACKED doubleu { double d; }; -#define ctou64(_cp_) ((struct longu *)(_cp_))->l -#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d - #endif - - #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__) -struct _PACKED shortu { unsigned short s; }; -struct _PACKED unsignedu { unsigned u; }; -struct _PACKED longu { uint64_t l; }; -struct _PACKED floatu { float f; }; -struct _PACKED doubleu { double d; }; - -#define ctou16(_cp_) ((struct shortu *)(_cp_))->s -#define ctou32(_cp_) ((struct unsignedu *)(_cp_))->u -#define ctou64(_cp_) ((struct longu *)(_cp_))->l -#define ctof32(_cp_) ((struct floatu *)(_cp_))->f -#define ctof64(_cp_) ((struct doubleu *)(_cp_))->d - #else -#error "unknown cpu" - #endif - -#define ctou24(_cp_) (ctou32(_cp_) & 0xffffff) -#define ctou48(_cp_) (ctou64(_cp_) & 0xffffffffffffull) -#define ctou8(_cp_) (*(_cp_)) -//--------------------- wordsize ---------------------------------------------- - #if defined(__64BIT__) || defined(_LP64) || defined(__LP64__) || defined(_WIN64) ||\ - defined(__x86_64__) || defined(_M_X64) ||\ - defined(__ia64) || defined(_M_IA64) ||\ - defined(__aarch64__) ||\ - defined(__mips64) ||\ - defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\ - defined(__s390x__) -#define __WORDSIZE 64 - #else -#define __WORDSIZE 32 - #endif -#endif - -//---------------------misc --------------------------------------------------- -#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // _b_ < 64 -#define BZHI32F(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) // _b_ < 32 -#define BZHI64( _u_, _b_) (_b_ == 64?0xffffffffffffffffull:((_u_) & ((1ull<<(_b_))-1))) // Constant -#define BZHI32( _u_, _b_) (_b_ == 32? 0xffffffffu :((_u_) & ((1u <<(_b_))-1))) -#define BZHI16( _u_, _b_) BZHI32(_u_, _b_) -#define BZHI8( _u_, _b_) BZHI32(_u_, _b_) - - #ifdef __AVX2__ - #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) -#include - #else -#include - #endif -#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_) - - #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86)) -#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) - #else -#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_) - #endif - #else -#define bzhi_u64(_u_, _b_) BZHI64(_u_, _b_) -#define bzhi_u32(_u_, _b_) BZHI32(_u_, _b_) - #endif - -#define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1)) -#define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1))) - -#define TEMPLATE2_(_x_, _y_) _x_##_y_ -#define TEMPLATE2(_x_, _y_) TEMPLATE2_(_x_,_y_) - -#define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_ -#define TEMPLATE3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_) - -#define CACHE_LINE_SIZE 64 -#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4) - -#define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_))) - -//--- NDEBUG ------- -#include - #ifdef _MSC_VER - #ifdef NDEBUG -#define AS(expr, fmt, ...) -#define AC(expr, fmt, ...) do { if(!(expr)) { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0) -#define die(fmt, ...) do { fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0) - #else -#define AS(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0) -#define AC(expr, fmt, ...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); abort(); } } while(0) -#define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ##__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0) - #endif - #else - #ifdef NDEBUG -#define AS(expr, fmt,args...) -#define AC(expr, fmt,args...) do { if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0) -#define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) - #else -#define AS(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0) -#define AC(expr, fmt,args...) do { if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } } while(0) -#define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) - #endif - #endif diff --git a/eliasfano.h b/eliasfano.h deleted file mode 100644 index 52584dc..0000000 --- a/eliasfano.h +++ /dev/null @@ -1,61 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// eliasfano.h - "Integer Compression" Elias Fano c/c++ header -#ifdef __cplusplus -extern "C" { -#endif -#if defined(_MSC_VER) && _MSC_VER < 1600 -#include "vs/stdint.h" -#else -#include -#endif - -// compress/decompress integer array with n values to the buffer out. Return value = end of output/input buffer -unsigned char *efanoenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efanoenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *efanodec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *efanodec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -unsigned char *efano1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efano1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *efano1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *efano1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -unsigned char *efanoenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efanodec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -unsigned char *efano1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efano1dec128v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -unsigned char *efanoenc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efanodec256v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -unsigned char *efano1enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efano1dec256v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -#ifdef __cplusplus -} -#endif diff --git a/fp.h b/fp.h deleted file mode 100644 index f50d43f..0000000 --- a/fp.h +++ /dev/null @@ -1,125 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Floating Point + Integer Compression" -#ifdef __cplusplus -extern "C" { -#endif -#if defined(_MSC_VER) && _MSC_VER < 1600 -#include "vs/stdint.h" -#else -#include -#endif - -// ---------- TurboPFor Zigzag of delta (=delta of delta + zigzag encoding) (TurboPFor) -size_t p4nzzenc128v8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t p4nzzdec128v8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t p4nzzenc128v16( uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t p4nzzdec128v16( unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t p4nzzenc128v32( uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t p4nzzdec128v32( unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t p4nzzenc128v64( uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t p4nzzdec128v64( unsigned char *in, size_t n, uint64_t *out, uint64_t start); - -//----------- Zigzag (bit/io) ------------------------------------------------------- -size_t bvzenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t bvzdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t bvzenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t bvzdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t bvzenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t bvzdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t bvzenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t bvzdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start); -//----------- Zigzag of delta (bit/io) --------------------------------------------- -size_t bvzzenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t bvzzdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t bvzzenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t bvzzdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t bvzzenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t bvzzdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t bvzzenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t bvzzdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start); - -//----------- TurboGorilla : Improved gorilla style + RLE (bit/io) ------------------ -size_t fpgenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t fpgdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t fpgenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t fpgdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t fpgenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t fpgdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t fpgenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t fpgdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start); - -//----------- TurboFloat XOR : Last value predictor (TurboPFor) --------------------- -size_t fpxenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t fpxdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t fpxenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t fpxdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t fpxenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t fpxdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t fpxenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t fpxdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start); - -//----------- TurboFloat FCM: Finite Context Method Predictor (TurboPFor) ----------- -size_t fpfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t fpfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t fpfcmenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t fpfcmdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t fpfcmenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t fpfcmdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t fpfcmenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t fpfcmdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start); - -//----------- TurboFloat DFCM: Differential Finite Context Method Predictor (TurboPFor) -size_t fpdfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t fpdfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t fpdfcmenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t fpdfcmdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t fpdfcmenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t fpdfcmdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t fpdfcmenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t fpdfcmdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start); - -//----------- TurboFloat 2D DFCM: Differential Finite Context Method Predictor ----- -size_t fp2dfcmenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t fp2dfcmdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t fp2dfcmenc16(uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t fp2dfcmdec16(unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t fp2dfcmenc32(uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t fp2dfcmdec32(unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t fp2dfcmenc64(uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t fp2dfcmdec64(unsigned char *in, size_t n, uint64_t *out, uint64_t start); - -/*/-------------- delta (=zigzag). Same as p4zenc ------------------------------------ -size_t fppenc8( uint8_t *in, size_t n, unsigned char *out, uint8_t start); -size_t fppdec8( unsigned char *in, size_t n, uint8_t *out, uint8_t start); -size_t fppenc16( uint16_t *in, size_t n, unsigned char *out, uint16_t start); -size_t fppdec16( unsigned char *in, size_t n, uint16_t *out, uint16_t start); -size_t fppenc32( uint32_t *in, size_t n, unsigned char *out, uint32_t start); -size_t fppdec32( unsigned char *in, size_t n, uint32_t *out, uint32_t start); -size_t fppenc64( uint64_t *in, size_t n, unsigned char *out, uint64_t start); -size_t fppdec64( unsigned char *in, size_t n, uint64_t *out, uint64_t start);*/ - -#ifdef __cplusplus -} -#endif diff --git a/sse_neon.h b/sse_neon.h deleted file mode 100755 index bfbc3f8..0000000 --- a/sse_neon.h +++ /dev/null @@ -1,355 +0,0 @@ -/** - Copyright (C) powturbo 2013-2021 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// Intel SSE to ARM NEON optimized for maximum speed (and compatibility gcc/clang) with possible minor changes to the source code - -#ifndef _SSE_NEON_H_ -#define _SSE_NEON_H_ -#include "conf.h" - -#ifdef __ARM_NEON //------------------------------------------------------------------------------------------------------------------ -#include -#define __m128i uint32x4_t // int32x4_t can also be used -#define __m128 float32x4_t - -//#define USE_MACROS -#define uint8x16_to_8x8x2(_u_) ((uint8x8x2_t) { vget_low_u8(_u_), vget_high_u8(_u_) }) - - #ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ---------------------------------------------------------- -#define _mm_set_epi8(u15,u14,u13,u12,\ - u11,u10, u9, u8,\ - u7,u6,u5,u4,\ - u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (uint32x4_t)vld1q_u8( _u);}) -#define _mm_set_epi16( u7,u6,u5,u4,\ - u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (uint32x4_t)vld1q_u16(_u);}) -//#define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; vld1q_u32(_u);}) -//#define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (uint32x4_t)vld1q_u64(_u);}) -#define _mm_set_epi32(u3, u2, u1, u0) vcombine_u32(vcreate_u32((uint64_t)u1 << 32 | u0), vcreate_u32((uint64_t)u3 << 32 | u2)) -#define _mm_set_epi64x(u1, u0) (__m128i)vcombine_u64(vcreate_u64(u0), vcreate_u64(u1)) - - #else -static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8, - uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4, - uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) { - uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (uint32x4_t)vld1q_u8( u); } -static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4, - uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (uint32x4_t)vld1q_u16(u); } -static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return vld1q_u32(u); } -static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (uint32x4_t)vld1q_u64(u); } - #endif - -#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16( u0,u1,u2,u3,u4,u5,u6,u7) -#define _mm_setr_epi32(u3,u2,u1,u0) _mm_set_epi32( u0,u1,u2,u3) -#define _mm_setr_epi64x(u1,u0) _mm_set_epi64x(u0,u0) - -#define _mm_set1_epi8( _u8_ ) (__m128i)vdupq_n_u8( _u8_ ) -#define _mm_set1_epi16( _u16_) (__m128i)vdupq_n_u16(_u16_) -#define _mm_set1_epi32( _u32_) vdupq_n_u32(_u32_) -#define _mm_set1_epi64x(_u64_) (__m128i)vdupq_n_u64(_u64_) -#define _mm_setzero_si128() vdupq_n_u32( 0 ) - -#define _mm_cvtss_f32(_u_) vgetq_lane_f32((float32x4_t)(_u_), 0) -#define _mm_setzero_ps() (__m128)vdupq_n_f32(0) -#define _mm_set1_ps(_f32_) (__m128)vdupq_n_f32(_f32_) -//---------------------------------------------- Arithmetic ----------------------------------------------------------------------- -#define _mm_add_epi8( _u_,_v_) (__m128i)vaddq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_)) -#define _mm_add_epi16( _u_,_v_) (__m128i)vaddq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_)) -#define _mm_add_epi32( _u_,_v_) vaddq_u32( _u_, _v_ ) -#define _mm_sub_epi8( _u_,_v_) (__m128i)vsubq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_)) -#define _mm_sub_epi16( _u_,_v_) (__m128i)vsubq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_)) -#define _mm_sub_epi32( _u_,_v_) (__m128i)vsubq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_)) -#define _mm_subs_epu8( _u_,_v_) (__m128i)vqsubq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_)) - -#define _mm_mullo_epi16(_u_,_v_) (__m128i)vmulq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) -#define _mm_mullo_epi32(_u_,_v_) (__m128i)vmulq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_)) -#define mm_mullo_epu32(_u_,_v_) vmulq_u32(_u_,_v_) - -#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)vqdmulhq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) //only for small values?? -static ALWAYS_INLINE __m128i _mm_mulhi_epi16(__m128i u, __m128i v) { - int32x4_t lo = vmull_s16(vget_low_s16( (int16x8_t)(u)), vget_low_s16( (int16x8_t)(v))); - int32x4_t hi = vmull_s16(vget_high_s16((int16x8_t)(u)), vget_high_s16((int16x8_t)(v))); - uint16x8x2_t a = vuzpq_u16((uint16x8_t)(lo), (uint16x8_t)(hi)); - return (__m128i)(vreinterpretq_s32_u16(a.val[1])); -} -#define _mm_mul_epu32( _u_,_v_) (__m128i)vmull_u32(vget_low_u32(_u_),vget_low_u32(_v_)) -#define _mm_adds_epu16( _u_,_v_) (__m128i)vqaddq_u16((uint16x8_t)(_u_),(uint16x8_t)(_v_)) -static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) { - int32x4_t mlo = vmull_s16(vget_low_s16( (int16x8_t)u), vget_low_s16( (int16x8_t)v)), - mhi = vmull_s16(vget_high_s16((int16x8_t)u), vget_high_s16((int16x8_t)v)); - int32x2_t alo = vpadd_s32(vget_low_s32(mlo), vget_high_s32(mlo)), - ahi = vpadd_s32(vget_low_s32(mhi), vget_high_s32(mhi)); - return (__m128i)vcombine_s32(alo, ahi); -} -//---------------------------------------------- Special math functions ----------------------------------------------------------- -#define _mm_min_epu8( _u_,_v_) (__m128i)vminq_u8( (uint8x16_t)(_u_), (uint8x16_t)(_v_)) -#define _mm_min_epu16( _u_,_v_) (__m128i)vminq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_)) -#define _mm_min_epi16( _u_,_v_) (__m128i)vminq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) -//---------------------------------------------- Logical -------------------------------------------------------------------------- -#define mm_testnz_epu32(_u_) vmaxvq_u32(_u_) //vaddvq_u32(_u_) -#define mm_testnz_epu8( _u_) vmaxv_u8(_u_) -#define _mm_or_si128( _u_,_v_) (__m128i)vorrq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_)) -#define _mm_and_si128( _u_,_v_) (__m128i)vandq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_)) -#define _mm_xor_si128( _u_,_v_) (__m128i)veorq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_)) -//---------------------------------------------- Shift ---------------------------------------------------------------------------- -#define mm_slli_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)> 7?vdupq_n_u8( 0):vshlq_n_u8( (uint8x16_t)(_u_), (_c_)))) // parameter c MUST be a constant / vshlq_n_u8: __constrange(0-(N-1)) -#define mm_slli_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u16(0):vshlq_n_u16((uint16x8_t)(_u_), (_c_)))) -#define mm_slli_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>31?vdupq_n_u32(0):vshlq_n_u32((uint32x4_t)(_u_), (_c_)))) -#define mm_slli_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>63?vdupq_n_u64(0):vshlq_n_u64((uint64x2_t)(_u_), (_c_)))) -#define _mm_slli_si128( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u8( 0):vextq_u8(vdupq_n_u8(0), (uint8x16_t)(_u_), 16-(_c_) )) ) // vextq_u8: __constrange(0-15) - -#define mm_srli_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)> 7?vdupq_n_u8( 0):vshrq_n_u8( (uint8x16_t)(_u_), (_c_)))) // vshrq_n: __constrange(1-N) -#define mm_srli_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u16(0):vshrq_n_u16((uint16x8_t)(_u_), (_c_)))) -#define mm_srli_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>31?vdupq_n_u32(0):vshrq_n_u32((uint32x4_t)(_u_), (_c_)))) -#define mm_srli_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>63?vdupq_n_u64(0):vshlq_n_u64((uint64x2_t)(_u_), (_c_)))) -#define _mm_srli_si128( _u_,_c_) (__m128i)((_c_)<1?(_u_):((_c_)>15?vdupq_n_u8(0):vextq_u8((uint8x16_t)(_u_), vdupq_n_u8(0), (_c_) )) ) // vextq_u8: __constrange(0-15) - -#define mm_srai_epi8( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s8( (int8x16_t)(_u_), (_c_))) // c <= 8 (vshrq_n:1-N) -#define mm_srai_epi16( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s16((int16x8_t)(_u_), (_c_))) // c <= 16 -#define mm_srai_epi32( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s32((int32x4_t)(_u_), (_c_))) // c <= 32 -#define mm_srai_epi64( _u_,_c_) (__m128i)((_c_)<1?(_u_):vshrq_n_s64((int64x2_t)(_u_), (_c_))) // c <= 64 - -#define _mm_slli_epi8( _u_,_m_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vdupq_n_s8( (_m_))) // parameter c integer constant/variable -#define _mm_slli_epi16( _u_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vdupq_n_s16( (_m_))) -#define _mm_slli_epi32( _u_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vdupq_n_s32( (_m_))) -#define _mm_slli_epi64( _u_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vdupq_n_s64( (_m_))) - -#define _mm_srli_epi8( _u_,_m_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vdupq_n_s8( -(_m_))) -#define _mm_srli_epi16( _u_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vdupq_n_s16(-(_m_))) -#define _mm_srli_epi32( _u_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vdupq_n_s32(-(_m_))) -#define _mm_srli_epi64( _u_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vdupq_n_s64(-(_m_))) - -#define _mm_srai_epi8( _u_,_m_) (__m128i)vshlq_s8( (int8x16_t)(_u_), vdupq_n_s8( -(_m_))) -#define _mm_srai_epi16( _u_,_m_) (__m128i)vshlq_s16((int16x8_t)(_u_), vdupq_n_s16(-(_m_))) -#define _mm_srai_epi32( _u_,_m_) (__m128i)vshlq_s32((int32x4_t)(_u_), vdupq_n_s32(-(_m_))) -#define _mm_srai_epi64( _u_,_m_) (__m128i)vshlq_s64((int64x2_t)(_u_), vdupq_n_s64(-(_m_))) - -#define _mm_sll_epi8( _u_,_v_) (__m128i)vshlq_s8( (int8x16_t)(_u_), (int8x16_t)(_v_)) //_v_:all lanes equal -#define _mm_sll_epi16( _u_,_v_) (__m128i)vshlq_s16( (int16x8_t)(_u_), (int16x8_t)(_v_)) -#define _mm_sll_epi32( _u_,_v_) (__m128i)vshlq_s32( (int32x4_t)(_u_), (int32x4_t)(_v_)) -#define _mm_sll_epi64( _u_,_v_) (__m128i)vshlq_s64( (int64x2_t)(_u_), (int64x2_t)(_v_)) - -#define _mm_srl_epi8( _u_,_v_) (__m128i)vshrq_s8( (int8x16_t)(_u_), (int8x16_t)(_v_)) -#define _mm_srl_epi16( _u_,_v_) (__m128i)vshrq_s16( (int16x8_t)(_u_), (int16x8_t)(_v_)) -#define _mm_srl_epi32( _u_,_v_) (__m128i)vshrq_s32( (int32x4_t)(_u_), (int32x4_t)(_v_)) -#define _mm_srl_epi64( _u_,_v_) (__m128i)vshrq_s64( (int64x2_t)(_u_), (int64x2_t)(_v_)) - -#define _mm_sllv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_)) //variable shift -#define _mm_srlv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vnegq_s32((int32x4_t)(_v_))) -//---------------------------------------------- Compare --------- true/false->1/0 (all bits set) --------------------------------- -#define _mm_cmpeq_epi8( _u_,_v_) (__m128i)vceqq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_)) -#define _mm_cmpeq_epi16( _u_,_v_) (__m128i)vceqq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) -#define _mm_cmpeq_epi32( _u_,_v_) (__m128i)vceqq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_)) - -#define _mm_cmpgt_epi8( _u_,_v_) (__m128i)vcgtq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_)) -#define _mm_cmpgt_epi16( _u_,_v_) (__m128i)vcgtq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) -#define _mm_cmpgt_epi32( _u_,_v_) (__m128i)vcgtq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_)) - -#define _mm_cmpgt_epu16( _u_,_v_) (__m128i)vcgtq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_)) -#define mm_cmpgt_epu32( _u_,_v_) (__m128i)vcgtq_u32( _u_, _v_) -//---------------------------------------------- Load ----------------------------------------------------------------------------- -#define _mm_loadl_epi64( _u64p_) (__m128i)vcombine_s32(vld1_s32((int32_t const *)(_u64p_)), vcreate_s32(0)) -#define mm_loadu_epi64p(_u64p_,_u_) (__m128i)vld1q_lane_u64((uint64_t *)(_u64p_), (uint64x2_t)(_u_), 0) -#define _mm_loadu_si128( _ip_) vld1q_u32(_ip_) -#define _mm_load_si128( _ip_) vld1q_u32(_ip_) - -#define _mm_load_ps( _ip_) (__m128)vld1q_f32((float32_t *)(_ip_)) -#define _mm_loadu_ps( _ip_) (__m128)vld1q_f32((float32_t *)(_ip_)) -#define _mm_load1_ps( _ip_) (__m128)vld1q_dup_f32((float32_t *)(_p_)) -#define _mm_loadl_pi(_u_,_ip_) (__m128)vcombine_f32((float32x2_t)vld1_f32((float32_t *)(_ip)), (float32x2_t)vget_high_f32(_u_)) -#define _mm_loadh_pi(_u_,_ip_) (__m128)vcombine_f32((float32x2_t)vget_low_f32(_u_), (float32x2_t)vld1_f32((const float *)(_ip_))) -//---------------------------------------------- Store ---------------------------------------------------------------------------- -#define _mm_storel_epi64(_ip_,_u_) vst1q_lane_u64((uint64_t *)(_ip_), (uint64x2_t)(_u_), 0) -#define _mm_storeu_si128(_ip_,_u_) vst1q_u32((__m128i *)(_ip_), _u_) - -#define _mm_store_ps( _ip_,_u_) vst1q_f32( (float32_t *)(_ip_), (float32x4_t)(_u_)) -#define _mm_storeu_ps( _ip_,_u_) vst1q_f32( (float32_t *)(_ip_), (float32x4_t)(_u_)) -#define _mm_store_ss( _ip_,_u_) vst1q_lane_f32((float32_t *)(_ip_), (float32x4_t)(_u_), 0) -//---------------------------------------------- Convert -------------------------------------------------------------------------- -#define mm_cvtsi64_si128p(_u64p_,_u_) mm_loadu_epi64p(_u64p_,_u_) -#define _mm_cvtsi64_si128(_u_) (__m128i)vdupq_n_u64(_u_) //vld1q_s64(_u_) -//---------------------------------------------- Reverse bits/bytes --------------------------------------------------------------- -#define mm_rbit_epi8(_v_) (__m128i)vrbitq_u8( (uint8x16_t)(_v_)) // reverse bits -#define mm_rev_epi16(_v_) vrev16q_u8((uint8x16_t)(_v_)) // reverse bytes -#define mm_rev_epi32(_v_) vrev32q_u8((uint8x16_t)(_v_)) -#define mm_rev_epi64(_v_) vrev64q_u8((uint8x16_t)(_v_)) -//--------------------------------------------- Insert/extract -------------------------------------------------------------------- -#define mm_extract_epi32x(_u_,_u32_,_id_) vst1q_lane_u32((uint32_t *)&(_u32_), _u_, _id_) -#define _mm_extract_epi64x(_u_,_u64_,_id_) vst1q_lane_u64((uint64_t *)&(_u64_), (uint64x2_t)(_u_), _id_) - -#define _mm_extract_epi8( _u_, _id_) vgetq_lane_u8( (uint8x16_t)(_u_), _id_) -#define _mm_extract_epi16(_u_, _id_) vgetq_lane_u16(_u_, _id_) -#define _mm_extract_epi32(_u_, _id_) vgetq_lane_u32(_u_, _id_) -#define mm_extract_epu32(_u_, _id_) vgetq_lane_u32(_u_, _id_) -#define _mm_cvtsi128_si32(_u_) vgetq_lane_u32((uint32x4_t)(_u_),0) -#define _mm_cvtsi128_si64(_u_) vgetq_lane_u64((uint64x2_t)(_u_),0) - -#define _mm_insert_epu32p(_u_,_u32p_,_id_) vsetq_lane_u32(_u32p_, _u_, _id_) -#define mm_insert_epi32p(_u_,_u32p_,_id_) vld1q_lane_u32(_u32p_, (uint32x4_t)(_u_), _id_) -#define _mm_cvtsi32_si128(_x_) (__m128i)vsetq_lane_s32(_x_, vdupq_n_s32(0), 0) - -#define _mm_blendv_epi8(_u_,_v_,_m_) vbslq_u32(_m_,_v_,_u_) -//---------------------------------------------- Miscellaneous -------------------------------------------------------------------- -#define _mm_alignr_epi8(_u_,_v_,_m_) (__m128i)vextq_u8( (uint8x16_t)(_v_), (uint8x16_t)(_u_), _m_) -#define _mm_packs_epi16( _u_,_v_) (__m128i)vcombine_s8( vqmovn_s16((int16x8_t)(_u_)), vqmovn_s16((int16x8_t)(_v_))) -#define _mm_packs_epi32( _u_,_v_) (__m128i)vcombine_s16(vqmovn_s32((int32x4_t)(_u_)), vqmovn_s32((int32x4_t)(_v_))) - -#define _mm_packs_epu16( _u_,_v_) (__m128i)vcombine_u8((uint16x8_t)(_u_), (uint16x8_t)(_v_)) -#define _mm_packus_epi16( _u_,_v_) (__m128i)vcombine_u8(vqmovun_s16((int16x8_t)(_u_)), vqmovun_s16((int16x8_t)(_v_))) - -static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { - const uint8x16_t __attribute__ ((aligned (16))) m = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7}; - uint8x16_t mv = (uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcltq_s8((int8x16_t)v, vdupq_n_s8(0)), m)))); - return vgetq_lane_u8(mv, 8) << 8 | vgetq_lane_u8(mv, 0); -} -//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff) - #ifdef __aarch64__ -static ALWAYS_INLINE uint8_t mm_movemask_epi8s(uint8x8_t sv) { const uint8x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddv_u8( vand_u8( sv, m)); } // short only ARM -//static ALWAYS_INLINE uint16_t mm_movemask_epu16(uint32x4_t v) { const uint16x8_t m = { 1, 1<<2, 1<<4, 1<<6, 1<<8, 1<<10, 1<<12, 1<<14}; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); } -static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { const uint16x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); } -static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { const uint32x4_t m = { 1, 1<<1, 1<<2, 1<<3 }; return vaddvq_u32(vandq_u32((uint32x4_t)v, m)); } -static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { const uint64x2_t m = { 1, 1<<1 }; return vaddvq_u64(vandq_u64((uint64x2_t)v, m)); } - #else -static ALWAYS_INLINE uint32_t mm_movemask_epu32(uint32x4_t v) { const uint32x4_t mask = {1,2,4,8}, av = vandq_u32(v, mask), xv = vextq_u32(av, av, 2), ov = vorrq_u32(av, xv); return vgetq_lane_u32(vorrq_u32(ov, vextq_u32(ov, ov, 3)), 0); } - #endif -// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack ----------------------------------------- -#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_)) - -#define _mm_shuffle_epi8(_u_, _v_) (__m128i)vqtbl1q_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_)) - #if defined(__aarch64__) -#define mm_shuffle_nnnn_epi32(_u_,_m_) (__m128i)vdupq_laneq_u32(_u_, _m_) - #else -#define mm_shuffle_nnnn_epi32(_u_,_m_) (__m128i)vdupq_n_u32(vgetq_lane_u32(_u_, _m_) - #endif - - #ifdef USE_MACROS -#define mm_shuffle_2031_epi32(_u_) ({ uint32x4_t _zv = (uint32x4_t)vrev64q_u32(_u_); uint32x2x2_t _zv = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zv.val[0], _zv.val[1]);}) -#define mm_shuffle_3120_epi32(_u_) ({ uint32x4_t _zv = _u_; _zv = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zv.val[0], _zv.val[1]);}) - #else -static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) { uint32x4_t a = (uint32x4_t)vrev64q_u32(v); uint32x2x2_t z = vtrn_u32(vget_low_u32(a), vget_high_u32(a)); return vcombine_u32(z.val[0], z.val[1]);} -static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) { uint32x2x2_t z = vtrn_u32(vget_low_u32(v), vget_high_u32(v)); return vcombine_u32(z.val[0], z.val[1]);} - #endif - - #if defined(USE_MACROS) || defined(__clang__) -#define _mm_shuffle_epi32(_u_, _m_) ({ const uint32x4_t _av =_u_;\ - uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));\ - _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);\ - _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);\ - _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); _v;\ - }) -#define _mm_shuffle_epi32s(_u_, _m_) _mm_set_epi32(vgetq_lane_u32(_u_, ((_m_) ) & 0x3),\ - vgetq_lane_u32(_u_, ((_m_) >> 2) & 0x3),\ - vgetq_lane_u32(_u_, ((_m_) >> 4) & 0x3),\ - vgetq_lane_u32(_u_, ((_m_) >> 6) & 0x3)) - #else -static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _u_, const unsigned _m_) { const uint32x4_t _av =_u_; - uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3)); - _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1); - _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2); - _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); - return _v; -} -static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _u_, const unsigned _m_) { - return _mm_set_epi32(vgetq_lane_u32(_u_, ((_m_) ) & 0x3), - vgetq_lane_u32(_u_, ((_m_) >> 2) & 0x3), - vgetq_lane_u32(_u_, ((_m_) >> 4) & 0x3), - vgetq_lane_u32(_u_, ((_m_) >> 6) & 0x3)); -} - #endif - #ifdef USE_MACROS -#define _mm_unpacklo_epi8( _u_,_v_) ({ uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}) -#define _mm_unpacklo_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}) -#define _mm_unpacklo_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);}) -#define _mm_unpacklo_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_))) - -#define _mm_unpackhi_epi8( _u_,_v_) ({ uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}) -#define _mm_unpackhi_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}) -#define _mm_unpackhi_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);}) -#define _mm_unpackhi_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))) - #else -static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);} -static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);} -static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]);} -static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_))); } - -static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]); } -static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]); } -static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]); } -static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))); } - #endif - -#else //----------------- intel SSE2/SSSE3 ( wraper functions compatible with intel/arm; permits to have one source code version for arm+intel) -------------- -#define mm_movemask_epu32(_u_) _mm_movemask_ps(_mm_castsi128_ps(_u_)) -#define mm_movemask_epu16(_u_) _mm_movemask_epi8(_u_) -#define mm_loadu_epi64p( _u64p_,_u_) _u_ = _mm_cvtsi64_si128(ctou64(_u64p_)) - -#define mm_extract_epu32( _u_, _id_) _mm_extract_epi32(_u_, _id_) -#define mm_extract_epi32x(_u_,_u32_, _id_) _u32_ = _mm_extract_epi32(_u_, _id_) -#define mm_extract_epi64x(_u_,_u64_, _id_) _u64_ = _mm_extract_epi64(_u_, _id_) -#define mm_insert_epi32p( _u_,_u32p_,_c_) _mm_insert_epi32( _u_,ctou32(_u32p_),_c_) - -#define mm_mullo_epu32( _u_,_v_) _mm_mullo_epi32(_u_,_v_) -#define mm_cvtsi64_si128p(_u64p_,_u_) _u_ = _mm_cvtsi64_si128(ctou64(_u64p_)) - -#define mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000)) //__m128i cv80000000 = _mm_set1_epi32(0x80000000); must be declared -#define mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000)) -#define _mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000))) -#define _mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000))) - -#define mm_shuffle_nnnn_epi32(_u_, _n_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(_n_,_n_,_n_,_n_)) -#define mm_shuffle_2031_epi32(_u_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(2,0,3,1)) -#define mm_shuffle_3120_epi32(_u_) _mm_shuffle_epi32(_u_, _MM_SHUFFLE(3,1,2,0)) - -#define _mm_slli_epi8(_u_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff << _m_), _mm_slli_epi32(_u_, _m_ )) -#define _mm_srli_epi8(_u_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff >> _m_), _mm_srli_epi32(_u_, _m_ )) - -#define mm_slli_epi8( _u_,_c_) _mm_slli_epi8( _u_,_c_) // parameter c MUST be a constant for compatibilty with the arm functions above -#define mm_slli_epi16( _u_,_c_) _mm_slli_epi16(_u_,_c_) -#define mm_slli_epi32( _u_,_c_) _mm_slli_epi32(_u_,_c_) -#define mm_slli_epi64( _u_,_c_) _mm_slli_epi64(_u_,_c_) - -#define mm_srli_epi8( _u_,_c_) _mm_srli_epi8( _u_,_c_) -#define mm_srli_epi16( _u_,_c_) _mm_srli_epi16(_u_,_c_) -#define mm_srli_epi32( _u_,_c_) _mm_srli_epi32(_u_,_c_) -#define mm_srli_epi64( _u_,_c_) _mm_srli_epi64(_u_,_c_) - -#define mm_srai_epi8( _u_,_c_) _mm_srai_epi8( _u_,_c_) -#define mm_srai_epi16( _u_,_c_) _mm_srai_epi16(_u_,_c_) -#define mm_srai_epi32( _u_,_c_) _mm_srai_epi32(_u_,_c_) -#define mm_srai_epi64( _u_,_c_) _mm_srai_epi64(_u_,_c_) - - #ifdef __SSSE3__ -static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) { // reverse bits in bytes - __m128i fv = _mm_set_epi8(15, 7,11, 3,13, 5, 9, 1,14, 6,10, 2,12, 4, 8, 0), cv0f_8 = _mm_set1_epi8(0xf); - __m128i lv = _mm_shuffle_epi8(fv,_mm_and_si128( v, cv0f_8)); - __m128i hv = _mm_shuffle_epi8(fv,_mm_and_si128( mm_srli_epi64(v, 4), cv0f_8)); - return _mm_or_si128( mm_slli_epi64(lv,4), hv); -} - -static ALWAYS_INLINE __m128i mm_rev_epi16(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // reverse vector bytes in uint??_t -static ALWAYS_INLINE __m128i mm_rev_epi32(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3)); } -static ALWAYS_INLINE __m128i mm_rev_epi64(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7)); } -static ALWAYS_INLINE __m128i mm_rev_si128(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15)); } - #endif - #endif -#endif - diff --git a/time_.h b/time_.h deleted file mode 100644 index d6a3233..0000000 --- a/time_.h +++ /dev/null @@ -1,252 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// time_.h : parameter free high precision time/benchmark functions -#include -#include - - #ifdef _WIN32 -#include - #ifndef sleep -#define sleep(n) Sleep((n) * 1000) - #endif - -typedef unsigned __int64 uint64_t; -typedef unsigned __int64 tm_t; - - #else -#include -#include -#define Sleep(ms) usleep((ms) * 1000) - -typedef struct timespec tm_t; - #endif - -#if defined (__i386__) || defined( __x86_64__ ) - #ifdef _MSC_VER -#include // __rdtsc - #else -#include - #endif - - #ifdef __corei7__ -#define RDTSC_INI(_c_) do { unsigned _cl, _ch; \ - __asm volatile ("cpuid\n\t" \ - "rdtsc\n\t" \ - "mov %%edx, %0\n" \ - "mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \ - "%rax", "%rbx", "%rcx", "%rdx"); \ - _c_ = (uint64_t)_ch << 32 | _cl; \ -} while(0) - -#define RDTSC(_c_) do { unsigned _cl, _ch; \ - __asm volatile("rdtscp\n" \ - "mov %%edx, %0\n" \ - "mov %%eax, %1\n" \ - "cpuid\n\t": "=r" (_ch), "=r" (_cl):: "%rax",\ - "%rbx", "%rcx", "%rdx");\ - _c_ = (uint64_t)_ch << 32 | _cl;\ -} while(0) - #else -#define RDTSC(_c_) do { unsigned _cl, _ch;\ - __asm volatile ("cpuid \n"\ - "rdtsc"\ - : "=a"(_cl), "=d"(_ch)\ - : "a"(0)\ - : "%ebx", "%ecx");\ - _c_ = (uint64_t)_ch << 32 | _cl;\ -} while(0) -#define RDTSC_INI(_c_) RDTSC(_c_) - #endif -#else -#define RDTSC_INI(_c_) -#define RDTSC(_c_) -#endif - -#define tmrdtscini() ({ uint64_t _c; __asm volatile("" ::: "memory"); RDTSC_INI(_c); _c; }) -#define tmrdtsc() ({ uint64_t _c; RDTSC(_c); _c; }) - -#ifndef TM_F -#define TM_F 1.0 // TM_F=4 -> MI/s -#endif - - #ifdef RDTSC_ON -#define tminit() tmrdtscini() -#define tmtime() tmrdtsc() -#define TM_T CLOCKS_PER_SEC -static double TMBS(unsigned l, double t) { double dt = t, dl = l; return t/l; } -#define TM_C 1000 - - #else -#define TM_C 1 -static double TMBS(unsigned l, double t) { return (l/t)/1000000.0; } - - #ifdef _WIN32 -static LARGE_INTEGER tps; -static tm_t tmtime(void) { - LARGE_INTEGER tm; - tm_t t; - QueryPerformanceCounter(&tm); - return tm.QuadPart; -} - -static tm_t tminit() { tm_t t0,ts; QueryPerformanceFrequency(&tps); t0 = tmtime(); while((ts = tmtime())==t0) {}; return ts; } -static double tmdiff(tm_t start, tm_t stop) { return (double)(stop - start)/tps.QuadPart; } -static int tmiszero(tm_t t) { return !t; } - #else - #ifdef __APPLE__ -#include - #ifndef MAC_OS_X_VERSION_10_12 -#define MAC_OS_X_VERSION_10_12 101200 - #endif -#define CIVETWEB_APPLE_HAVE_CLOCK_GETTIME (defined(__APPLE__) && defined(MAC_OS_X_VERSION_MIN_REQUIRED) && MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_12) - #if !(CIVETWEB_APPLE_HAVE_CLOCK_GETTIME) -#include -#define CLOCK_REALTIME 0 -#define CLOCK_MONOTONIC 0 -int clock_gettime(int /*clk_id*/, struct timespec* t) { - struct timeval now; - int rv = gettimeofday(&now, NULL); - if (rv) return rv; - t->tv_sec = now.tv_sec; - t->tv_nsec = now.tv_usec * 1000; - return 0; -} - #endif - #endif -static tm_t tmtime() { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return tm; } -static double tmdiff(tm_t start, tm_t stop) { return (stop.tv_sec - start.tv_sec) + (double)(stop.tv_nsec - start.tv_nsec)/1e9f; } -static tm_t tminit() { tm_t t0 = tmtime(),t; while(!tmdiff(t = tmtime(),t0)) {}; return t; } -static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); } - #endif -#endif - -//---------------------------------------- bench ---------------------------------------------------------------------- -// for each a function call is repeated until exceeding tm_tx seconds. -// A run duration is always tm_tx seconds -// The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision) - -// sleep after each 8 runs to avoid cpu throttling. -#define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0) - -// benchmark loop -#define TMBEG(_tm_Reps_) { unsigned _tm_r,_tm_c = 0,_tm_R,_tm_Rx = _tm_Reps_,_tm_Rn = _tm_Reps_; double _tm_t;\ - for(tm_rm = tm_rep, tm_tm = DBL_MAX, _tm_R = 0; _tm_R < _tm_Rn; _tm_R++) { tm_t _tm_t0 = tminit(); /*for each run*/\ - for(_tm_r = 0;_tm_r < tm_rm;) { /*repeat tm_rm times */ - -#define TMEND(_len_) \ - _tm_r++; if(tm_tm == DBL_MAX && (_tm_t = tmdiff(_tm_t0, tmtime())) > tm_tx) break;\ - }\ - /*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\ - /*other runs: break the loop only after 'tm_rm' repeats */ \ - _tm_t = tmdiff(_tm_t0, tmtime());\ - /*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\ - if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("[%d,%d] ", tm_rm, _tm_Rn);*/ } tm_tm = _tm_t; _tm_c++; }\ - else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\ - if(tm_verbose) { printf("%8.2f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\ - if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\ - }\ -} - -static unsigned tm_rep = 1<<30, tm_Rep = 3, tm_Rep2 = 3, tm_rm, tm_RepMin = 1, tm_slp = 20, tm_verbose = 2; -static tm_t tm_0, tm_T; -static double tm_tm, tm_tx = 1, tm_TX = 60; - -static void tm_init(int _tm_Rep, int _tm_verbose) { tm_verbose = _tm_verbose; if(_tm_Rep) tm_Rep = _tm_Rep; } - -#define TMBENCH(_name_, _func_, _len_) do { if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\ - TMBEG(tm_Rep) _func_; TMEND(_len_); \ - double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_, dm*TM_C/dr) );\ -} while(0) - -// second TMBENCH. Example: use TMBENCH for encoding and TMBENCH2 for decoding -#define TMBENCH2(_name_, _func_, _len_) do { \ - TMBEG(tm_Rep2) _func_; TMEND(_len_);\ - double dm = tm_tm, dr = tm_rm; if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_, dm*TM_C/dr) );\ - if(tm_verbose>1) printf("%s ", _name_?_name_:#_func_);\ -} while(0) - -// Check -#define TMBENCHT(_name_,_func_, _len_, _res_) do { \ - TMBEG(tm_Rep) \ - if(_func_ != _res_) { printf("ERROR: %lld != %lld", (long long)_func_, (long long)_res_ ); exit(0); };\ - TMEND(_len_);\ - if(tm_verbose) printf("%8.2f \b\b\b\b\b", TMBS(_len_,(double)tm_tm*TM_C/(double)tm_rm) );\ - if(tm_verbose) printf("%s ", _name_?_name_:#_func_ );\ -} while(0) -//---------------------------------------------------------------------------------------------------------------------------------- -#define Kb (1u<<10) -#define Mb (1u<<20) -#define Gb (1u<<30) -#define KB 1000 -#define MB 1000000 -#define GB 1000000000 - -static unsigned argtoi(char *s, unsigned def) { - char *p; - unsigned n = strtol(s, &p, 10),f = 1; - switch(*p) { - case 'K': f = KB; break; - case 'M': f = MB; break; - case 'G': f = GB; break; - case 'k': f = Kb; break; - case 'm': f = Mb; break; - case 'g': f = Gb; break; - case 'B': return n; break; - case 'b': def = 0; - default: if(!def) return n>=32?0xffffffffu:(1u << n); f = def; - } - return n*f; -} -static uint64_t argtol(char *s) { - char *p; - uint64_t n = strtol(s, &p, 10),f=1; - switch(*p) { - case 'K': f = KB; break; - case 'M': f = MB; break; - case 'G': f = GB; break; - case 'k': f = Kb; break; - case 'm': f = Mb; break; - case 'g': f = Gb; break; - case 'B': return n; break; - case 'b': return 1u << n; - default: f = MB; - } - return n*f; -} - -static uint64_t argtot(char *s) { - char *p; - uint64_t n = strtol(s, &p, 10),f=1; - switch(*p) { - case 'h': f = 3600000; break; - case 'm': f = 60000; break; - case 's': f = 1000; break; - case 'M': f = 1; break; - default: f = 1000; - } - return n*f; -} - -static void memrcpy(unsigned char *out, unsigned char *in, unsigned n) { int i; for(i = 0; i < n; i++) out[i] = ~in[i]; } - diff --git a/transpose.h b/transpose.h deleted file mode 100644 index 5f4f1e6..0000000 --- a/transpose.h +++ /dev/null @@ -1,113 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// transpose.h - Byte/Nibble transpose for further compressing with lz77 or other compressors -#ifdef __cplusplus -extern "C" { -#endif -// Syntax -// in : Input buffer -// n : Total number of bytes in input buffer -// out : output buffer -// esize : element size in bytes (ex. 2, 4, 8,... ) - -//---------- High level functions with dynamic cpu detection and JIT scalar/sse/avx2 switching -void tpenc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // tranpose -void tpdec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); // reverse transpose - -void tp2denc(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize); //2D transpose -void tp2ddec(unsigned char *in, unsigned x, unsigned y, unsigned char *out, unsigned esize); -void tp3denc(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); //3D transpose -void tp3ddec(unsigned char *in, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); -void tp4denc(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); //4D transpose -void tp4ddec(unsigned char *in, unsigned w, unsigned x, unsigned y, unsigned z, unsigned char *out, unsigned esize); - -// Nibble transpose SIMD (SSE2,AVX2, ARM Neon) -void tp4enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); -void tp4dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); - -// bit transpose -//void tp1enc( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); -//void tp1dec( unsigned char *in, unsigned n, unsigned char *out, unsigned esize); - -//---------- Low level functions ------------------------------------ -void tpenc2( unsigned char *in, unsigned n, unsigned char *out); // scalar -void tpenc3( unsigned char *in, unsigned n, unsigned char *out); -void tpenc4( unsigned char *in, unsigned n, unsigned char *out); -void tpenc8( unsigned char *in, unsigned n, unsigned char *out); -void tpenc16( unsigned char *in, unsigned n, unsigned char *out); - -void tpdec2( unsigned char *in, unsigned n, unsigned char *out); -void tpdec3( unsigned char *in, unsigned n, unsigned char *out); -void tpdec4( unsigned char *in, unsigned n, unsigned char *out); -void tpdec8( unsigned char *in, unsigned n, unsigned char *out); -void tpdec16( unsigned char *in, unsigned n, unsigned char *out); - -void tpenc128v2( unsigned char *in, unsigned n, unsigned char *out); // sse2 -void tpdec128v2( unsigned char *in, unsigned n, unsigned char *out); -void tpenc128v4( unsigned char *in, unsigned n, unsigned char *out); -void tpdec128v4( unsigned char *in, unsigned n, unsigned char *out); -void tpenc128v8( unsigned char *in, unsigned n, unsigned char *out); -void tpdec128v8( unsigned char *in, unsigned n, unsigned char *out); - -void tp4enc128v2( unsigned char *in, unsigned n, unsigned char *out); -void tp4dec128v2( unsigned char *in, unsigned n, unsigned char *out); -void tp4enc128v4( unsigned char *in, unsigned n, unsigned char *out); -void tp4dec128v4( unsigned char *in, unsigned n, unsigned char *out); -void tp4enc128v8( unsigned char *in, unsigned n, unsigned char *out); -void tp4dec128v8( unsigned char *in, unsigned n, unsigned char *out); - -void tp1enc128v2( unsigned char *in, unsigned n, unsigned char *out); -void tp1dec128v2( unsigned char *in, unsigned n, unsigned char *out); -void tp1enc128v4( unsigned char *in, unsigned n, unsigned char *out); -void tp1dec128v4( unsigned char *in, unsigned n, unsigned char *out); -void tp1enc128v8( unsigned char *in, unsigned n, unsigned char *out); -void tp1dec128v8( unsigned char *in, unsigned n, unsigned char *out); - -void tpenc256v2( unsigned char *in, unsigned n, unsigned char *out); // avx2 -void tpdec256v2( unsigned char *in, unsigned n, unsigned char *out); -void tpenc256v4( unsigned char *in, unsigned n, unsigned char *out); -void tpdec256v4( unsigned char *in, unsigned n, unsigned char *out); -void tpenc256v8( unsigned char *in, unsigned n, unsigned char *out); -void tpdec256v8( unsigned char *in, unsigned n, unsigned char *out); - -void tp4enc256v2( unsigned char *in, unsigned n, unsigned char *out); -void tp4dec256v2( unsigned char *in, unsigned n, unsigned char *out); -void tp4enc256v4( unsigned char *in, unsigned n, unsigned char *out); -void tp4dec256v4( unsigned char *in, unsigned n, unsigned char *out); -void tp4enc256v8( unsigned char *in, unsigned n, unsigned char *out); -void tp4dec256v8( unsigned char *in, unsigned n, unsigned char *out); - -//------- CPU instruction set -// cpuiset = 0: return current simd set, -// cpuiset != 0: set simd set 0:scalar, 20:sse2, 52:avx2 -unsigned cpuini(unsigned cpuiset); - -// convert simd set to string "sse3", "sse3", "sse4.1" or "avx2" -// Ex.: printf("current cpu set=%s\n", cpustr(cpuini(0)) ); -char *cpustr(unsigned cpuisa); - -unsigned cpuisa(void); -#ifdef __cplusplus -} -#endif diff --git a/trle.h b/trle.h deleted file mode 100644 index f40e65d..0000000 --- a/trle.h +++ /dev/null @@ -1,72 +0,0 @@ -/** - Copyright (C) powturbo 2015-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - email : powturbo [AT] gmail.com - - github : https://github.com/powturbo - - homepage : https://sites.google.com/site/powturbo/ - - twitter : https://twitter.com/powturbo - - TurboRLE - "Most efficient and fastest Run Length Encoding" -**/ -#if defined(_MSC_VER) && _MSC_VER < 1600 -#include "vs/stdint.h" -#else -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif -// RLE with specified escape char -unsigned _srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e); -unsigned _srled8( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint8_t e); - -unsigned _srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e); -unsigned _srled16(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint16_t e); - -unsigned _srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e); -unsigned _srled32(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint32_t e); - -unsigned _srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e); -unsigned _srled64(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint64_t e); - -// functions w/ overflow handling -unsigned srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e); -unsigned srled8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint8_t e); - -unsigned srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e); -unsigned srled16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint16_t e); - -unsigned srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e); -unsigned srled32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint32_t e); - -unsigned srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e); -unsigned srled64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint64_t e); - -// RLE w. automatic escape char determination -unsigned srlec( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out); -unsigned _srled( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen); -unsigned srled( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen); - -// Turbo RLE -unsigned trlec( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out); -unsigned _trled( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen); -unsigned trled( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen); -#ifdef __cplusplus -} -#endif diff --git a/vint.h b/vint.h deleted file mode 100644 index 9cc6174..0000000 --- a/vint.h +++ /dev/null @@ -1,401 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Integer Compression" variable byte include header (scalar TurboVByte+ SIMD TurboByte) -#ifndef _VINT_H_ -#define _VINT_H_ - -#ifdef __cplusplus -extern "C" { -#endif - - #ifdef VINT_IN -#include "conf.h" -//----------------------------------- Variable byte: single value macros (low level) ----------------------------------------------- -//------------- 32 bits ------------- -extern unsigned char _vtab32_[]; -#define _vbxvlen32(_x_) _vtab32_[(unsigned char)(_x_)>>4] // (clz32((_x_) ^ 0xff) - 23) // -#define _vbxlen32(_x_) ((bsr32(_x_|1)+6)/7) - -#define _vbxput32(_op_, _x_, _act_) {\ - if(likely((_x_) < (1<< 7))) { *_op_++ = _x_; _act_;}\ - else if(likely((_x_) < (1<<14))) { ctou16(_op_) = bswap16((_x_) | 0x8000u); _op_ += 2; _act_;}\ - else if(likely((_x_) < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0u; ctou16(_op_) = _x_; _op_ += 2; _act_;}\ - else if(likely((_x_) < (1<<28))) { ctou32(_op_) = bswap32((_x_) | 0xe0000000u); _op_ += 4; _act_;}\ - else { *_op_++ = (unsigned long long)(_x_) >> 32 | 0xf0u; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ -} - -#define _vbxget32(_ip_, _x_, _act_) do { _x_ = (unsigned)(*_ip_++);\ - if(!(_x_ & 0x80u)) { _act_;}\ - else if(!(_x_ & 0x40u)) { _x_ = bswap16(ctou16(_ip_ - 1) & 0xff3fu); _ip_++; _act_;}\ - else if(!(_x_ & 0x20u)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\ - else if(!(_x_ & 0x10u)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0fu); _ip_ += 3; _act_;}\ - else { _x_ = (unsigned long long)((_x_) & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\ -} while(0) - -//------------- 64 bits ----------- -#define _vbxlen64(_x_) ((bsr64(_x_)+6)/7) -#define _vbxvlen64(_x_) ((_x_)==0xff?9:clz32((_x_) ^ 0xff) - 23) - -#define _vbxput64(_op_, _x_, _act_) {\ - if(likely(_x_ < (1<< 7))) { *_op_++ = _x_; _act_;}\ - else if(likely(_x_ < (1<<14))) { ctou16(_op_) = bswap16(_x_| 0x8000); _op_ += 2; _act_;}\ - else if(likely(_x_ < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0; ctou16(_op_) = _x_; _op_ += 2; _act_;}\ - else if(likely(_x_ < (1<<28))) { ctou32(_op_) = bswap32(_x_| 0xe0000000); _op_ += 4; _act_;}\ - else if( _x_ < 1ull<<35) { *_op_++ = _x_ >> 32 | 0xf0; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ - else if( _x_ < 1ull<<42) { ctou16(_op_) = bswap16(_x_ >> 32 | 0xf800); _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ - else if( _x_ < 1ull<<49) { *_op_++ = _x_ >> 48 | 0xfc; ctou16(_op_) = _x_ >> 32; _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ - else if( _x_ < 1ull<<56) { ctou64(_op_) = bswap64(_x_ | 0xfe00000000000000ull); _op_ += 8; _act_;}\ - else { *_op_++ = 0xff; ctou64(_op_) = _x_; _op_ += 8; _act_;}\ -} - -#define _vbxget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ - if(!(_x_ & 0x80)) { _act_;}\ - else if(!(_x_ & 0x40)) { _x_ = bswap16(ctou16(_ip_++-1) & 0xff3f); _act_;}\ - else if(!(_x_ & 0x20)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\ - else if(!(_x_ & 0x10)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0f); _ip_ += 3; _act_;}\ - else if(!(_x_ & 0x08)) { _x_ = (_x_ & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\ - else if(!(_x_ & 0x04)) { _x_ = (unsigned long long)(bswap16(ctou16(_ip_-1)) & 0x7ff) << 32 | ctou32(_ip_+1); _ip_ += 5; _act_;}\ - else if(!(_x_ & 0x02)) { _x_ = (_x_ & 0x03)<<48 | (unsigned long long)ctou16(_ip_) << 32 | ctou32(_ip_+2); _ip_ += 6; _act_;}\ - else if(!(_x_ & 0x01)) { _x_ = bswap64(ctou64(_ip_-1)) & 0x01ffffffffffffffull; _ip_ += 7; _act_;}\ - else { _x_ = ctou64(_ip_); _ip_ += 8; _act_;}\ -} while(0) - -#define vbxput64(_op_, _x_) { unsigned long long _x = _x_; _vbxput64(_op_, _x, ;); } -#define vbxput32(_op_, _x_) { register unsigned _x = _x_; _vbxput32(_op_, _x, ;); } -#define vbxput16(_op_, _x_) vbxput32(_op_, _x_) -#define vbxput8( _op_, _x_) (*_op_++ = _x_) - -#define vbxget64(_ip_, _x_) _vbxget64(_ip_, _x_, ;) -#define vbxget32(_ip_, _x_) _vbxget32(_ip_, _x_, ;) -#define vbxget16(_ip_, _x_) vbxget32(_ip_,_x_) -#define vbxget8(_ip_, _x_) (_x_ = *_ip_++) -//--------------------------------------------------------------------------- -#define VB_SIZE 64 -#define VB_MAX 254 -#define VB_B2 6 -#define VB_B3 3 -#define VB_BA3 (VB_MAX - (VB_SIZE/8 - 3)) -#define VB_BA2 (VB_BA3 - (1<> 8); *_op_++ = (_x_);*/ _act_; }\ - else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\ - else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\ -} - -#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ - if(likely(_x_ < VB_OFS1)) { _act_ ;}\ - else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \ - else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\ - else { unsigned _b = _x_-VB_BA3; _x_ = ctou32(_ip_) & ((1u << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\ -} while(0) - -#define _vblen64(_x_) _vblen32(_x_) -#define _vbvlen64(_x_) _vbvlen32(_x_) -#define _vbput64(_op_, _x_, _act_) {\ - if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\ - else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\ - else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\ - else { unsigned _b = (bsr64((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou64(_op_) = (_x_); _op_ += _b; _act_;}\ -} - -#define _vbget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ - if(likely(_x_ < VB_OFS1)) { _act_ ;}\ - else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \ - else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\ - else { unsigned _b = _x_-VB_BA3; _x_ = ctou64(_ip_) & ((1ull << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\ -} while(0) - -#ifdef _WIN32 -//#define fgetc_unlocked(_f_) _fgetc_nolock(_f_) -#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_) -#define fgetc_unlocked(_f_) fgetc(_f_) -#else -#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_) //_IO_putc_unlocked(_c_,_f_) -#define fgetc_unlocked(_f_) fgetc(_f_) //_IO_getc_unlocked(_f_) -#endif - -#define leb128put(_op_, _x_) { uint64_t _x = _x_; while(_x > 0x7f) { *_op_++ = _x & 0x7f; _x >>= 7; } *_op_++ = _x | 0x80; } -#define vbfput32(_f_, _x_) ({ uint64_t _x = _x_; while(_x > 0x7f) { fputc_unlocked(_x & 0x7f, _f_); _x >>= 7; } fputc_unlocked(_x | 0x80, _f_); }) - -#define _leb128get(_ip_, _x_, _act_) { unsigned _sft=0; for(_x_=0;;_sft += 7) { unsigned _c = *_ip_++; _x_ += (_c & 0x7f) << _sft; if(_c >= 0x80) { _act_; break; } } } -#define leb128get(_ip_, _x_) vbgetax(_ip_, _x_, ;) -#define vbfget32(_f_ ) ({ unsigned _sft=0,_x=0; for(;;_sft += 7) { unsigned _c = fgetc_unlocked(_f_); if(_c != EOF) { _x += (_c & 0x7f) << _sft; if(_c & 0x80) break; } else { _x = EOF; break; } } _x; }) - -//------------- 16 bits ----------- -#define _vblen16(_x_) _vblen32(_x_) -#define _vbvlen16(_x_) _vbvlen32(_x_) - -#define _vbput16(_op_, _x_, _act_) _vbput32(_op_, _x_, _act_) -#define _vbget16(_ip_, _x_, _act_) _vbget32(_ip_, _x_, _act_) - -#define _vblen8(_x_) 1 -#define _vbvlen8(_x_) 1 -#define _vbput8(_op_, _x_, _act_) { *_op_++ = _x_; _act_; } -#define _vbget8(_ip_, _x_, _act_) { _x_ = *_ip_++; _act_; } -//----------------------------------- Variable byte: single value functions ----------------------------------------------- -// ---- Variable byte length after compression -static inline unsigned vblen16(unsigned short x) { return _vblen16(x); } -static inline unsigned vblen32(unsigned x) { return _vblen32(x); } -static inline unsigned vblen64(uint64_t x) { return _vblen64(x); } - -// ---- Length of compressed value. Input in is the first char of the compressed buffer start (Ex. vbvlen32(in[0]) ) -static inline unsigned vbvlen16(unsigned x) { return _vbvlen32(x); } -static inline unsigned vbvlen32(unsigned x) { return _vbvlen32(x); } -static inline unsigned vbvlen64(unsigned x) { return _vbvlen64(x); } - -//----- encode/decode 16/32/64 single value and advance output/input pointer -#define vbput64(_op_, _x_) { unsigned long long _x = _x_; _vbput64(_op_, _x, ;); } -#define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); } -#define vbput16(_op_, _x_) vbput32(_op_, _x_) -#define vbput8(_op_, _x_) (*_op_++ = _x_) - -#define vbget64(_ip_, _x_) _vbget64(_ip_, _x_, ;) -#define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;) -#define vbget16(_ip_, _x_) vbget32(_ip_,_x_) -#define vbget8(_ip_, _x_) (_x_ = *_ip_++) - #endif -//----------------------------- TurboVByte 'vb':Variable byte + SIMD TurboByte 'v8': array functions ---------------------------------------- -// Encoding/DEcoding: Return value = end of compressed output/input buffer out/in - -//----------------------- Encoding/Decoding unsorted array with n integer values -------------------------- -unsigned char *vbenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); //TurboVByte -unsigned char *vbenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *vbenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); - -//-- Decode -unsigned char *vbdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out); -unsigned char *vbdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); -unsigned char *vbdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out); - -//-- Get value stored at index idx (idx:0...n-1) -unsigned short vbgetx16( unsigned char *__restrict in, unsigned idx); -unsigned vbgetx32( unsigned char *__restrict in, unsigned idx); -uint64_t vbgetx64( unsigned char *__restrict in, unsigned idx); - -//-- Search and return index of next value equal to key or n when no key value found -// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); } -unsigned vbgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key); -unsigned vbgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key); -unsigned vbgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key); - -//---------------------- Delta encoding/decoding sorted array --------------------------------------------- -//-- Increasing integer array. out[i] = out[i-1] + in[i] -unsigned char *vbdenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *vbdenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *vbdenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *vbddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *vbddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *vbddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -//-- Get value stored at index idx (idx:0...n-1) -unsigned short vbdgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start); -unsigned vbdgetx32( unsigned char *__restrict in, unsigned idx, unsigned start); -uint64_t vbdgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start); - -//-- Search and return index of next value equal to key or n when no key value found -// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); } -unsigned vbdgetgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start); -unsigned vbdgetgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start); -unsigned vbdgetgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start); - -//-- Strictly increasing (never remaining constant or decreasing) integer array. out[i] = out[i-1] + in[i] + 1 -unsigned char *vbd1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *vbd1enc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *vbd1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *vbd1dec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - - -//-- Get value stored at index idx (idx:0...n-1) -unsigned short vbd1getx16( unsigned char *__restrict in, unsigned idx, unsigned short start); -unsigned vbd1getx32( unsigned char *__restrict in, unsigned idx, unsigned start); -uint64_t vbd1getx64( unsigned char *__restrict in, unsigned idx, uint64_t start); - -//-- Search and return index of next value equal to key or n when no key value found -// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); } -unsigned vbd1getgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start); -unsigned vbd1getgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start); -unsigned vbd1getgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start); - -//---------------------- Zigzag encoding/decoding for unsorted integer lists. -unsigned char *vbzenc8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start); -unsigned char *vbzenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *vbzenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *vbzenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *vbzdec8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start); -unsigned char *vbzdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *vbzdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *vbzdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -//---------------------- XOR encoding/decoding for unsorted integer lists. -unsigned char *vbxenc8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start); -unsigned char *vbxenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *vbxenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *vbxenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *vbxdec8( unsigned char *__restrict in, unsigned n, unsigned char *__restrict out, unsigned char start); -unsigned char *vbxdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *vbxdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *vbxdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -//---------------------- Delta of delta encoding/decoding for unsorted integer lists. -unsigned char *vbddenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *vbddenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *vbddenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *vbdddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *vbdddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *vbdddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -//-- Get value stored at index idx (idx:0...n-1) -unsigned short vbzgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start); -unsigned vbzgetx32( unsigned char *__restrict in, unsigned idx, unsigned start); -uint64_t vbzgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start); - -//-- Search and return index of next value equal to key or n when no key value found -// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); } -/*unsigned vbzgeteq15( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start); -unsigned vbzgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start); -unsigned vbzgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key, unsigned start); -unsigned vbzgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key, unsigned start);*/ - -//-------------------------- TurboByte (SIMD Group varint) -------------------------------------------------------------- -unsigned char *v8enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); //TurboByte -unsigned char *v8enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); - -unsigned char *v8dec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out); -unsigned char *v8dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); - -//------ delta --------- -unsigned char *v8denc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *v8denc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); - -unsigned char *v8ddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *v8ddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -//------ delta 1 ------- -unsigned char *v8d1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *v8d1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); - -unsigned char *v8d1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *v8d1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -//------- zigzag ------- -unsigned char *v8zenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *v8zenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); - -unsigned char *v8zdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *v8zdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -//------- xor ---------- -unsigned char *v8xenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *v8xenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); - -unsigned char *v8xdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *v8xdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -//-------------------------- TurboByte Hybrid (SIMD Group varint) + Bitpacking ------------------------------------------- -size_t v8nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8nd1enc16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nd1enc32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8nzenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nzenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t v8nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t v8nd1dec16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8nd1dec32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t v8nzdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t v8nxdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8nxdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -//------------- -size_t v8nenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8ndenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8nd1enc128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8nzenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nzenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8ndec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t v8nddec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t v8nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t v8nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -size_t v8nxdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t v8nxdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -//------------- -size_t v8nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nzenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t v8nxenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t v8ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t v8nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t v8nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t v8nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t v8nxdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/vp4.h b/vp4.h deleted file mode 100644 index fae28df..0000000 --- a/vp4.h +++ /dev/null @@ -1,355 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "TurboPFor: Integer Compression" PFor/PForDelta + Direct access -#ifndef VP4_H_ -#define VP4_H_ -#if defined(_MSC_VER) && _MSC_VER < 1600 -#include "vs/stdint.h" -#else -#include -#endif -#include - -#ifdef __cplusplus -extern "C" { -#endif -//************************************************ High level API - n unlimited **************************************************** -// Compress integer array with n values to the buffer out. -// Return value = number of bytes written to compressed buffer out -size_t p4nenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); // SIMD (Vertical bitpacking) -size_t p4nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nenc128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nenc256w32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); - - -size_t p4ndenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4ndenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4ndenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t p4nd1enc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nd1enc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nd1enc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nd1enc128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nd1enc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); - -size_t p4nzenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nzenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nzenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nzenc128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nzenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nzenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -size_t p4nzenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); - -// Decompress the compressed n values in input buffer in to the integer array out. -// Return value = number of bytes read from the ompressed buffer in -size_t p4ndec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t p4ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t p4ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4ndec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -size_t p4ndec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t p4ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4ndec128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -size_t p4ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); - -// Delta minimum = 0 -size_t p4nddec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t p4nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t p4nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nddec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t p4nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nddec256w32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nddec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -// Delta minimum = 1 -size_t p4nd1dec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t p4nd1dec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t p4nd1dec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nd1dec128v16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t p4nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nd1dec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); -//Zigzag -size_t p4nzdec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -size_t p4nzdec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t p4nzdec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nzdec128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -size_t p4nzdec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nzdec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -size_t p4nzdec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); - -//************** Low level API - n limited to 128/256 *************************************** -#define P4D_MAX 256 - -// -------------- TurboPFor: Encode ------------ -//#include -// Low level API: Single block n limited -//compress integer array with n values to the buffer out. Return value = end of compressed buffer out -unsigned char *p4enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4enc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out); // SSE (Vertical bitpacking) -unsigned char *p4enc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4enc128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4enc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out); // AVX2 -unsigned char *p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); - -unsigned char *p4enc256w32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out); - -unsigned char *p4encx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out);// Direct access -unsigned char *p4encx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4encx32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4encx64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); - -unsigned char *p4denc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); -unsigned char *p4denc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4denc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4denc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4denc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4denc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4denc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *p4denc256w32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); - -unsigned char *p4dencx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); // Direct access -unsigned char *p4dencx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4dencx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); - -unsigned char *p4d1enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); -unsigned char *p4d1enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4d1enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4d1enc128v16(uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); // SIMD (Vertical bitpacking) -unsigned char *p4d1enc128v32(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4d1enc256v32(uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4d1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *p4d1encx8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); // Direct access -unsigned char *p4d1encx16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4d1encx32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); - -unsigned char *p4zenc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, uint8_t start); -unsigned char *p4zenc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4zenc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4zenc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4zenc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4zenc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4zenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); - -unsigned char *p4senc16(uint16_t *in, unsigned n, unsigned char *out, uint16_t start); -unsigned char *p4senc32(uint32_t *in, unsigned n, unsigned char *out, uint32_t start); -unsigned char *p4senc64(uint64_t *in, unsigned n, unsigned char *out, uint64_t start); - -unsigned char *p4sdec16(unsigned char *in, unsigned n, uint16_t *out, uint16_t start); -unsigned char *p4sdec32(unsigned char *in, unsigned n, uint32_t *out, uint32_t start); -unsigned char *p4sdec64(unsigned char *in, unsigned n, uint64_t *out, uint64_t start); - -size_t p4nsenc16(uint16_t *in, size_t n, unsigned char *out); -size_t p4nsenc32(uint32_t *in, size_t n, unsigned char *out); -size_t p4nsenc64(uint64_t *in, size_t n, unsigned char *out); - -size_t p4nsdec16(unsigned char *in, size_t n, uint16_t *out); -size_t p4nsdec32(unsigned char *in, size_t n, uint32_t *out); -size_t p4nsdec64(unsigned char *in, size_t n, uint64_t *out); - -// same as p4enc, but with b and bx as parameters. Call after _p4bitsXX -inline unsigned char *_p4enc8( uint8_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4enc16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4enc32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4enc128v16( uint16_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking) -inline unsigned char *_p4enc128v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking) -inline unsigned char *_p4enc128v64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking) -inline unsigned char *_p4enc256v32( uint32_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -// calculate the best bit sizes b and bx, return b. -unsigned _p4bits8( uint8_t *__restrict in, unsigned n, unsigned *pbx); -unsigned _p4bits16( uint16_t *__restrict in, unsigned n, unsigned *pbx); -unsigned _p4bits32( uint32_t *__restrict in, unsigned n, unsigned *pbx); -unsigned _p4bits64( uint64_t *__restrict in, unsigned n, unsigned *pbx); - -unsigned _p4bitsx8( uint8_t *__restrict in, unsigned n, unsigned *pbx); -unsigned _p4bitsx16( uint16_t *__restrict in, unsigned n, unsigned *pbx); -unsigned _p4bitsx32( uint32_t *__restrict in, unsigned n, unsigned *pbx); -unsigned _p4bitsx64( uint64_t *__restrict in, unsigned n, unsigned *pbx); - -#define P4HVE(_out_, _b_, _bx_,_usize_) do { if(!_bx_) *_out_++ = _b_;else if(_bx_ <= _usize_) *_out_++ = 0x80|_b_, *_out_++ = _bx_; else *_out_++= (_bx_ == _usize_+1?0x40:0xc0)|_b_; } while(0) - -#define P4HVE8( _out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_, 8) -#define P4HVE16(_out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_,16) -#define P4HVE32(_out_, _b_, _bx_) P4HVE(_out_, _b_, _bx_,32) -#define P4HVE64(_out_, _b_, _bx_) do { unsigned _c = _b_==64?64-1:_b_; P4HVE(_out_, _c, _bx_,64); } while(0) - -//---------------------------- TurboPFor: Decode -------------------------------------------------------- -// decompress a previously (with p4enc32) bit packed array. Return value = end of packed buffer in -//-- scalar. (see p4getx32 for direct access) -// b and bx specified (not stored within the compressed stream header) -inline unsigned char *_p4dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical BitPacking) -inline unsigned char *_p4dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4dec128v64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx); -inline unsigned char *_p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx); - -unsigned char *p4dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out); -unsigned char *p4dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out); -unsigned char *p4dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out); -unsigned char *p4dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out); // SIMD (Vertical BitPacking) -unsigned char *p4dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out); -unsigned char *p4dec128v64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out); -unsigned char *p4dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out); -unsigned char *p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out); -//------ Delta decoding --------------------------- Return value = end of packed input buffer in --------------------------- -//-- Increasing integer lists. out[i] = out[i-1] + in[i] -// b and bx specified -unsigned char *_p4ddec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx); -unsigned char *_p4ddec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); -unsigned char *_p4ddec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -unsigned char *_p4ddec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); -unsigned char *_p4ddec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -unsigned char *_p4ddec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -unsigned char *_p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx); - -unsigned char *p4ddec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start); -unsigned char *p4ddec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); -unsigned char *p4ddec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4ddec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking) -unsigned char *p4ddec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4ddec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -//-- Strictly increasing (never remaining constant or decreasing) integer lists. out[i] = out[i-1] + in[i] + 1 -// b and bx specified (see idxcr.c/idxqry.c for an example) -unsigned char *_p4d1dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx); -unsigned char *_p4d1dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); -unsigned char *_p4d1dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -unsigned char *_p4d1dec128v16(unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); // SIMD (Vertical BitPacking) -unsigned char *_p4d1dec128v32(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -unsigned char *_p4d1dec256v32(unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -unsigned char *_p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx); - -unsigned char *p4d1dec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start); -unsigned char *p4d1dec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); -unsigned char *p4d1dec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4d1dec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking) -unsigned char *p4d1dec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4d1dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -// ZigZag encoding -inline unsigned char *_p4zdec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start, unsigned b, unsigned bx); -inline unsigned char *_p4zdec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); -inline unsigned char *_p4zdec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -inline unsigned char *_p4zdec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b, unsigned bx); -inline unsigned char *_p4zdec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -inline unsigned char *_p4zdec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start, unsigned b, unsigned bx); -inline unsigned char *_p4zdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start, unsigned b, unsigned bx); - -unsigned char *p4zdec8( unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start); -unsigned char *p4zdec16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); -unsigned char *p4zdec32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4zdec128v16( unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start); // SIMD (Vertical BitPacking) -unsigned char *p4zdec128v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4zdec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4zdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); - -//---------------- Direct Access functions to compressed TurboPFor array p4encx16/p4encx32 ------------------------------------------------------- - #ifdef TURBOPFOR_DAC -#include "conf.h" -#define P4D_PAD8(_x_) ( (((_x_)+8-1)/8) ) -#define P4D_B(_x_) ((_x_) & 0x7f) -#define P4D_XB(_x_) (((_x_) & 0x80)?((_x_) >> 8):0) -#define P4D_ININC(_in_, _x_) _in_ += 1+((_x_) >> 7) - -static inline unsigned p4bits(unsigned char *__restrict in, int *bx) { unsigned i = ctou16(in); *bx = P4D_XB(i); return P4D_B(i); } - -struct p4 { - unsigned long long *xmap; - unsigned char *ex; - unsigned isx,bx,cum[P4D_MAX/64+1]; - int oval,idx; -}; - -static unsigned long long p4xmap[P4D_MAX/64+1] = { 0 }; - -// prepare direct access usage -static inline void p4ini(struct p4 *p4, unsigned char **pin, unsigned n, unsigned *b) { unsigned char *in = *pin; - unsigned p4i = ctou16(in); - p4->isx = p4i&0x80; - *b = P4D_B(p4i); - p4->bx = P4D_XB(p4i); //printf("p4i=%x,b=%d,bx=%d ", p4->i, *b, p4->bx); //assert(n <= P4D_MAX); - *pin = p4->ex = ++in; - if(p4->isx) { - unsigned num=0,j; - unsigned char *p; - ++in; - p4->xmap = (unsigned long long *)in; - for(j=0; j < n/64; j++) { p4->cum[j] = num; num += popcnt64(ctou64(in+j*8)); } - if(n & 0x3f) num += popcnt64(ctou64(in+j*8) & ((1ull<<(n&0x3f))-1) ); - p4->ex = p = in + (n+7)/8; - *pin = p = p4->ex+(((uint64_t)num*p4->bx+7)/8); - } else p4->xmap = p4xmap; - p4->oval = p4->idx = -1; -} - -//---------- Get a single value with index "idx" from a "p4encx32" packed array -static ALWAYS_INLINE uint8_t p4getx8( struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx8( in, idx, b); - if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx8(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<bx) << b; - return u; -} - -static ALWAYS_INLINE uint16_t p4getx16(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx16(in, idx, b); - if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx16(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<bx) << b; - return u; -} -static ALWAYS_INLINE uint32_t p4getx32(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = bitgetx32(in, idx, b); - if(p4->xmap[bi=idx>>6] & (1ull<<(cl=idx&63))) u += bitgetx32(p4->ex, p4->cum[bi] + popcnt64(p4->xmap[bi] & ~(~0ull<bx) << b; - return u; -} - -// Get the next single value greater of equal to val -static ALWAYS_INLINE uint16_t p4geqx8( struct p4 *p4, unsigned char *in, unsigned b, uint8_t val) { do p4->oval += p4getx8( p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; } -static ALWAYS_INLINE uint16_t p4geqx16(struct p4 *p4, unsigned char *in, unsigned b, uint16_t val) { do p4->oval += p4getx16(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; } -static ALWAYS_INLINE uint32_t p4geqx32(struct p4 *p4, unsigned char *in, unsigned b, uint32_t val) { do p4->oval += p4getx32(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; } - -/* DO NOT USE : like p4dec32 but using direct access. This is only a demo showing direct access usage. Use p4dec32 instead for decompressing entire blocks */ -unsigned char *p4decx32( unsigned char *in, unsigned n, uint32_t *out); // unsorted -unsigned char *p4fdecx32( unsigned char *in, unsigned n, uint32_t *out, uint32_t start); // FOR increasing -unsigned char *p4f1decx32( unsigned char *in, unsigned n, uint32_t *out, uint32_t start); // FOR strictly increasing - #endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/vsimple.h b/vsimple.h deleted file mode 100644 index 1291424..0000000 --- a/vsimple.h +++ /dev/null @@ -1,47 +0,0 @@ -/** - Copyright (C) powturbo 2013-2019 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Integer Compression" variable simple "SimpleV" -// this belongs to the integer compression known as "simple family", like simple-9,simple-16 -// or simple-8b. SimpleV is compressing integers in groups into variable word size 32, 40 and 64 bits + RLE (run length encoding) -// SimpleV is faster than simple-16 and compress better than simple-16 or simple-8b. - -#ifdef __cplusplus -extern "C" { -#endif - -// vsencNN: compress array with n unsigned (NN bits in[n]) values to the buffer out. Return value = end of compressed output buffer out -unsigned char *vsenc8( unsigned char *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *vsenc16(unsigned short *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *vsenc32(unsigned *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *vsenc64(uint64_t *__restrict in, size_t n, unsigned char *__restrict out); - -// vsdecNN: decompress buffer into an array of n unsigned values. Return value = end of compressed input buffer in -unsigned char *vsdec8( unsigned char *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *vsdec16(unsigned char *__restrict in, size_t n, unsigned short *__restrict out); -unsigned char *vsdec32(unsigned char *__restrict in, size_t n, unsigned *__restrict out); -unsigned char *vsdec64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out); - -#ifdef __cplusplus -} -#endif