TurboPFor: Bit Packing
This commit is contained in:
142
bitpack.c
142
bitpack.c
@ -44,6 +44,7 @@
|
|||||||
#pragma clang diagnostic push
|
#pragma clang diagnostic push
|
||||||
#pragma clang diagnostic ignored "-Wunsequenced"
|
#pragma clang diagnostic ignored "-Wunsequenced"
|
||||||
|
|
||||||
|
#ifdef PLAIN //----------------------------------- Plain -----------------------------------------------------------------------
|
||||||
// Function-pointer types for the 8- and 16-bit element widths. Every type returns
// `unsigned char *` and takes (out, n, in); `out` is typed by element width and `in`
// is a const byte pointer. The _D8 variant additionally takes a `start` value of the
// element type.
typedef unsigned char *(*BITPACK_F8)( uint8_t *__restrict out, unsigned n, const unsigned char *__restrict in);
|
typedef unsigned char *(*BITPACK_F8)( uint8_t *__restrict out, unsigned n, const unsigned char *__restrict in);
|
||||||
typedef unsigned char *(*BITPACK_D8)( uint8_t *__restrict out, unsigned n, const unsigned char *__restrict in, uint8_t start);
|
typedef unsigned char *(*BITPACK_D8)( uint8_t *__restrict out, unsigned n, const unsigned char *__restrict in, uint8_t start);
|
||||||
typedef unsigned char *(*BITPACK_F16)(uint16_t *__restrict out, unsigned n, const unsigned char *__restrict in);
|
typedef unsigned char *(*BITPACK_F16)(uint16_t *__restrict out, unsigned n, const unsigned char *__restrict in);
|
||||||
@ -53,7 +54,6 @@ typedef unsigned char *(*BITPACK_D32)(uint32_t *__restrict out, unsigned n, cons
|
|||||||
// 64-bit function-pointer types: same (out, n, in) shape and `unsigned char *` return;
// the _D64 variant adds a uint64_t `start` argument.
typedef unsigned char *(*BITPACK_F64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in);
|
typedef unsigned char *(*BITPACK_F64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in);
|
||||||
typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in, uint64_t start);
|
typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in, uint64_t start);
|
||||||
|
|
||||||
|
|
||||||
#if 1 //def _MSC_VER
|
#if 1 //def _MSC_VER
|
||||||
#define VX (v=x)
|
#define VX (v=x)
|
||||||
#define V x
|
#define V x
|
||||||
@ -62,7 +62,6 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
|
|||||||
#define V v
|
#define V v
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if !defined(SSE2_ON) && !defined(AVX2_ON)
|
|
||||||
#if 0
|
#if 0
|
||||||
// Input-access helpers (this copy sits under `#if 0`, so it is compiled out).
// IP0 reads the element at _ip_ without advancing; IP reads it and post-increments _ip_.
// The second argument _x_ is ignored by both.
#define IP0(_ip_,_x_) *_ip_
|
#define IP0(_ip_,_x_) *_ip_
|
||||||
#define IP( _ip_,_x_) *_ip_++
|
#define IP( _ip_,_x_) *_ip_++
|
||||||
@ -227,7 +226,7 @@ size_t bitnfpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict
|
|||||||
// bitnfpack32 / bitnfpack64: the whole body is the BITNDPACK macro (defined outside
// this view) invoked with 128, the element width (32 or 64) and the name stems
// `bitf` / `bitfpacka`. `ip` and `start` are declared here with the element type,
// presumably for use by the macro expansion — confirm against BITNDPACK.
size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitf, bitfpacka); }
|
size_t bitnfpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitf, bitfpacka); }
|
||||||
size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitf, bitfpacka); }
|
size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitf, bitfpacka); }
|
||||||
|
|
||||||
#endif
|
#else //--------------------------------------- SIMD ----------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
#define _BITNPACKV(in, n, out, _csize_, _usize_, _bitpackv_) {\
|
#define _BITNPACKV(in, n, out, _csize_, _usize_, _bitpackv_) {\
|
||||||
unsigned char *op = out; TEMPLATE3(uint, _usize_, _t) _o,_x;\
|
unsigned char *op = out; TEMPLATE3(uint, _usize_, _t) _o,_x;\
|
||||||
@ -247,7 +246,71 @@ size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict
|
|||||||
return op - out;\
|
return op - out;\
|
||||||
}
|
}
|
||||||
|
|
||||||
#if (defined(__SSE2__) || defined(__ARM_NEON)) && defined(SSE2_ON)
|
#ifdef __AVX2__
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include "bitpack_.h"
|
||||||
|
|
||||||
|
// OPPE / IPPE expand to nothing in the AVX2 build.
// NOTE(review): presumably hooks referenced by the packing macros in bitpack_.h — confirm.
#define OPPE(__op)
|
||||||
|
#define IPPE(__op)
|
||||||
|
|
||||||
|
// Round __x up to a whole number of 8-unit groups: ceil(__x / 8) for non-negative
// integers. Used below as PAD8(256*b) to turn a bit count into a byte count.
#define PAD8(__x) (((__x)+8-1)/8)
|
||||||
|
// NOTE(review): OPPE / IPPE are already defined with the same empty body a few lines
// above in this section; this second pair is an identical (benign) redefinition.
#define OPPE(__op)
|
||||||
|
#define IPPE(__op)
|
||||||
|
|
||||||
|
// Hooks for the plain (no transform) packer: VI32 expands to nothing and IP32 loads
// the next 256-bit vector with an unaligned load, post-incrementing ip.
#define VI32(ip, i, iv, parm)
|
||||||
|
#define IP32(ip, i, iv) _mm256_loadu_si256(ip++)
|
||||||
|
|
||||||
|
// Packs via BITPACK256V32 (defined outside this view, presumably in bitpack_.h) with
// bit width b and a dummy parameter 0. Returns out + PAD8(256*b), i.e. out advanced
// by 32*b bytes. The parameter `n` is not referenced in this body.
unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; }
|
||||||
|
#undef VI32
|
||||||
|
#undef IP32
|
||||||
|
|
||||||
|
|
||||||
|
// Load the next 256-bit vector (unaligned, post-incrementing _ip_) and subtract the
// reference vector _sv_ lane-wise, storing the result in _iv_. Uses the macro's own
// _sv_ argument rather than capturing a caller variable that happens to be named `sv`.
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm256_sub_epi32(_mm256_loadu_si256(_ip_++),_sv_)
|
||||||
|
// IP32 simply yields the vector already prepared by VI32.
#define IP32(_ip_, i, _iv_) _iv_
|
||||||
|
#include "bitpack_.h"
|
||||||
|
// Packs values relative to a fixed reference: `start` is broadcast to all eight
// 32-bit lanes of sv, and the VI32 hook in effect subtracts that vector from every
// loaded input vector before BITPACK256V32 (defined outside this view) packs it
// with bit width b.
// Returns out + PAD8(256*b), i.e. out advanced by 32*b bytes.
// `n` is not referenced in this body; `v` is not used by the VI32/IP32 hooks shown here.
unsigned char *bitfpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
||||||
|
__m256i v, sv = _mm256_set1_epi32(start);
|
||||||
|
BITPACK256V32(in, b, out, sv);
|
||||||
|
return pout;
|
||||||
|
}
|
||||||
|
|
||||||
|
// VI32 for the "f1" variant: subtract the running reference vector _sv_ from the
// loaded vector, then advance _sv_ by cv (a variable expected in the caller's scope).
// NOTE(review): VI32/IP32 are redefined here with no #undef visible since their
// previous definition; unless bitpack_.h undefines them, compilers diagnose this.
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm256_sub_epi32(_mm256_loadu_si256(_ip_++),_sv_); _sv_ = _mm256_add_epi32(_sv_,cv);
|
||||||
|
#define IP32(ip, i, _iv_) _iv_
|
||||||
|
// Packs values relative to an increasing sequence: sv starts as
// start+1 (lowest lane) .. start+8 (highest lane) and grows by 8 per vector, so the
// i-th input element (0-based) has start+1+i subtracted before BITPACK256V32
// (defined outside this view) packs it with bit width b.
// Returns out + PAD8(256*b). `n` is not referenced in this body; `v` is not used by
// the VI32/IP32 hooks shown here.
unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
||||||
|
__m256i v, sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm256_set1_epi32(8);
|
||||||
|
BITPACK256V32(in, b, out, sv); return pout;
|
||||||
|
}
|
||||||
|
|
||||||
|
// VI32 for the delta variant: load the next vector into the caller's `v`, compute
// mm256_delta_epi32(v, _sv_) into _iv_, then remember v as the new _sv_.
// NOTE(review): mm256_delta_epi32 is defined outside this view; presumably the
// lane-wise difference to the preceding element — confirm.
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_delta_epi32(v,_sv_); _sv_ = v
|
||||||
|
#define IP32(ip, i, _iv_) _iv_
|
||||||
|
#include "bitpack_.h"
|
||||||
|
// Delta packer: sv is seeded with `start` in all eight lanes and carried from vector
// to vector by the VI32 hook; BITPACK256V32 (defined outside this view) packs the
// resulting vectors with bit width b.
// Returns out + PAD8(256*b), i.e. out advanced by 32*b bytes.
// `n` is not referenced in this body.
unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
||||||
|
__m256i v,sv = _mm256_set1_epi32(start);
|
||||||
|
BITPACK256V32(in, b, out, sv);
|
||||||
|
return pout;
|
||||||
|
}
|
||||||
|
|
||||||
|
// VI32 for the "d1" variant: same as the delta hook, but cv (caller-scope variable)
// is additionally subtracted from every mm256_delta_epi32 result.
// NOTE(review): redefines VI32 with no #undef visible since the previous definition.
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = _mm256_sub_epi32(mm256_delta_epi32(v,_sv_),cv); _sv_ = v
|
||||||
|
// Delta-minus-one packer: sv is seeded with `start` in all lanes, cv holds 1 in all
// lanes, so each delta produced by the VI32 hook is reduced by 1 before
// BITPACK256V32 (defined outside this view) packs it with bit width b.
// Returns out + PAD8(256*b). `n` is not referenced in this body.
unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
||||||
|
__m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
|
||||||
|
BITPACK256V32(in, b, out, sv);
|
||||||
|
return pout;
|
||||||
|
}
|
||||||
|
|
||||||
|
// VI32 for the "z" variant: compute the delta as in the delta hook, carry v forward
// in _sv_, then pass the delta through mm256_zzage_epi32.
// NOTE(review): mm256_zzage_epi32 is defined outside this view; presumably a zigzag
// encoding of signed deltas — confirm.
// NOTE(review): redefines VI32 with no #undef visible since the previous definition.
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_delta_epi32(v,_sv_); _sv_ = v; _iv_ = mm256_zzage_epi32(_iv_)
|
||||||
|
// Packer for transformed deltas: sv is seeded with `start` in all lanes; the VI32
// hook produces the values that BITPACK256V32 (defined outside this view) packs with
// bit width b. Returns out + PAD8(256*b).
// `n` is not referenced in this body. `cv` is not referenced by the VI32 text shown
// above — NOTE(review): likely unused unless a helper macro expands to it.
unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
||||||
|
__m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
|
||||||
|
BITPACK256V32(in, b, out, sv);
|
||||||
|
return pout;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Whole-buffer entry points for the 256-bit / 32-bit-element packers. Each body is a
// single macro expansion; the local `ip` (and `start`) are declared for that expansion.
// bitnpack256v32: _BITNPACKV with _csize_=256, _usize_=32 and the name stem
// bitpack256v. The visible part of that macro sets op = out and returns op - out.
size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; _BITNPACKV( in, n, out, 256, 32, bitpack256v); }
|
||||||
|
// The four variants below use _BITNDPACKV, which is defined outside this view; each
// passes 256, 32 and the name stems of its transform (bitd / bitd1 / bitz / bitf)
// together with the matching 256v and scalar packer stems.
size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd, bitdpack256v, bitd, bitdpack); }
|
||||||
|
size_t bitnd1pack256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd1, bitd1pack256v,bitd1, bitd1pack); }
|
||||||
|
size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitz, bitzpack256v, bitz, bitzpack); }
|
||||||
|
size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitf, bitfpack); }
|
||||||
|
|
||||||
|
#elif defined(__SSE2__) || defined(__ARM_NEON) //----------------------------- SSE ---------------------------------------------------------------
|
||||||
// OPPE / IPPE expand to nothing in the SSE / NEON section.
// NOTE(review): presumably hooks referenced by the packing macros in bitpack_.h — confirm.
#define OPPE(__op)
|
#define OPPE(__op)
|
||||||
#define IPPE(__op)
|
#define IPPE(__op)
|
||||||
|
|
||||||
@ -318,7 +381,6 @@ unsigned char *bits1pack128v32(unsigned *__restrict in, unsigned n, unsign
|
|||||||
__m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv); return pout;
|
__m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv); return pout;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// 128-bit hooks for the running-reference variant: load the next vector (unaligned,
// post-incrementing _ip_), subtract _sv_ lane-wise into _iv_, then advance _sv_ by
// cv (a variable expected in the caller's scope). VI16 works on 16-bit lanes,
// VI32 on 32-bit lanes. IP16 just yields the prepared vector.
#define VI16(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi16(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi16(_sv_,cv);
|
#define VI16(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi16(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi16(_sv_,cv);
|
||||||
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi32(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi32(_sv_,cv);
|
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi32(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi32(_sv_,cv);
|
||||||
#define IP16(ip, i, _iv_) _iv_
|
#define IP16(ip, i, _iv_) _iv_
|
||||||
@ -358,73 +420,7 @@ size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__rest
|
|||||||
|
|
||||||
// Whole-buffer entry points for the 128-bit "f" packers (16- and 32-bit elements).
// Each body is one _BITNDPACKV expansion (macro defined outside this view) invoked
// with 128, the element width and the bitf / bitfpack128v / bitfpack name stems;
// `ip` and `start` are declared for that expansion.
size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitf, bitfpack128v, bitf, bitfpack); }
|
size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitf, bitfpack128v, bitf, bitfpack); }
|
||||||
size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitf, bitfpack128v, bitf, bitfpack); }
|
size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitf, bitfpack128v, bitf, bitfpack); }
|
||||||
#endif
|
#endif // SSE
|
||||||
|
#endif // Plain
|
||||||
#if defined(__AVX2__) && defined(AVX2_ON)
|
|
||||||
#include <immintrin.h>
|
|
||||||
#include "bitpack_.h"
|
|
||||||
|
|
||||||
// OPPE / IPPE expand to nothing in the AVX2 build.
// NOTE(review): presumably hooks referenced by the packing macros in bitpack_.h — confirm.
#define OPPE(__op)
|
|
||||||
#define IPPE(__op)
|
|
||||||
|
|
||||||
// Round __x up to a whole number of 8-unit groups: ceil(__x / 8) for non-negative
// integers. Used below as PAD8(256*b) to turn a bit count into a byte count.
#define PAD8(__x) (((__x)+8-1)/8)
|
|
||||||
// NOTE(review): OPPE / IPPE are already defined with the same empty body a few lines
// above in this section; this second pair is an identical (benign) redefinition.
#define OPPE(__op)
|
|
||||||
#define IPPE(__op)
|
|
||||||
|
|
||||||
// Hooks for the plain (no transform) packer: VI32 expands to nothing and IP32 loads
// the next 256-bit vector with an unaligned load, post-incrementing ip.
#define VI32(ip, i, iv, parm)
|
|
||||||
#define IP32(ip, i, iv) _mm256_loadu_si256(ip++)
|
|
||||||
|
|
||||||
// Packs via BITPACK256V32 (defined outside this view, presumably in bitpack_.h) with
// bit width b and a dummy parameter 0. Returns out + PAD8(256*b), i.e. out advanced
// by 32*b bytes. The parameter `n` is not referenced in this body.
unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; }
|
|
||||||
#undef VI32
|
|
||||||
#undef IP32
|
|
||||||
|
|
||||||
|
|
||||||
// Load the next 256-bit vector (unaligned, post-incrementing _ip_) and subtract the
// reference vector _sv_ lane-wise, storing the result in _iv_. Uses the macro's own
// _sv_ argument rather than capturing a caller variable that happens to be named `sv`.
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm256_sub_epi32(_mm256_loadu_si256(_ip_++),_sv_)
|
|
||||||
// IP32 simply yields the vector already prepared by VI32.
#define IP32(_ip_, i, _iv_) _iv_
|
|
||||||
#include "bitpack_.h"
|
|
||||||
// Packs values relative to a fixed reference: `start` is broadcast to all eight
// 32-bit lanes of sv, and the VI32 hook in effect subtracts that vector from every
// loaded input vector before BITPACK256V32 (defined outside this view) packs it
// with bit width b.
// Returns out + PAD8(256*b), i.e. out advanced by 32*b bytes.
// `n` is not referenced in this body; `v` is not used by the VI32/IP32 hooks shown here.
unsigned char *bitfpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
|
||||||
__m256i v, sv = _mm256_set1_epi32(start);
|
|
||||||
BITPACK256V32(in, b, out, sv);
|
|
||||||
return pout;
|
|
||||||
}
|
|
||||||
|
|
||||||
// VI32 for the "f1" variant: subtract the running reference vector _sv_ from the
// loaded vector, then advance _sv_ by cv (a variable expected in the caller's scope).
// NOTE(review): VI32/IP32 are redefined here with no #undef visible since their
// previous definition; unless bitpack_.h undefines them, compilers diagnose this.
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm256_sub_epi32(_mm256_loadu_si256(_ip_++),_sv_); _sv_ = _mm256_add_epi32(_sv_,cv);
|
|
||||||
#define IP32(ip, i, _iv_) _iv_
|
|
||||||
// Packs values relative to an increasing sequence: sv starts as
// start+1 (lowest lane) .. start+8 (highest lane) and grows by 8 per vector, so the
// i-th input element (0-based) has start+1+i subtracted before BITPACK256V32
// (defined outside this view) packs it with bit width b.
// Returns out + PAD8(256*b). `n` is not referenced in this body; `v` is not used by
// the VI32/IP32 hooks shown here.
unsigned char *bitf1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
|
||||||
__m256i v, sv = _mm256_set_epi32(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm256_set1_epi32(8);
|
|
||||||
BITPACK256V32(in, b, out, sv); return pout;
|
|
||||||
}
|
|
||||||
|
|
||||||
// VI32 for the delta variant: load the next vector into the caller's `v`, compute
// mm256_delta_epi32(v, _sv_) into _iv_, then remember v as the new _sv_.
// NOTE(review): mm256_delta_epi32 is defined outside this view; presumably the
// lane-wise difference to the preceding element — confirm.
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_delta_epi32(v,_sv_); _sv_ = v
|
|
||||||
#define IP32(ip, i, _iv_) _iv_
|
|
||||||
#include "bitpack_.h"
|
|
||||||
// Delta packer: sv is seeded with `start` in all eight lanes and carried from vector
// to vector by the VI32 hook; BITPACK256V32 (defined outside this view) packs the
// resulting vectors with bit width b.
// Returns out + PAD8(256*b), i.e. out advanced by 32*b bytes.
// `n` is not referenced in this body.
unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
|
||||||
__m256i v,sv = _mm256_set1_epi32(start);
|
|
||||||
BITPACK256V32(in, b, out, sv);
|
|
||||||
return pout;
|
|
||||||
}
|
|
||||||
|
|
||||||
// VI32 for the "d1" variant: same as the delta hook, but cv (caller-scope variable)
// is additionally subtracted from every mm256_delta_epi32 result.
// NOTE(review): redefines VI32 with no #undef visible since the previous definition.
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = _mm256_sub_epi32(mm256_delta_epi32(v,_sv_),cv); _sv_ = v
|
|
||||||
// Delta-minus-one packer: sv is seeded with `start` in all lanes, cv holds 1 in all
// lanes, so each delta produced by the VI32 hook is reduced by 1 before
// BITPACK256V32 (defined outside this view) packs it with bit width b.
// Returns out + PAD8(256*b). `n` is not referenced in this body.
unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
|
||||||
__m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
|
|
||||||
BITPACK256V32(in, b, out, sv);
|
|
||||||
return pout;
|
|
||||||
}
|
|
||||||
|
|
||||||
// VI32 for the "z" variant: compute the delta as in the delta hook, carry v forward
// in _sv_, then pass the delta through mm256_zzage_epi32.
// NOTE(review): mm256_zzage_epi32 is defined outside this view; presumably a zigzag
// encoding of signed deltas — confirm.
// NOTE(review): redefines VI32 with no #undef visible since the previous definition.
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm256_loadu_si256(_ip_++); _iv_ = mm256_delta_epi32(v,_sv_); _sv_ = v; _iv_ = mm256_zzage_epi32(_iv_)
|
|
||||||
// Packer for transformed deltas: sv is seeded with `start` in all lanes; the VI32
// hook produces the values that BITPACK256V32 (defined outside this view) packs with
// bit width b. Returns out + PAD8(256*b).
// `n` is not referenced in this body. `cv` is not referenced by the VI32 text shown
// above — NOTE(review): likely unused unless a helper macro expands to it.
unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
|
|
||||||
__m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
|
|
||||||
BITPACK256V32(in, b, out, sv);
|
|
||||||
return pout;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Whole-buffer entry points for the 256-bit / 32-bit-element packers. Each body is a
// single macro expansion; the local `ip` (and `start`) are declared for that expansion.
// bitnpack256v32: _BITNPACKV with _csize_=256, _usize_=32 and the name stem
// bitpack256v. The visible part of that macro sets op = out and returns op - out.
size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; _BITNPACKV( in, n, out, 256, 32, bitpack256v); }
|
|
||||||
// The four variants below use _BITNDPACKV, which is defined outside this view; each
// passes 256, 32 and the name stems of its transform (bitd / bitd1 / bitz / bitf)
// together with the matching 256v and scalar packer stems.
size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd, bitdpack256v, bitd, bitdpack); }
|
|
||||||
size_t bitnd1pack256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd1, bitd1pack256v,bitd1, bitd1pack); }
|
|
||||||
size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitz, bitzpack256v, bitz, bitzpack); }
|
|
||||||
size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitf, bitfpack); }
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#pragma clang diagnostic pop
|
#pragma clang diagnostic pop
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user