TurboPFor: Turbobyte SIMD

This commit is contained in:
x
2023-03-23 11:29:37 +01:00
parent c8b4787828
commit d66ea5f61c

@@ -32,13 +32,10 @@
 #include "include_/bitutil_.h"
 #define V8PAYLOAD(_n_, _usize_) (((_n_)*(_usize_/16)+7)/8)
-#define V8BOUND_(_n_, _usize_) (V8PAYLOAD(_n_, _usize_)+ (_n_)*(_usize_/8))
-#define V8BOUND16(_n_) V8BOUND_(_n_, 16)
-#define V8BOUND32(_n_) V8BOUND_(_n_, 32)
-size_t v8bound16(const uint16_t *in, size_t n) { return V8BOUND16(n); }
-size_t v8bound32(const uint32_t *in, size_t n) { return V8BOUND32(n); }
+#define V8BOUND(_n_, _usize_) (V8PAYLOAD(_n_, _usize_)+ (_n_)*(_usize_/8))
+size_t v8bound16(size_t n) { return V8BOUND(n, 16); }
+size_t v8bound32(size_t n) { return V8BOUND(n, 32); }
 size_t v8len16(const uint16_t *in, size_t n) {
   size_t c = 0;
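Note on the hunk above: V8BOUND16/V8BOUND32 are folded into a single V8BOUND(_n_, _usize_) macro, and the unused input pointer is dropped from v8bound16/v8bound32, so the worst-case encoded size now depends only on the element count. A minimal standalone sketch of the arithmetic, outside the diff (macro and function names are taken from the new code; the includes are assumptions):

#include <stddef.h>
#include <stdint.h>

/* _usize_/16 bits per value, rounded up to whole bytes */
#define V8PAYLOAD(_n_, _usize_) (((_n_)*(_usize_/16)+7)/8)
/* plus every value stored at full width: _n_ * _usize_/8 bytes */
#define V8BOUND(_n_, _usize_)   (V8PAYLOAD(_n_, _usize_) + (_n_)*(_usize_/8))

size_t v8bound32(size_t n) { return V8BOUND(n, 32); }
/* example: v8bound32(1000) = (1000*2+7)/8 + 1000*4 = 250 + 4000 = 4250 bytes */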
@@ -972,7 +969,7 @@ unsigned char *T2(V8ENC,32)(uint32_t *__restrict in, unsigned n, unsigned char *
   #ifdef __AVX2__ // slightly faster than SSE ----------------------------------------------------------------------------------------------
 VEINI256v32; const __m256i cv1_8 = _mm256_set1_epi8(1), cv7f00 = _mm256_set1_epi16(0x7F00), zv = _mm256_setzero_si256();
-  for(ip = in; ip != in+(n&~(32-1)); ip += 32) { //PREFETCH(ip+512,0);
+  for(ip = in; ip != in+(n&~(32-1)); ip += 32) { //PREFETCH(ip+384,0);
     __m256i iv0 = _mm256_loadu_si256(ip   ),
             iv1 = _mm256_loadu_si256(ip+ 8); VE256v32(iv0,sv); VE256v32(iv1,sv);
     __m256i iv2 = _mm256_loadu_si256(ip+16),
@@ -1050,10 +1047,10 @@ unsigned char *T2(V8ENC,32)(uint32_t *__restrict in, unsigned n, unsigned char *
     _mm_storeu_si128((__m128i *)op, ov1); op += LEN32(m2,1);
     _mm_storeu_si128((__m128i *)op, ov2); op += LEN32(m3,0);
     _mm_storeu_si128((__m128i *)op, ov3); op += LEN32(m3,1);
-  } //PREFETCH(ip+512,0);
+  } //PREFETCH(ip+384,0);
   }
   #else //------------------------------ scalar ----------------------------------------------
-  for(ip = in; ip != in+(n&~(32-1)); ip += 32) { PNEXTA(out,op,8); VLE4( 0); VLE4( 4); VLE4( 8); VLE4(12); VLE4(16); VLE4(20); VLE4(24); VLE4(28); PREFETCH(ip+512,0); }
+  for(ip = in; ip != in+(n&~(32-1)); ip += 32) { PNEXTA(out,op,8); VLE4( 0); VLE4( 4); VLE4( 8); VLE4(12); VLE4(16); VLE4(20); VLE4(24); VLE4(28); /*PREFETCH(ip+512,0);*/ }
   #endif
   for( ; ip != in+(n&~(4-1)); ip += 4) { PNEXTA(out,op,1); VLE4(0); }
   if(ip != in+n) { uint32_t *sp = ip; for(*OP=0,PNEXTA(out,op,1); ip != in+n; ip++ ) VLE1(out[0]); }
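For orientation (commentary, not part of the diff): the V8ENC32 loops above encode blocks of 32 values, with PNEXTA apparently reserving the per-block control bytes, VLE4/VE256v32 emitting the variable-length payload, and LEN32 mapping control bits to payload lengths. The exact TurboByte wire format is defined elsewhere in this file; purely as an illustration of the general technique, here is a generic streamvbyte-style group encoder for 4 uint32 values (one control byte carrying four 2-bit length codes), which is not the project's format:

#include <stdint.h>
#include <string.h>

/* Illustration only, NOT the TurboByte layout: code c for value i in the
   control byte means that value is stored in c+1 little-endian payload bytes. */
static uint8_t *enc_group4(const uint32_t v[4], uint8_t *ctrl, uint8_t *payload) {
  uint8_t c = 0;
  for (int i = 0; i < 4; i++) {
    unsigned len = v[i] > 0xFFFFFF ? 4 : v[i] > 0xFFFF ? 3 : v[i] > 0xFF ? 2 : 1;
    memcpy(payload, &v[i], len);         /* assumes a little-endian target */
    payload += len;
    c |= (uint8_t)(len - 1) << (2 * i);  /* 2-bit length code per value */
  }
  *ctrl = c;
  return payload;                        /* next free payload byte */
}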
@@ -1075,7 +1072,7 @@ unsigned char *T2(V8DEC,32)(unsigned char *__restrict in, unsigned n, uint32_t
   #ifdef __AVX2__ //slightly faster than SSE ------------------------------------------------------------------------------------------
 VDINI256v32;
   uint64_t mx = ctou64(IP);
-  for(; op != out+(n&~(32-1)); op += 32) { //PREFETCH(ip+512,0);
+  for(; op != out+(n&~(32-1)); op += 32) { PREFETCH(ip+384,0);
     uint64_t m = mx; mx = ctou64(IP+=8);
     {__m256i ov0 = _mm256_castsi128_si256( _mm_loadu_si128(ip)); ip += LEN32(m,0);
              ov0 = _mm256_inserti128_si256(ov0, _mm_loadu_si128(ip),1); ip += LEN32(m,1);
@@ -1136,7 +1133,7 @@ unsigned char *T2(V8DEC,32)(unsigned char *__restrict in, unsigned n, uint32_t
     __m128i sv0 = SVD32((uint8_t )m,0);
     __m128i sv1 = SVD32((uint16_t)m,1); m>>=16;
     __m128i ov2 = _mm_loadu_si128(ip); ip += LEN32((uint8_t )m,0);
-    __m128i ov3 = _mm_loadu_si128(ip); ip += LEN32((uint16_t)m,1); //PREFETCH(ip+256,0);
+    __m128i ov3 = _mm_loadu_si128(ip); ip += LEN32((uint16_t)m,1);
     __m128i sv2 = SVD32(m,0);
     __m128i sv3 = SVD32(m,1);
@@ -1148,13 +1145,13 @@ unsigned char *T2(V8DEC,32)(unsigned char *__restrict in, unsigned n, uint32_t
     VD128v32(ov0,sv); _mm_storeu_si128(op+16, ov0);
     VD128v32(ov1,sv); _mm_storeu_si128(op+20, ov1);
     VD128v32(ov2,sv); _mm_storeu_si128(op+24, ov2);
-    VD128v32(ov3,sv); _mm_storeu_si128(op+28, ov3);
+    VD128v32(ov3,sv); _mm_storeu_si128(op+28, ov3); PREFETCH(ip+384,0);
   }
   }
   }
   #elif defined(__ARM_NEON) || defined(__SSSE3__) // optimzed for ARM ----------------------------------------------------------
 VDINI128v32;
-  for(; op != out+(n&~(32-1)); op += 32) { PREFETCH(ip+256,0);
+  for(; op != out+(n&~(32-1)); op += 32) { //PREFETCH(ip+384,0);
     uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4);
     __m128i ov0 = _mm_loadu_si128(ip+IPINC); ip += LEN32(m0,0)+IPINC;
     __m128i fv0 = SVD32(m0,0);
@@ -1196,8 +1193,7 @@ unsigned char *T2(V8DEC,32)(unsigned char *__restrict in, unsigned n, uint32_t
   }
   #else //----------------------------- scalar -----------------------------------------------
   for(; op != out+(n&~(32-1)); op += 32) { in = ip; ip+=8;
-    VLD4( 0); VLD4( 4); VLD4( 8); VLD4(12); VLD4(16); VLD4(20); VLD4(24); VLD4(28);
-    PREFETCH(ip+512,0);
+    VLD4( 0); VLD4( 4); VLD4( 8); VLD4(12); VLD4(16); VLD4(20); VLD4(24); VLD4(28); //PREFETCH(ip+512,0);
   }
   #endif
   uint32_t m; for(; op != out+(n&~(4-1)); op += 4) { PNEXTA(in,ip,1); VLD4( 0); }
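Most of the remaining hunks only retune software prefetching: the AVX2 32-bit decode loop now prefetches 384 bytes ahead of the input stream, while the SSSE3/NEON and scalar 32-bit paths have their PREFETCH calls commented out. As a hedged sketch of what such a macro typically expands to (the project's real definition lives in its own headers and may differ):

/* hypothetical definition for illustration; _rw_ is 0 for read, 1 for write */
#if defined(__GNUC__) || defined(__clang__)
  #define PREFETCH(_p_, _rw_) __builtin_prefetch((const void *)(_p_), _rw_)
#else
  #define PREFETCH(_p_, _rw_)   /* no-op elsewhere */
#endif

The move from 512/256 to 384 bytes reads as an empirical tuning of the prefetch distance; prefetch hints carry no functional change.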
@@ -1297,7 +1293,7 @@ unsigned char *T2(V8DEC,16)(unsigned char *__restrict in, unsigned n, uint16_t
   #if defined(__SSSE3__) || defined(__ARM_NEON)//-----------------------
 VDINI128v16;
-  for(op = out; op != out+(n&~(64-1)); op += 64) { //PREFETCH(ip+512,0);
+  for(op = out; op != out+(n&~(64-1)); op += 64) { PREFETCH(ip+512,0);
     uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4);
     __m128i ov0 = _mm_shuffle_epi8(_mm_loadu_si128(ip+IPINC), SVD16(m0,0)); ip += LEN16(m0,0)+IPINC;
     __m128i ov1 = _mm_shuffle_epi8(_mm_loadu_si128(ip), SVD16(m0,1)); ip += LEN16(m0,1);