From c5e6b726a57977a87d143c4336f8593ac7b909ed Mon Sep 17 00:00:00 2001 From: x Date: Sat, 21 Dec 2019 14:06:33 +0100 Subject: [PATCH] TurboPFor: TurboPFor decode --- vp4d.c | 296 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 148 insertions(+), 148 deletions(-) diff --git a/vp4d.c b/vp4d.c index 626f16a..2a00b56 100644 --- a/vp4d.c +++ b/vp4d.c @@ -1,7 +1,7 @@ /** Copyright (C) powturbo 2013-2019 GPL v2 License - + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -21,12 +21,12 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// "Integer Compression" TurboPFor - Pfor/PforDelta +// "Integer Compression" TurboPFor - Pfor/PforDelta -#ifndef USIZE -#pragma warning( disable : 4005) -#pragma warning( disable : 4090) -#pragma warning( disable : 4068) +#ifndef USIZE +#pragma warning( disable : 4005) +#pragma warning( disable : 4090) +#pragma warning( disable : 4068) #define VINT_IN #define BITPACK_DAC @@ -44,13 +44,13 @@ #define P4DELTA_(a) #if defined(__SSSE3__) || defined(__ARM_NEON) extern char _shuffle_32[16][16]; // defined in bitunpack.c -extern char _shuffle_16[256][16]; +extern char _shuffle_16[256][16]; #endif - + #ifdef __AVX2__ #define VSIZE 256 -#define P4DELTA(a) -#define P4DELTA_(a) +#define P4DELTA(a) +#define P4DELTA_(a) #define _P4DEC _p4dec256v #define P4DEC p4dec256v @@ -95,7 +95,7 @@ extern char _shuffle_16[256][16]; #include "vp4d.c" #undef BITUNDD - #elif !defined(SSE2_ON) && !defined(AVX2_ON) + #elif !defined(SSE2_ON) && !defined(AVX2_ON) #define _P4DEC _p4dec #define P4DEC p4dec @@ -121,7 +121,7 @@ extern char _shuffle_16[256][16]; #define P4DELTA_(a) a #define DELTA -#define _P4DEC _p4ddec //delta0 +#define _P4DEC _p4ddec //delta0 #define P4DEC p4ddec #define P4NDEC p4nddec #define P4NDECS p4ddec @@ -153,7 +153,7 @@ extern char _shuffle_16[256][16]; #define USIZE 64 #include "vp4d.c" -#define _P4DEC _p4zdec //zigzag0 +#define _P4DEC _p4zdec //zigzag0 #define P4DEC p4zdec #define P4NDEC p4nzdec #define P4NDECS p4zdec @@ -168,7 +168,7 @@ extern char _shuffle_16[256][16]; #include "vp4d.c" #define USIZE 64 #include "vp4d.c" - + #undef _P4DEC #undef P4DEC #undef BITUNPACK @@ -242,8 +242,8 @@ extern char _shuffle_16[256][16]; #define VSIZE 256 -#define P4DELTA(a) -#define P4DELTA_(a) +#define P4DELTA(a) +#define P4DELTA_(a) #undef DELTA #define _P4DEC _p4dec256w @@ -256,8 +256,8 @@ extern char _shuffle_16[256][16]; #include "vp4d.c" #endif #undef DELTA - -#else + +#else #define uint_t TEMPLATE3(uint, USIZE, _t) #pragma GCC push_options @@ -265,146 +265,146 @@ extern char _shuffle_16[256][16]; ALWAYS_INLINE unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start), unsigned b, unsigned bx ) { if(!(b & 0x80)) { - #if USIZE == 64 + #if USIZE == 64 b = (b == 63)?64:b; #endif return TEMPLATE2(BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b); } b &= 0x7f; #if VSIZE >= 128 - #if USIZE == 64 + #if USIZE == 64 if(b+bx <= 32) - #endif - { unsigned char *pb = in; - #if VSIZE == 128 - #if USIZE == 64 - uint32_t ex[P4D_MAX+64]; - in = TEMPLATE2(bitunpack, 32)(in+16, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)), ex, bx); - #else - uint_t ex[P4D_MAX+64]; - in = TEMPLATE2(bitunpack, USIZE)(in+16, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)), ex, bx); - #endif - #else - uint_t ex[P4D_MAX+64]; - in = TEMPLATE2(bitunpack, USIZE)(in+32, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)) + popcnt64(ctou64(in+16)) + popcnt64(ctou64(in+24)), ex, bx); #endif - return TEMPLATE2(_BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b, ex, pb); + { unsigned char *pb = in; + #if VSIZE == 128 + #if USIZE == 64 + uint32_t ex[P4D_MAX+64]; + in = TEMPLATE2(bitunpack, 32)(in+16, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)), ex, bx); + #else + uint_t ex[P4D_MAX+64]; + in = TEMPLATE2(bitunpack, USIZE)(in+16, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)), ex, bx); + #endif + #else + uint_t ex[P4D_MAX+64]; + in = TEMPLATE2(bitunpack, USIZE)(in+32, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)) + popcnt64(ctou64(in+16)) + popcnt64(ctou64(in+24)), ex, bx); + #endif + return TEMPLATE2(_BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b, ex, pb); } #endif #if VSIZE < 128 || USIZE == 64 { uint_t ex[P4D_MAX+64]; - unsigned long long bb[P4D_MAX/64]; + unsigned long long bb[P4D_MAX/64]; unsigned num=0,i,p4dn = (n+63)/64; for(i = 0; i < n/64; i++) { bb[i] = ctou64(in+i*8); num += popcnt64(bb[i]); } if(n & 0x3f) { bb[i] = ctou64(in+i*8) & ((1ull<<(n&0x3f))-1); num += popcnt64(bb[i]); } in = TEMPLATE2(bitunpack, USIZE)(in+PAD8(n), num, ex, bx); in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b); #if 0 //defined(AVX_2__) - { uint_t *op,*pex = ex; + { uint_t *op,*pex = ex; for(i = 0; i < p4dn; i++) { - for(op = out; bb[i]; bb[i] >>= 8,op += 8) { unsigned m = (unsigned char)bb[i], mc=popcnt32(m), s = pex[mc]; pex[mc]=0; + for(op = out; bb[i]; bb[i] >>= 8,op += 8) { unsigned m = (unsigned char)bb[i], mc=popcnt32(m), s = pex[mc]; pex[mc]=0; _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), mm256_maskz_expand_epi32(m,_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b)))); pex += mc; *pex=s; - } //out += 64; + } //out += 64; } - } + } #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && USIZE == 32 - { uint_t *_op=out,*op,*pex = ex; + { uint_t *_op=out,*op,*pex = ex; for(i = 0; i < p4dn; i++) { - for(op=_op; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf; + for(op=_op; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf; _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)_shuffle_32[m]) ) )); pex += popcnt32(m); } _op+=64; } - } + } #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && USIZE == 16 - { uint_t *_op = out, *op, *pex = ex; + { uint_t *_op = out, *op, *pex = ex; for(i = 0; i < p4dn; i++) { - for(op = _op; bb[i]; bb[i] >>= 8,op += 8) { const unsigned char m = bb[i]; + for(op = _op; bb[i]; bb[i] >>= 8,op += 8) { const unsigned char m = bb[i]; _mm_storeu_si128((__m128i *)op, _mm_add_epi16(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)_shuffle_16[m]) ) )); pex += popcnt32(m); } _op += 64; } - } + } #else - { unsigned k = 0; + { unsigned k = 0; uint_t *op,*p; for(op=out,i = 0; i < p4dn; i++,op += 64) { unsigned long long u = bb[i]; - #if 1 // faster - while(u) op[ctz64(u)] += ex[k++]<>= x; p+=x; - switch(u&0xf) { - case 0: break; //0000 - case 1: OP(0); break; //0001 - case 2: OP(1); break; //0010 - case 3: OP(1); OP(0); break; //0011 - case 4: OP(2); break; //0100 - case 5: OP(2); OP(0); break; //0101 - case 6: OP(2); OP(1); break; //0110 - case 7: OP(2); OP(1); OP(0); break; //0111 - case 8: OP(3); break; //1000 - case 9: OP(3); OP(0); break; //1001 - case 10: OP(3); OP(1); break; //1010 - case 11: OP(3); OP(1); OP(0); break; //1011 - case 12: OP(3); OP(2); break; //1100 - case 13: OP(3); OP(2); OP(0); break; //1101 - case 14: OP(3); OP(2); OP(1); break; //1110 - case 15: OP(3); OP(2); OP(1); OP(0); break; //1111 - } - u >>= 4; - p += 4; - } + #if 1 // faster + while(u) op[ctz64(u)] += ex[k++]<>= x; p+=x; + switch(u&0xf) { + case 0: break; //0000 + case 1: OP(0); break; //0001 + case 2: OP(1); break; //0010 + case 3: OP(1); OP(0); break; //0011 + case 4: OP(2); break; //0100 + case 5: OP(2); OP(0); break; //0101 + case 6: OP(2); OP(1); break; //0110 + case 7: OP(2); OP(1); OP(0); break; //0111 + case 8: OP(3); break; //1000 + case 9: OP(3); OP(0); break; //1001 + case 10: OP(3); OP(1); break; //1010 + case 11: OP(3); OP(1); OP(0); break; //1011 + case 12: OP(3); OP(2); break; //1100 + case 13: OP(3); OP(2); OP(0); break; //1101 + case 14: OP(3); OP(2); OP(1); break; //1110 + case 15: OP(3); OP(2); OP(1); OP(0); break; //1111 + } + u >>= 4; + p += 4; + } #endif - } + } } #endif } #ifdef BITUNDD TEMPLATE2(BITUNDD, USIZE)(out, n, start); - #endif + #endif return in; #endif } -unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) { - unsigned b, bx = 0, i; +unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) { + unsigned b, bx = 0, i; if(!n) return in; b = *in++; if((b & 0xc0) == 0xc0) { // all items are equal - b &= 0x3f; - #if USIZE == 64 + b &= 0x3f; + #if USIZE == 64 b = b == 63?64:b; - #endif - uint_t u = TEMPLATE2(ctou, USIZE)(in); if(b < USIZE) u = TEMPLATE2(BZHI, USIZE)(u,b); - for(i=0; i < n; i++) out[i] = u; + #endif + uint_t u = TEMPLATE2(ctou, USIZE)(in); if(b < USIZE) u = TEMPLATE2(BZHI, USIZE)(u,b); + for(i=0; i < n; i++) out[i] = u; #ifdef BITUNDD TEMPLATE2(BITUNDD, USIZE)(out, n, start); #endif - return in+(b+7)/8; - } else if(likely(!(b & 0x40))) { // PFOR + return in+(b+7)/8; + } else if(likely(!(b & 0x40))) { // PFOR if(b & 0x80) - bx = *in++; + bx = *in++; return TEMPLATE2(_P4DEC, USIZE)(in, n, out P4DELTA(start), b, bx); } else { // Variable byte #if USIZE > 8 - uint_t ex[P4D_MAX+64],*pex=ex; - b &= 0x3f; - bx = *in++; /*if(b && *in++ == 0x80) { uint_t u = TEMPLATE2(ctou, USIZE)(in); if(b < USIZE) u = TEMPLATE2(BZHI, USIZE)(u,b); int i; for(i = 0; i < n; i++) out[i] = u; in+=(b+7)/8; } else*/ - in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b); + uint_t ex[P4D_MAX+64],*pex=ex; + b &= 0x3f; + bx = *in++; /*if(b && *in++ == 0x80) { uint_t u = TEMPLATE2(ctou, USIZE)(in); if(b < USIZE) u = TEMPLATE2(BZHI, USIZE)(u,b); int i; for(i = 0; i < n; i++) out[i] = u; in+=(b+7)/8; } else*/ + in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b); in = TEMPLATE2(vbdec, USIZE)(in, bx, ex); - #define ST(j) out[in[i+j]] |= ex[i+j] << b - //#define ST(j) out[*ip++] |= (*pex++) << b; - //unsigned char *ip;for(ip = in; ip != in + (bx & ~3); ) - for(i = 0; i != (bx & ~7); i+=8) - { ST(0);ST(1);ST(2);ST(3); ST(4);ST(5);ST(6);ST(7); } - //for(;ip != in+bx; ) ST(0); - for(;i != bx; i++) ST(0); + #define ST(j) out[in[i+j]] |= ex[i+j] << b + //#define ST(j) out[*ip++] |= (*pex++) << b; + //unsigned char *ip;for(ip = in; ip != in + (bx & ~3); ) + for(i = 0; i != (bx & ~7); i+=8) + { ST(0);ST(1);ST(2);ST(3); ST(4);ST(5);ST(6);ST(7); } + //for(;ip != in+bx; ) ST(0); + for(;i != bx; i++) ST(0); in += bx; #ifdef BITUNDD TEMPLATE2(BITUNDD, USIZE)(out, n, start); - #endif + #endif #undef ST #endif // USIZE > 8 return in; @@ -417,80 +417,80 @@ unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, #define CSIZE 128 #endif -size_t TEMPLATE2(P4NDEC, USIZE)(unsigned char *__restrict in, size_t n, uint_t *__restrict out) { +size_t TEMPLATE2(P4NDEC, USIZE)(unsigned char *__restrict in, size_t n, uint_t *__restrict out) { unsigned char *ip = in; uint_t *op; if(!n) return 0; { #ifdef DELTA - uint_t start; - TEMPLATE2(vbxget, USIZE)(ip, start); + uint_t start; + TEMPLATE2(vbxget, USIZE)(ip, start); *out++ = start; --n; #endif - for(op = out; op != out+(n&~(CSIZE-1)); op += CSIZE) { - unsigned b = *ip++, bx = 0, i; PREFETCH(ip+512,0);//ip = TEMPLATE2(P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start)); - + for(op = out; op != out+(n&~(CSIZE-1)); op += CSIZE) { + unsigned b = *ip++, bx = 0, i; PREFETCH(ip+512,0);//ip = TEMPLATE2(P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start)); + if((b & 0xc0) == 0xc0) { - b &= 0x3f; - #if USIZE == 64 + b &= 0x3f; + #if USIZE == 64 b = b == 63?64:b; #endif - uint_t u = TEMPLATE2(ctou, USIZE)(ip); if(b < USIZE) u = TEMPLATE2(BZHI, USIZE)(u,b); - int i; for(i=0; i < CSIZE; i++) op[i] = u; + uint_t u = TEMPLATE2(ctou, USIZE)(ip); if(b < USIZE) u = TEMPLATE2(BZHI, USIZE)(u,b); + int i; for(i=0; i < CSIZE; i++) op[i] = u; #ifdef BITUNDD TEMPLATE2(BITUNDD, USIZE)(op, CSIZE, start); #endif - ip += (b+7)/8; + ip += (b+7)/8; } else if(likely(!(b & 0x40))) { if(b & 0x80) - bx = *ip++; + bx = *ip++; ip = TEMPLATE2(_P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start), b, bx); } - #if USIZE > 8 + #if USIZE > 8 else { - uint_t ex[P4D_MAX+64]; - b &= 0x3f; - bx = *ip++; //f(*ip++ == 0x80 && b) { uint_t u = TEMPLATE2(ctou, USIZE)(ip); ip+=(b+7)/8; if(b < USIZE) u = TEMPLATE2(BZHI, USIZE)(u,b); int i; for(i=0; i < CSIZE; i++) op[i] = u; } else - ip = TEMPLATE2(BITUNPACK, USIZE)(ip, CSIZE, op, b); + uint_t ex[P4D_MAX+64]; + b &= 0x3f; + bx = *ip++; //f(*ip++ == 0x80 && b) { uint_t u = TEMPLATE2(ctou, USIZE)(ip); ip+=(b+7)/8; if(b < USIZE) u = TEMPLATE2(BZHI, USIZE)(u,b); int i; for(i=0; i < CSIZE; i++) op[i] = u; } else + ip = TEMPLATE2(BITUNPACK, USIZE)(ip, CSIZE, op, b); ip = TEMPLATE2(vbdec, USIZE)(ip, bx, ex); - for(i = 0; i != (bx & ~7); i += 8) { - op[ip[i ]] |= ex[i ] << b; - op[ip[i+1]] |= ex[i+1] << b; - op[ip[i+2]] |= ex[i+2] << b; - op[ip[i+3]] |= ex[i+3] << b; - op[ip[i+4]] |= ex[i+4] << b; - op[ip[i+5]] |= ex[i+5] << b; - op[ip[i+6]] |= ex[i+6] << b; - op[ip[i+7]] |= ex[i+7] << b; - } - for(;i != bx; i++) - op[ip[i]] |= ex[i] << b; + for(i = 0; i != (bx & ~7); i += 8) { + op[ip[i ]] |= ex[i ] << b; + op[ip[i+1]] |= ex[i+1] << b; + op[ip[i+2]] |= ex[i+2] << b; + op[ip[i+3]] |= ex[i+3] << b; + op[ip[i+4]] |= ex[i+4] << b; + op[ip[i+5]] |= ex[i+5] << b; + op[ip[i+6]] |= ex[i+6] << b; + op[ip[i+7]] |= ex[i+7] << b; + } + for(;i != bx; i++) + op[ip[i]] |= ex[i] << b; ip += bx; #ifdef BITUNDD TEMPLATE2(BITUNDD, USIZE)(op, CSIZE, start); - #endif + #endif } #endif - P4DELTA_(start = op[CSIZE-1]); + P4DELTA_(start = op[CSIZE-1]); } - return TEMPLATE2(P4NDECS, USIZE)(ip, n&(CSIZE-1), op P4DELTA(start)) - in; + return TEMPLATE2(P4NDECS, USIZE)(ip, n&(CSIZE-1), op P4DELTA(start)) - in; } } - - #ifdef P4DECX + + #ifdef P4DECX unsigned char *TEMPLATE2(p4decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out) { - unsigned b,i; - struct p4 p4; - + unsigned b,i; + struct p4 p4; + p4ini(&p4, &in, n, &b); - - if(unlikely(p4.isx)) { - #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+k, b); + + if(unlikely(p4.isx)) { + #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+k, b); for(i = 0; i != n&~3; i += 4) { ITX(0); ITX(1); ITX(2); ITX(3); } for( ; i != n; i++ ) ITX(0); - } else { - #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, i+k, b); + } else { + #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, i+k, b); for(i = 0; i != n&~3; i += 4) { ITY(0); ITY(1); ITY(2); ITY(3); } for( ; i != n; i++) ITY(0); } @@ -498,16 +498,16 @@ unsigned char *TEMPLATE2(p4decx, USIZE)(unsigned char *in, unsigned n, uint_t *_ } unsigned char *TEMPLATE2(p4f1decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, uint_t start) { - unsigned b,i; - struct p4 p4; + unsigned b,i; + struct p4 p4; p4ini(&p4, &in, n, &b); if(unlikely(p4.isx)) { - #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, (i+k), b)+start+i+k+1; + #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, (i+k), b)+start+i+k+1; for(i = 0; i != n&~3; i+=4) { ITX(0); ITX(1); ITX(2); ITX(3); } for( ; i != n; i++) ITX(0); } else { - #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, (i+k), b)+start+i+k+1; + #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, (i+k), b)+start+i+k+1; for(i = 0; i != n&~3; i += 4) { ITY(0); ITY(1); ITY(2); ITY(3); } for( ; i != n; i++) ITY(0); } @@ -515,16 +515,16 @@ unsigned char *TEMPLATE2(p4f1decx, USIZE)(unsigned char *in, unsigned n, uint_t } unsigned char *TEMPLATE2(p4fdecx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, uint_t start) { - unsigned b,i; - struct p4 p4; + unsigned b,i; + struct p4 p4; p4ini(&p4, &in, n, &b); if(unlikely(p4.isx)) { - #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+k, b)+start; + #define ITX(k) out[i+k] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+k, b)+start; for(i = 0; i != n&~3; i+=4) { ITX(0); ITX(1); ITX(2); ITX(3); } for( ; i != n; i++) ITX(0); } else { - #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, (i+k), b)+start; + #define ITY(k) out[i+k] = TEMPLATE2(bitgetx, USIZE)(in, (i+k), b)+start; for(i = 0; i != n&~3; i+=4) { ITY(0); ITY(1); ITY(2); ITY(3); } for( ; i != n; i++) ITY(0); }