TurboPFor: TurboPFor decode
This commit is contained in:
77
vp4d.c
77
vp4d.c
@ -29,10 +29,9 @@
|
|||||||
#include "bitunpack.h"
|
#include "bitunpack.h"
|
||||||
#include "bitutil.h"
|
#include "bitutil.h"
|
||||||
#include "vp4d.h"
|
#include "vp4d.h"
|
||||||
#include "vint.h"
|
#include "vint.h" //#include "vsimple.h"
|
||||||
#include "vsimple.h"
|
|
||||||
#define PAD8(__x) ( (((__x)+8-1)/8) )
|
#define PAD8(__x) ( (((__x)+8-1)/8) )
|
||||||
#define VSIZEX 256
|
|
||||||
|
|
||||||
#if 0 //defined(__AVX_2__)
|
#if 0 //defined(__AVX_2__)
|
||||||
#include "avx2.h"
|
#include "avx2.h"
|
||||||
@ -60,18 +59,17 @@ static ALIGNED(char, shuffles[16][16], 16) = {
|
|||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define P4DELTA(a)
|
||||||
#define USIZE 64
|
|
||||||
|
|
||||||
#define _P4DEC _p4dec
|
#define _P4DEC _p4dec
|
||||||
#define P4DEC p4dec
|
#define P4DEC p4dec
|
||||||
#define BITUNPACK bitunpack // unpack only
|
#define BITUNPACK bitunpack // unpack only
|
||||||
#define BITUNPACKD bitunpack // integrated unpack
|
#define BITUNPACKD bitunpack // integrated unpack
|
||||||
#define _BITUNPACKD bitunpack // integrated
|
#define _BITUNPACKD bitunpack // integrated pfor
|
||||||
#define P4START
|
|
||||||
|
#define USIZE 64
|
||||||
#include __FILE__
|
#include __FILE__
|
||||||
|
|
||||||
#define P4DECX
|
#define P4DECX // direct access
|
||||||
|
|
||||||
#define USIZE 16
|
#define USIZE 16
|
||||||
#include __FILE__
|
#include __FILE__
|
||||||
@ -79,10 +77,8 @@ static ALIGNED(char, shuffles[16][16], 16) = {
|
|||||||
#define USIZE 32
|
#define USIZE 32
|
||||||
#include __FILE__
|
#include __FILE__
|
||||||
#undef P4DECX
|
#undef P4DECX
|
||||||
#undef P4DECX
|
|
||||||
|
|
||||||
#define P4START start,
|
#define P4DELTA(a) ,a
|
||||||
#define P4START_T
|
|
||||||
#define _P4DEC _p4ddec //delta0
|
#define _P4DEC _p4ddec //delta0
|
||||||
#define P4DEC p4ddec
|
#define P4DEC p4ddec
|
||||||
#define BITUNPACKD bitdunpack
|
#define BITUNPACKD bitdunpack
|
||||||
@ -101,10 +97,10 @@ static ALIGNED(char, shuffles[16][16], 16) = {
|
|||||||
#undef P4DEC
|
#undef P4DEC
|
||||||
#undef BITUNPACK
|
#undef BITUNPACK
|
||||||
#undef BITUNDD
|
#undef BITUNDD
|
||||||
|
#undef P4DELTA
|
||||||
|
|
||||||
// SIMD -------------
|
// SIMD -------------
|
||||||
#define P4START
|
#define P4DELTA(a)
|
||||||
#undef P4START_T
|
|
||||||
#define VSIZE 128
|
#define VSIZE 128
|
||||||
#define _P4DEC _p4dec128v
|
#define _P4DEC _p4dec128v
|
||||||
#define P4DEC p4dec128v
|
#define P4DEC p4dec128v
|
||||||
@ -113,8 +109,7 @@ static ALIGNED(char, shuffles[16][16], 16) = {
|
|||||||
#define _BITUNPACKD _bitunpack128v
|
#define _BITUNPACKD _bitunpack128v
|
||||||
#include __FILE__
|
#include __FILE__
|
||||||
|
|
||||||
#define P4START start,
|
#define P4DELTA(a) ,a
|
||||||
#define P4START_T
|
|
||||||
#define _P4DEC _p4ddec128v
|
#define _P4DEC _p4ddec128v
|
||||||
#define P4DEC p4ddec128v
|
#define P4DEC p4ddec128v
|
||||||
#define BITUNPACKD bitdunpack128v
|
#define BITUNPACKD bitdunpack128v
|
||||||
@ -129,10 +124,9 @@ static ALIGNED(char, shuffles[16][16], 16) = {
|
|||||||
#define BITUNDD bitund1
|
#define BITUNDD bitund1
|
||||||
#include __FILE__
|
#include __FILE__
|
||||||
#undef BITUNDD
|
#undef BITUNDD
|
||||||
|
#undef P4DELTA
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
#define P4START
|
|
||||||
#undef P4START_T
|
|
||||||
#define VSIZE 256
|
#define VSIZE 256
|
||||||
#define _P4DEC _p4dec256v
|
#define _P4DEC _p4dec256v
|
||||||
#define P4DEC p4dec256v
|
#define P4DEC p4dec256v
|
||||||
@ -141,8 +135,7 @@ static ALIGNED(char, shuffles[16][16], 16) = {
|
|||||||
#define _BITUNPACKD _bitunpack256v
|
#define _BITUNPACKD _bitunpack256v
|
||||||
#include __FILE__
|
#include __FILE__
|
||||||
|
|
||||||
#define P4START start,
|
#define P4DELTA(a) ,a   // delta variant: appends ",a" so the template gains the extra start parameter/argument
|
||||||
#define P4START_T
|
|
||||||
#define _P4DEC _p4ddec256v
|
#define _P4DEC _p4ddec256v
|
||||||
#define P4DEC p4ddec256v
|
#define P4DEC p4ddec256v
|
||||||
#define BITUNPACKD bitdunpack256v
|
#define BITUNPACKD bitdunpack256v
|
||||||
@ -159,22 +152,13 @@ static ALIGNED(char, shuffles[16][16], 16) = {
|
|||||||
#undef BITUNDD
|
#undef BITUNDD
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#undef USIZE
|
||||||
#else
|
#else
|
||||||
#define uint_t TEMPLATE3(uint, USIZE, _t)
|
#define uint_t TEMPLATE3(uint, USIZE, _t)
|
||||||
|
|
||||||
//#pragma GCC push_options
|
unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start), unsigned b, unsigned bx ) {
|
||||||
//#pragma GCC optimize ("unroll-loops")
|
uint_t ex[P4D_MAX+8];
|
||||||
#ifdef P4START_T
|
if(!(b & 1)) return TEMPLATE2(BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b>>1);
|
||||||
unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start, unsigned b, unsigned bx ) {
|
|
||||||
#else
|
|
||||||
unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, unsigned b, unsigned bx ) {
|
|
||||||
#endif
|
|
||||||
uint_t ex[VSIZEX+8];
|
|
||||||
if(!(b & 1)) return TEMPLATE2(BITUNPACKD, USIZE)(in,
|
|
||||||
#ifndef VSIZE
|
|
||||||
n,
|
|
||||||
#endif
|
|
||||||
out, P4START b>>1);
|
|
||||||
|
|
||||||
b >>= 1;
|
b >>= 1;
|
||||||
#ifdef VSIZE
|
#ifdef VSIZE
|
||||||
@ -184,17 +168,13 @@ unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n
|
|||||||
#else
|
#else
|
||||||
in = TEMPLATE2(bitunpack, USIZE)(in+32, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)) + popcnt64(ctou64(in+16)) + popcnt64(ctou64(in+24)), ex, bx);
|
in = TEMPLATE2(bitunpack, USIZE)(in+32, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)) + popcnt64(ctou64(in+16)) + popcnt64(ctou64(in+24)), ex, bx);
|
||||||
#endif
|
#endif
|
||||||
return TEMPLATE2(_BITUNPACKD, USIZE)(in, out, P4START b, ex, pb);
|
return TEMPLATE2(_BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b, ex, pb);
|
||||||
#else
|
#else
|
||||||
unsigned long long bb[VSIZEX/64]; unsigned num=0,i,p4dn = (n+63)/64;
|
unsigned long long bb[P4D_MAX/64]; unsigned num=0,i,p4dn = (n+63)/64;
|
||||||
for(i = 0; i < n/64; i++) { bb[i] = ctou64(in+i*8); num += popcnt64(bb[i]); }
|
for(i = 0; i < n/64; i++) { bb[i] = ctou64(in+i*8); num += popcnt64(bb[i]); }
|
||||||
if(n & 0x3f) { bb[i] = ctou64(in+i*8) & ((1ull<<(n&0x3f))-1); num += popcnt64(bb[i]); }
|
if(n & 0x3f) { bb[i] = ctou64(in+i*8) & ((1ull<<(n&0x3f))-1); num += popcnt64(bb[i]); }
|
||||||
in = TEMPLATE2(bitunpack, USIZE)(in+PAD8(n), num, ex, bx);
|
in = TEMPLATE2(bitunpack, USIZE)(in+PAD8(n), num, ex, bx);
|
||||||
in = TEMPLATE2(bitunpack, USIZE)(in,
|
in = TEMPLATE2(bitunpack, USIZE)(in, n, out, b);
|
||||||
#ifndef VSIZE
|
|
||||||
n,
|
|
||||||
#endif
|
|
||||||
out, b);
|
|
||||||
#if 0 //defined(AVX_2__)
|
#if 0 //defined(AVX_2__)
|
||||||
uint_t *op,*pex = ex;
|
uint_t *op,*pex = ex;
|
||||||
for(i = 0; i < p4dn; i++) {
|
for(i = 0; i < p4dn; i++) {
|
||||||
@ -222,27 +202,18 @@ unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef P4START_T
|
unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) {
|
||||||
unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start) {
|
|
||||||
#else
|
|
||||||
unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out) {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
unsigned b = *in++,bx,i;
|
unsigned b = *in++,bx,i;
|
||||||
|
|
||||||
if(likely(!(b & 0x80))) {
|
if(likely(!(b & 0x80))) {
|
||||||
if(b & 1)
|
if(b & 1)
|
||||||
bx = *in++;
|
bx = *in++;
|
||||||
return TEMPLATE2(_P4DEC, USIZE)(in, n, out, P4START b, bx );
|
return TEMPLATE2(_P4DEC, USIZE)(in, n, out P4DELTA(start), b, bx );
|
||||||
} else {
|
} else {
|
||||||
uint_t ex[VSIZEX+8];
|
uint_t ex[P4D_MAX+8];
|
||||||
b = (b & 0x7f)>>1;
|
b = (b & 0x7f)>>1;
|
||||||
bx = *in++;
|
bx = *in++;
|
||||||
in = TEMPLATE2(BITUNPACK, USIZE)(in,
|
in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b);
|
||||||
#ifndef VSIZE
|
|
||||||
n,
|
|
||||||
#endif
|
|
||||||
out, b);
|
|
||||||
in = TEMPLATE2(vbdec, USIZE)(in, bx, ex);
|
in = TEMPLATE2(vbdec, USIZE)(in, bx, ex);
|
||||||
for(i = 0; i != (bx & ~3); i += 4) {
|
for(i = 0; i != (bx & ~3); i += 4) {
|
||||||
out[in[i ]] |= ex[i ] << b;
|
out[in[i ]] |= ex[i ] << b;
|
||||||
|
|||||||
Reference in New Issue
Block a user