TurboPFor: TurboPFor decode

This commit is contained in:
x
2017-01-06 18:06:51 +01:00
parent 6c03ef762e
commit c0717663d9

58
vp4d.c
View File

@ -28,8 +28,8 @@
#include "conf.h" #include "conf.h"
#include "bitunpack.h" #include "bitunpack.h"
#include "bitutil.h" #include "bitutil.h"
#include "vp4d.h"
#include "vint.h" //#include "vsimple.h" #include "vint.h" //#include "vsimple.h"
#include "vp4d.h"
#define PAD8(__x) ( (((__x)+8-1)/8) ) #define PAD8(__x) ( (((__x)+8-1)/8) )
@ -60,8 +60,11 @@ static ALIGNED(char, shuffles[16][16], 16) = {
#endif #endif
#define P4DELTA(a) #define P4DELTA(a)
#define P4DELTA_(a)
#define _P4DEC _p4dec #define _P4DEC _p4dec
#define P4DEC p4dec #define P4DEC p4dec
#define P4NDEC p4ndec
#define P4NDECS p4dec
#define BITUNPACK bitunpack // unpack only #define BITUNPACK bitunpack // unpack only
#define BITUNPACKD bitunpack // integrated unpack #define BITUNPACKD bitunpack // integrated unpack
#define _BITUNPACKD bitunpack // integrated pfor #define _BITUNPACKD bitunpack // integrated pfor
@ -79,8 +82,11 @@ static ALIGNED(char, shuffles[16][16], 16) = {
#undef P4DECX #undef P4DECX
#define P4DELTA(a) ,a #define P4DELTA(a) ,a
#define P4DELTA_(a) a
#define _P4DEC _p4ddec //delta0 #define _P4DEC _p4ddec //delta0
#define P4DEC p4ddec #define P4DEC p4ddec
#define P4NDEC p4nddec
#define P4NDECS p4ddec
#define BITUNPACKD bitdunpack #define BITUNPACKD bitdunpack
#define _BITUNPACKD _bitdunpack #define _BITUNPACKD _bitdunpack
#define BITUNDD bitund #define BITUNDD bitund
@ -88,6 +94,8 @@ static ALIGNED(char, shuffles[16][16], 16) = {
#define _P4DEC _p4d1dec //delta1 #define _P4DEC _p4d1dec //delta1
#define P4DEC p4d1dec #define P4DEC p4d1dec
#define P4NDEC p4nd1dec
#define P4NDECS p4d1dec
#define BITUNPACKD bitd1unpack #define BITUNPACKD bitd1unpack
#define _BITUNPACKD bitd1unpack #define _BITUNPACKD bitd1unpack
#define BITUNDD bitund1 #define BITUNDD bitund1
@ -102,18 +110,24 @@ static ALIGNED(char, shuffles[16][16], 16) = {
// SIMD ------------- // SIMD -------------
#ifndef NSIMD #ifndef NSIMD
#ifdef __SSSE3__ #ifdef __SSSE3__
#define P4DELTA(a)
#define VSIZE 128 #define VSIZE 128
#define P4DELTA(a)
#define P4DELTA_(a)
#define _P4DEC _p4dec128v #define _P4DEC _p4dec128v
#define P4DEC p4dec128v #define P4DEC p4dec128v
#define P4NDEC p4ndec128v
#define P4NDECS p4dec
#define BITUNPACK bitunpack128v #define BITUNPACK bitunpack128v
#define BITUNPACKD bitunpack128v #define BITUNPACKD bitunpack128v
#define _BITUNPACKD _bitunpack128v #define _BITUNPACKD _bitunpack128v
#include "vp4d.c" #include "vp4d.c"
#define P4DELTA(a) ,a #define P4DELTA(a) ,a
#define P4DELTA_(a) a
#define _P4DEC _p4ddec128v #define _P4DEC _p4ddec128v
#define P4DEC p4ddec128v #define P4DEC p4ddec128v
#define P4NDEC p4nddec128v
#define P4NDECS p4ddec
#define BITUNPACKD bitdunpack128v #define BITUNPACKD bitdunpack128v
#define _BITUNPACKD _bitdunpack128v #define _BITUNPACKD _bitdunpack128v
#define BITUNDD bitund #define BITUNDD bitund
@ -121,6 +135,8 @@ static ALIGNED(char, shuffles[16][16], 16) = {
#define _P4DEC _p4d1dec128v #define _P4DEC _p4d1dec128v
#define P4DEC p4d1dec128v #define P4DEC p4d1dec128v
#define P4NDEC p4nd1dec128v
#define P4NDECS p4d1dec
#define BITUNPACKD bitd1unpack128v #define BITUNPACKD bitd1unpack128v
#define _BITUNPACKD _bitd1unpack128v #define _BITUNPACKD _bitd1unpack128v
#define BITUNDD bitund1 #define BITUNDD bitund1
@ -130,17 +146,24 @@ static ALIGNED(char, shuffles[16][16], 16) = {
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
#define P4DELTA(a)
#define P4DELTA_(a)
#define VSIZE 256 #define VSIZE 256
#define _P4DEC _p4dec256v #define _P4DEC _p4dec256v
#define P4DEC p4dec256v #define P4DEC p4dec256v
#define P4NDEC p4ndec256v
#define P4NDECS p4dec
#define BITUNPACK bitunpack256v #define BITUNPACK bitunpack256v
#define BITUNPACKD bitunpack256v #define BITUNPACKD bitunpack256v
#define _BITUNPACKD _bitunpack256v #define _BITUNPACKD _bitunpack256v
#include "vp4d.c" #include "vp4d.c"
#define P4DELTA #define P4DELTA(a) ,a
#define P4DELTA_(a) a
#define _P4DEC _p4ddec256v #define _P4DEC _p4ddec256v
#define P4DEC p4ddec256v #define P4DEC p4ddec256v
#define P4NDEC p4nddec256v
#define P4NDECS p4ddec
#define BITUNPACKD bitdunpack256v #define BITUNPACKD bitdunpack256v
#define _BITUNPACKD _bitdunpack256v #define _BITUNPACKD _bitdunpack256v
#define BITUNDD bitund #define BITUNDD bitund
@ -148,6 +171,8 @@ static ALIGNED(char, shuffles[16][16], 16) = {
#define _P4DEC _p4d1dec256v #define _P4DEC _p4d1dec256v
#define P4DEC p4d1dec256v #define P4DEC p4d1dec256v
#define P4NDEC p4nd1dec256v
#define P4NDECS p4d1dec
#define BITUNPACKD bitd1unpack256v #define BITUNPACKD bitd1unpack256v
#define _BITUNPACKD _bitd1unpack256v #define _BITUNPACKD _bitd1unpack256v
#define BITUNDD bitund1 #define BITUNDD bitund1
@ -206,7 +231,7 @@ unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n
#endif #endif
} }
unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) { unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) { if(!n) return in;
unsigned b = *in++,bx,i; unsigned b = *in++,bx,i;
if(likely(!(b & 0x80))) { if(likely(!(b & 0x80))) {
@ -219,11 +244,15 @@ unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n,
bx = *in++; bx = *in++;
in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b); in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b);
in = TEMPLATE2(vbdec, USIZE)(in, bx, ex); in = TEMPLATE2(vbdec, USIZE)(in, bx, ex);
for(i = 0; i != (bx & ~3); i += 4) { for(i = 0; i != (bx & ~7); i += 8) {
out[in[i ]] |= ex[i ] << b; out[in[i ]] |= ex[i ] << b;
out[in[i+1]] |= ex[i+1] << b; out[in[i+1]] |= ex[i+1] << b;
out[in[i+2]] |= ex[i+2] << b; out[in[i+2]] |= ex[i+2] << b;
out[in[i+3]] |= ex[i+3] << b; out[in[i+3]] |= ex[i+3] << b;
out[in[i+4]] |= ex[i+4] << b;
out[in[i+5]] |= ex[i+5] << b;
out[in[i+6]] |= ex[i+6] << b;
out[in[i+7]] |= ex[i+7] << b;
} }
for(;i < bx; i++) for(;i < bx; i++)
out[in[i]] |= ex[i] << b; out[in[i]] |= ex[i] << b;
@ -235,6 +264,21 @@ unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n,
} }
} }
#ifdef VSIZE
#define CSIZE VSIZE
#else
#define CSIZE 128
#endif
unsigned char *TEMPLATE2(P4NDEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) {
uint_t *op;
for(op = out; op != out+(n&~(CSIZE-1)); op += CSIZE) { __builtin_prefetch(in+512);
in = TEMPLATE2(P4DEC, USIZE)(in, CSIZE, op P4DELTA(start));
P4DELTA_(start = op[CSIZE-1]);
}
return TEMPLATE2(P4NDECS, USIZE)(in, n&(CSIZE-1), op P4DELTA(start));
}
#ifdef P4DECX #ifdef P4DECX
unsigned char *TEMPLATE2(p4decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out) { unsigned char *TEMPLATE2(p4decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out) {
unsigned b,i; unsigned b,i;
@ -263,7 +307,7 @@ unsigned char *TEMPLATE2(p4decx, USIZE)(unsigned char *in, unsigned n, uint_t *_
return in + PAD8(n*b); return in + PAD8(n*b);
} }
unsigned char *TEMPLATE2(p4fdecx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, unsigned start) { unsigned char *TEMPLATE2(p4fdecx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, uint_t start) {
unsigned b,i; unsigned b,i;
struct p4 p4; struct p4 p4;
p4ini(&p4, &in, n, &b); p4ini(&p4, &in, n, &b);
@ -275,7 +319,7 @@ unsigned char *TEMPLATE2(p4fdecx, USIZE)(unsigned char *in, unsigned n, uint_t *
return in + PAD8(n*b); return in + PAD8(n*b);
} }
unsigned char *TEMPLATE2(p4f0decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, unsigned start) { unsigned char *TEMPLATE2(p4f0decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, uint_t start) {
unsigned b,i; unsigned b,i;
struct p4 p4; struct p4 p4;
p4ini(&p4, &in, n, &b); p4ini(&p4, &in, n, &b);