diff --git a/vp4d.c b/vp4d.c index 9dc9d02..69a61e2 100644 --- a/vp4d.c +++ b/vp4d.c @@ -29,10 +29,9 @@ #include "bitunpack.h" #include "bitutil.h" #include "vp4d.h" -#include "vint.h" -#include "vsimple.h" +#include "vint.h" //#include "vsimple.h" + #define PAD8(__x) ( (((__x)+8-1)/8) ) -#define VSIZEX 256 #if 0 //defined(__AVX_2__) #include "avx2.h" @@ -60,18 +59,17 @@ static ALIGNED(char, shuffles[16][16], 16) = { }; #endif - -#define USIZE 64 - +#define P4DELTA(a) #define _P4DEC _p4dec #define P4DEC p4dec #define BITUNPACK bitunpack // unpack only #define BITUNPACKD bitunpack // integrated unpack -#define _BITUNPACKD bitunpack // integrated -#define P4START +#define _BITUNPACKD bitunpack // integrated pfor + +#define USIZE 64 #include __FILE__ -#define P4DECX +#define P4DECX // direct access #define USIZE 16 #include __FILE__ @@ -79,10 +77,8 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define USIZE 32 #include __FILE__ #undef P4DECX -#undef P4DECX -#define P4START start, -#define P4START_T +#define P4DELTA(a) ,a #define _P4DEC _p4ddec //delta0 #define P4DEC p4ddec #define BITUNPACKD bitdunpack @@ -101,10 +97,10 @@ static ALIGNED(char, shuffles[16][16], 16) = { #undef P4DEC #undef BITUNPACK #undef BITUNDD +#undef P4DELTA // SIMD ------------- -#define P4START -#undef P4START_T +#define P4DELTA(a) #define VSIZE 128 #define _P4DEC _p4dec128v #define P4DEC p4dec128v @@ -113,8 +109,7 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define _BITUNPACKD _bitunpack128v #include __FILE__ -#define P4START start, -#define P4START_T +#define P4DELTA(a) ,a #define _P4DEC _p4ddec128v #define P4DEC p4ddec128v #define BITUNPACKD bitdunpack128v @@ -129,10 +124,10 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define BITUNDD bitund1 #include __FILE__ #undef BITUNDD +#undef P4DELTA #ifdef __AVX2__ -#define P4START -#undef P4START_T +#define P4DELTA(a) /* non-delta AVX2 build: the optional extra argument expands to nothing */ #define VSIZE 256 #define _P4DEC _p4dec256v #define P4DEC p4dec256v @@ -141,8 +136,7 @@ static ALIGNED(char, shuffles[16][16], 16) = { 
#define _BITUNPACKD _bitunpack256v #include __FILE__ -#define P4START start, -#define P4START_T +#define P4DELTA(a) ,a /* delta AVX2 build: appends ',a' so 'start' is declared and forwarded, as in the scalar and 128-bit delta sections */ #define _P4DEC _p4ddec256v #define P4DEC p4ddec256v #define BITUNPACKD bitdunpack256v @@ -157,24 +151,15 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define BITUNDD bitund1 #include __FILE__ #undef BITUNDD - #endif - + #endif + +#undef USIZE #else #define uint_t TEMPLATE3(uint, USIZE, _t) -//#pragma GCC push_options -//#pragma GCC optimize ("unroll-loops") - #ifdef P4START_T -unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start, unsigned b, unsigned bx ) { - #else -unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, unsigned b, unsigned bx ) { - #endif - uint_t ex[VSIZEX+8]; - if(!(b & 1)) return TEMPLATE2(BITUNPACKD, USIZE)(in, - #ifndef VSIZE - n, - #endif - out, P4START b>>1); +unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start), unsigned b, unsigned bx ) { /* P4DELTA(uint_t start) adds the 'start' parameter only where the including section defines P4DELTA(a) as ',a' */ + uint_t ex[P4D_MAX+8]; + if(!(b & 1)) return TEMPLATE2(BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b>>1); b >>= 1; #ifdef VSIZE @@ -184,17 +169,13 @@ unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n #else in = TEMPLATE2(bitunpack, USIZE)(in+32, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)) + popcnt64(ctou64(in+16)) + popcnt64(ctou64(in+24)), ex, bx); #endif - return TEMPLATE2(_BITUNPACKD, USIZE)(in, out, P4START b, ex, pb); + return TEMPLATE2(_BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b, ex, pb); #else - unsigned long long bb[VSIZEX/64]; unsigned num=0,i,p4dn = (n+63)/64; + unsigned long long bb[P4D_MAX/64]; unsigned num=0,i,p4dn = (n+63)/64; for(i = 0; i < n/64; i++) { bb[i] = ctou64(in+i*8); num += popcnt64(bb[i]); } if(n & 0x3f) { bb[i] = ctou64(in+i*8) & ((1ull<<(n&0x3f))-1); num += popcnt64(bb[i]); } in = TEMPLATE2(bitunpack, USIZE)(in+PAD8(n), num, ex, bx); 
- in = TEMPLATE2(bitunpack, USIZE)(in, - #ifndef VSIZE - n, - #endif - out, b); + in = TEMPLATE2(bitunpack, USIZE)(in, n, out, b); #if 0 //defined(AVX_2__) uint_t *op,*pex = ex; for(i = 0; i < p4dn; i++) { @@ -222,27 +203,18 @@ unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n #endif } - #ifdef P4START_T -unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start) { - #else -unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out) { - #endif - +unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) { unsigned b = *in++,bx,i; if(likely(!(b & 0x80))) { if(b & 1) bx = *in++; - return TEMPLATE2(_P4DEC, USIZE)(in, n, out, P4START b, bx ); + return TEMPLATE2(_P4DEC, USIZE)(in, n, out P4DELTA(start), b, bx ); } else { - uint_t ex[VSIZEX+8]; + uint_t ex[P4D_MAX+8]; b = (b & 0x7f)>>1; bx = *in++; - in = TEMPLATE2(BITUNPACK, USIZE)(in, - #ifndef VSIZE - n, - #endif - out, b); + in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b); in = TEMPLATE2(vbdec, USIZE)(in, bx, ex); for(i = 0; i != (bx & ~3); i += 4) { out[in[i ]] |= ex[i ] << b; @@ -260,7 +232,7 @@ unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, } } - #ifdef P4DECX + #ifdef P4DECX unsigned char *TEMPLATE2(p4decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out) { unsigned b,i; struct p4 p4;