TurboPFor: TurboPFor decode

This commit is contained in:
x
2018-02-05 13:35:15 +01:00
parent fa4d008432
commit 6f34c47eca

46
vp4d.c
View File

@ -40,27 +40,8 @@
#ifdef __SSSE3__
#include <tmmintrin.h>
extern char _shuffle_32[16][16];/* = {
#define _ 0x80
{ _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ },
{ 0,1,2,3, _,_,_,_, _,_, _, _, _, _, _,_ },
{ _,_,_,_, 0,1,2,3, _,_, _, _, _, _, _,_ },
{ 0,1,2,3, 4,5,6,7, _,_, _, _, _, _, _,_ },
{ _,_,_,_, _,_,_,_, 0,1, 2, 3, _, _, _,_ },
{ 0,1,2,3, _,_,_,_, 4,5, 6, 7, _, _, _,_ },
{ _,_,_,_, 0,1,2,3, 4,5, 6, 7, _, _, _,_ },
{ 0,1,2,3, 4,5,6,7, 8,9,10,11, _, _, _,_ },
{ _,_,_,_, _,_,_,_, _,_,_,_, 0, 1, 2, 3 },
{ 0,1,2,3, _,_,_,_, _,_,_, _, 4, 5, 6, 7 },
{ _,_,_,_, 0,1,2,3, _,_,_, _, 4, 5, 6, 7 },
{ 0,1,2,3, 4,5,6,7, _,_, _, _, 8, 9,10,11 },
{ _,_,_,_, _,_,_,_, 0,1, 2, 3, 4, 5, 6, 7 },
{ 0,1,2,3, _,_,_,_, 4,5, 6, 7, 8, 9,10,11 },
{ _,_,_,_, 0,1,2,3, 4,5, 6, 7, 8, 9,10,11 },
{ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 },
#undef _
};*/
extern char _shuffle_16[256][16]; // defined in bitunpack.c
extern char _shuffle_32[16][16]; // defined in bitunpack.c
extern char _shuffle_16[256][16];
#endif
#if !defined(SSE2_ON) && !defined(AVX2_ON)
@ -164,6 +145,8 @@ extern char _shuffle_16[256][16]; // defined in bitunpack.c
#include "vp4d.c"
#define USIZE 32
#include "vp4d.c"
#define USIZE 64
#include "vp4d.c"
#define P4DELTA(a) ,a
#define P4DELTA_(a) a
@ -284,7 +267,6 @@ extern char _shuffle_16[256][16]; // defined in bitunpack.c
#pragma GCC optimize ("align-functions=16")
ALWAYS_INLINE unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start), unsigned b, unsigned bx ) {
uint_t ex[P4D_MAX+64];
if(!(b & 0x80)) {
#if USIZE == 64
b = (b == 63)?64:b;
@ -292,22 +274,34 @@ ALWAYS_INLINE unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict
return TEMPLATE2(BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b);
}
b &= 0x7f;
#if VSIZE >= 128 && USIZE < 64
#if VSIZE >= 128
#if USIZE == 64
if(b+bx <= 32)
#endif
{ unsigned char *pb = in;
#if VSIZE == 128
#if USIZE == 64
uint32_t ex[P4D_MAX+64];
in = TEMPLATE2(bitunpack, 32)(in+16, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)), ex, bx);
#else
uint_t ex[P4D_MAX+64];
in = TEMPLATE2(bitunpack, USIZE)(in+16, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)), ex, bx);
#endif
#else
uint_t ex[P4D_MAX+64];
in = TEMPLATE2(bitunpack, USIZE)(in+32, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)) + popcnt64(ctou64(in+16)) + popcnt64(ctou64(in+24)), ex, bx);
#endif
return TEMPLATE2(_BITUNPACKD, USIZE)(in, n, out P4DELTA(start), b, ex, pb);
}
#else
{ unsigned long long bb[P4D_MAX/64];
#endif
#if VSIZE < 128 || USIZE == 64
{ uint_t ex[P4D_MAX+64];
unsigned long long bb[P4D_MAX/64];
unsigned num=0,i,p4dn = (n+63)/64;
for(i = 0; i < n/64; i++) { bb[i] = ctou64(in+i*8); num += popcnt64(bb[i]); }
if(n & 0x3f) { bb[i] = ctou64(in+i*8) & ((1ull<<(n&0x3f))-1); num += popcnt64(bb[i]); }
in = TEMPLATE2(bitunpack, USIZE)(in+PAD8(n), num, ex, bx);
in = TEMPLATE2(bitunpack, USIZE)(in, n, out, b);
in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b);
#if 0 //defined(AVX_2__)
{ uint_t *op,*pex = ex;
for(i = 0; i < p4dn; i++) {