diff --git a/bitpack.c b/bitpack.c index 7bfed28..db44932 100644 --- a/bitpack.c +++ b/bitpack.c @@ -25,6 +25,7 @@ #include #include "bitpack.h" #include "bitutil.h" +#include "vint.h" #define PAD8(_x_) ( (((_x_)+8-1)/8) ) #pragma clang diagnostic push @@ -97,45 +98,50 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons #include "bitpack_.h" #undef IPI -#define BITNPACK(in, n, out, csize, usize) { ip=in;\ - /*if(usize <= 32)\ - for(; ip < in+(n&~(csize*4-1)); ) { __builtin_prefetch(ip+512); unsigned char *p=ip; unsigned u,b;\ - TEMPLATE2(BITSIZE,usize)(ip, csize, b); u = b; out = TEMPLATE2(bitpacka, usize)[b](ip, csize, out); ip+=csize;\ - TEMPLATE2(BITSIZE,usize)(ip, csize, b); u |= b<<6; out = TEMPLATE2(bitpacka, usize)[b](ip, csize, out); ip+=csize;\ - TEMPLATE2(BITSIZE,usize)(ip, csize, b); u |= b<<12; out = TEMPLATE2(bitpacka, usize)[b](ip, csize, out); ip+=csize;\ - TEMPLATE2(BITSIZE,usize)(ip, csize, b); u |= b<<18; out = TEMPLATE2(bitpacka, usize)[b](ip, csize, out); ip+=csize;\ - ctou32(p) = p[3]<<24 | u&0xffffff;\ - }*/\ - for(in+=n; ip < in;) { unsigned iplen = in - ip; if(iplen > csize) iplen = csize; __builtin_prefetch(ip+512);\ - unsigned b; TEMPLATE2(BITSIZE,usize)(ip, csize, b); *out++ = b; out = TEMPLATE2(bitpacka, usize)[b](ip, csize, out); \ +#define BITNPACK(in, n, out, csize, usize) { unsigned char *op = out;\ + for(ip = in, in += n; ip < in;) { \ + unsigned iplen = in - ip,b;\ + if(iplen > csize) iplen = csize; __builtin_prefetch(ip+512);\ + TEMPLATE2(BITSIZE,usize)(ip, csize, b);\ + *op++ = b; \ + op = TEMPLATE2(bitpacka, usize)[b](ip, csize, op); \ ip += csize;\ - } return out;\ + } \ + return op - out;\ } -#define BITNDPACK(in, n, out, csize, usize, _start_, _bitd_, _bitpacka_) {\ - for(ip = in,in+=n; ip < in;) { unsigned iplen = in - ip; if(iplen > csize) iplen = csize; __builtin_prefetch(ip+512);\ +#define BITNDPACK(in, n, out, csize, usize, _bitd_, _bitpacka_) {\ + if(!n) return 0;\ + unsigned char *op = 
out; \ + start = *in++; \ + TEMPLATE2(vbxput, usize)(op, start);\ +\ + for(ip = in,--n, in += n; ip < in;) { \ + unsigned iplen = in - ip;\ + if(iplen > csize) iplen = csize; __builtin_prefetch(ip+512);\ typeof(in[0]) _in[csize+8];\ - unsigned b = TEMPLATE2(_bitd_, usize)(ip, csize, _start_);\ - *out++ = b; out = TEMPLATE2(_bitpacka_, usize)[b](ip, csize, out, _start_);\ + unsigned b = TEMPLATE2(_bitd_, usize)(ip, csize, start);\ + *op++ = b; op = TEMPLATE2(_bitpacka_, usize)[b](ip, csize, op, start);\ ip += csize;\ start = ip[-1];\ - } return out;\ + } \ + return op - out;\ } -unsigned char *bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t *ip; BITNPACK(in, n, out, 128, 8); } -unsigned char *bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip; BITNPACK(in, n, out, 128, 16); } -unsigned char *bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; BITNPACK(in, n, out, 128, 32); } -unsigned char *bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip; BITNPACK(in, n, out, 128, 64); } +size_t bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t *ip,start; BITNPACK(in, n, out, 128, 8); } +size_t bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; BITNPACK(in, n, out, 128, 16); } +size_t bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNPACK(in, n, out, 128, 32); } +size_t bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNPACK(in, n, out, 128, 64); } -unsigned char *bitndpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out, uint8_t start) { uint8_t *ip; BITNDPACK(in, n, out, 128, 8, start, bitd, bitdpacka); } -unsigned char *bitndpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out, uint16_t 
start) { uint16_t *ip; BITNDPACK(in, n, out, 128, 16, start, bitd, bitdpacka); } -unsigned char *bitndpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start) { uint32_t *ip; BITNDPACK(in, n, out, 128, 32, start, bitd, bitdpacka); } -unsigned char *bitndpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out, uint64_t start) { uint64_t *ip; BITNDPACK(in, n, out, 128, 64, start, bitd, bitdpacka); } +size_t bitndpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t *ip,start; BITNDPACK(in, n, out, 128, 8, bitd, bitdpacka); } +size_t bitndpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; BITNDPACK(in, n, out, 128, 16, bitd, bitdpacka); } +size_t bitndpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitd, bitdpacka); } +size_t bitndpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitd, bitdpacka); } -unsigned char *bitnd1pack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out, uint8_t start) { uint8_t *ip; BITNDPACK(in, n, out, 128, 8, start, bitd1, bitd1packa); } -unsigned char *bitnd1pack16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out, uint16_t start) { uint16_t *ip; BITNDPACK(in, n, out, 128, 16, start, bitd1, bitd1packa); } -unsigned char *bitnd1pack32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start) { uint32_t *ip; BITNDPACK(in, n, out, 128, 32, start, bitd1, bitd1packa); } -unsigned char *bitnd1pack64(uint64_t *__restrict in, size_t n, unsigned char *__restrict out, uint64_t start) { uint64_t *ip; BITNDPACK(in, n, out, 128, 64, start, bitd1, bitd1packa); } +size_t bitnd1pack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t *ip,start; BITNDPACK(in, n, out, 128, 8, bitd1, bitd1packa); } +size_t 
bitnd1pack16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; BITNDPACK(in, n, out, 128, 16, bitd1, bitd1packa); } +size_t bitnd1pack32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNDPACK(in, n, out, 128, 32, bitd1, bitd1packa); } +size_t bitnd1pack64(uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNDPACK(in, n, out, 128, 64, bitd1, bitd1packa); } //---------------------------------------------------------------------------------------------------------------------------------- #ifdef __SSE2__ diff --git a/bitpack.h b/bitpack.h index 0a7ebd0..049363f 100644 --- a/bitpack.h +++ b/bitpack.h @@ -30,7 +30,38 @@ extern "C" { #endif #include -//********************************** Bit Packing : Pack **************************************************************** +//******************** Bit Packing High Level API - n unlimited *************************************************** +size_t bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); + +size_t bitndpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitndpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitndpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitndpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); + +size_t bitnd1pack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitnd1pack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitnd1pack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t bitnd1pack64( 
uint64_t *__restrict in, size_t n, unsigned char *__restrict out); + +size_t bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); +size_t bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); +size_t bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); + +size_t bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); +size_t bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); +size_t bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); + +size_t bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); +size_t bitnd1unpack16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out); +size_t bitnd1unpack32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t bitnd1unpack64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out); + +//******** Bit Packing Low level API **************************************************************** // bipackNN: Pack array with n unsigned (NN bits in[n]) values to the buffer out using nbits per value. 
Return value = end of compressed buffer out unsigned char *bitpack8( uint8_t *__restrict in, unsigned n, const unsigned char *__restrict out , unsigned b); @@ -177,38 +208,6 @@ unsigned char *_bitd1unpack128h32(const unsigned char *__restrict in, unsigned n unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb); unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb); unsigned char *_bitd1unpack256v32(const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb); - -//------------------------------- Multiple blocks -------------------------------- -unsigned char *bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); - -unsigned char *bitndpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out, uint8_t start); -unsigned char *bitndpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out, uint16_t start); -unsigned char *bitndpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start); -unsigned char *bitndpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out, uint64_t start); - -unsigned char *bitnd1pack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out, uint8_t start); -unsigned char *bitnd1pack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out, uint16_t start); -unsigned char *bitnd1pack32( uint32_t *__restrict in, size_t n, unsigned 
char *__restrict out, uint32_t start); -unsigned char *bitnd1pack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out, uint64_t start); - -unsigned char *bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -unsigned char *bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -unsigned char *bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -unsigned char *bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); - -unsigned char *bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out, uint8_t start); -unsigned char *bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out, uint16_t start); -unsigned char *bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start); -unsigned char *bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out, uint64_t start); - -unsigned char *bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out, uint8_t start); -unsigned char *bitnd1unpack16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out, uint16_t start); -unsigned char *bitnd1unpack32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start); -unsigned char *bitnd1unpack64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out, uint64_t start); - #ifdef __cplusplus } #endif diff --git a/bitunpack.c b/bitunpack.c index 4d839bd..a523257 100644 --- a/bitunpack.c +++ b/bitunpack.c @@ -26,6 +26,7 @@ #include "conf.h" #include "bitutil.h" #include "bitpack.h" +#include "vint.h" #define PAD8(_x_) (((_x_)+7)/8) #pragma GCC push_options @@ -82,33 +83,39 @@ typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsi #undef OPI #define BITNUNPACK(in, n, out, csize, usize) {\ + unsigned char *ip = in;\ for(op = out,out+=n; op < out;) { unsigned oplen = out - op; if(oplen > 
csize) oplen = csize; __builtin_prefetch(in+512);\ - unsigned b = *in++; in = TEMPLATE2(bitunpacka, usize)[b](in, csize, op);\ + unsigned b = *ip++; ip = TEMPLATE2(bitunpacka, usize)[b](ip, csize, op);\ op += csize;\ - } return in;\ + } \ + return ip - in;\ } -#define BITNDUNPACK(in, n, out, csize, usize, _start_, _bitunpacka_) {\ - for(op = out,out+=n; op < out;) { unsigned oplen = out - op; if(oplen > csize) oplen = csize; __builtin_prefetch(in+512);\ - unsigned b = *in++; in = TEMPLATE2(_bitunpacka_, usize)[b](in, csize, op, _start_);\ +#define BITNDUNPACK(in, n, out, csize, usize, _bitunpacka_) {\ + if(!n) return 0;\ + unsigned char *ip = in;\ + TEMPLATE2(vbxget, usize)(ip, start); \ + *out++ = start;\ + for(--n,op = out,out+=n; op < out;) { unsigned oplen = out - op; if(oplen > csize) oplen = csize; __builtin_prefetch(ip+512);\ + unsigned b = *ip++; ip = TEMPLATE2(_bitunpacka_, usize)[b](ip, csize, op, start);\ op += csize;\ start = op[-1];\ - } return in;\ + } return ip - in;\ } -unsigned char *bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op; BITNUNPACK(in, n, out, 128, 8); } -unsigned char *bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op; BITNUNPACK(in, n, out, 128, 16); } -unsigned char *bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op; BITNUNPACK(in, n, out, 128, 32); } -unsigned char *bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op; BITNUNPACK(in, n, out, 128, 64); } +size_t bitnunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op; BITNUNPACK(in, n, out, 128, 8); } +size_t bitnunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op; BITNUNPACK(in, n, out, 128, 16); } +size_t bitnunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op; BITNUNPACK(in, n, out, 128, 32); } 
+size_t bitnunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op; BITNUNPACK(in, n, out, 128, 64); } -unsigned char *bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out, uint8_t start) { uint8_t *op; BITNDUNPACK(in, n, out, 128, 8, start, bitdunpacka); } -unsigned char *bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out, uint16_t start) { uint16_t *op; BITNDUNPACK(in, n, out, 128, 16, start, bitdunpacka); } -unsigned char *bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start) { uint32_t *op; BITNDUNPACK(in, n, out, 128, 32, start, bitdunpacka); } -unsigned char *bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out, uint64_t start) { uint64_t *op; BITNDUNPACK(in, n, out, 128, 64, start, bitdunpacka); } +size_t bitndunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op,start; BITNDUNPACK(in, n, out, 128, 8, bitdunpacka); } +size_t bitndunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitdunpacka); } +size_t bitndunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitdunpacka); } +size_t bitndunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitdunpacka); } -unsigned char *bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out, uint8_t start) { uint8_t *op; BITNDUNPACK(in, n, out, 128, 8, start, bitd1unpacka); } -unsigned char *bitnd1unpack16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out, uint16_t start) { uint16_t *op; BITNDUNPACK(in, n, out, 128, 16, start, bitd1unpacka); } -unsigned char *bitnd1unpack32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start) { 
uint32_t *op; BITNDUNPACK(in, n, out, 128, 32, start, bitd1unpacka); } -unsigned char *bitnd1unpack64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out, uint64_t start) { uint64_t *op; BITNDUNPACK(in, n, out, 128, 64, start, bitd1unpacka); } +size_t bitnd1unpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op,start; BITNDUNPACK(in, n, out, 128, 8, bitd1unpacka); } +size_t bitnd1unpack16(unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitd1unpacka); } +size_t bitnd1unpack32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitd1unpacka); } +size_t bitnd1unpack64(unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitd1unpacka); } //-------------------------------------------------------------------------------------------------------------------------------------- #ifdef __SSE2__ diff --git a/icbench.c b/icbench.c index 98e86c1..2fd0ebc 100644 --- a/icbench.c +++ b/icbench.c @@ -67,6 +67,7 @@ typedef unsigned long long tm_t; #define TM_T 1000000.0 #define TM_MAX (1ull<<63) +#if 1 #ifdef _WIN32 #include static LARGE_INTEGER tps; @@ -76,6 +77,9 @@ static tm_t tminit() { QueryPerformanceFrequency(&tps); tm_t t0=tmtime(),ts; whi static tm_t tmtime(void) { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return (tm_t)tm.tv_sec*1000000ull + tm.tv_nsec/1000; } static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; } #endif +#else +#include "time_r.h" +#endif //---------------------------------------- bench ---------------------------------------------------------------------- #define TM_MAX (1ull<<63) @@ -942,7 +946,7 @@ int becomp(unsigned char *_in, unsigned _inlen, unsigned char *_out, unsigned ou op = codcomp(ip, iplen, op, oe-op, id, lev, prm, ifmt); ip += iplen; if(op > 
_out+outsize) - die("Overflow error %llu, %u in lib=%d\n", outsize, (int)(ptrdiff_t)(op - _out), id); + die("Compress overflow error %llu, %u in lib=%d\n", outsize, (int)(ptrdiff_t)(op - _out), id); } } TMEND(_inlen); // printf("cnt=%d, csize=%d\n", cnt, csize); @@ -958,9 +962,9 @@ int bedecomp(unsigned char *_in, int _inlen, unsigned char *_out, unsigned _outl for(ip = _in, out = _out; out < _out+_outlen;) { unsigned outlen=_outlen,bs; if(mode) { - vbget32(ip, outlen); //outlen = ctou32(ip); ip += 4; + vbget32(ip, outlen); //outlen = ctou32(ip); ip += 4; ctou32(out) = outlen; out += 4; - outlen *= 4; if(out+outlen >_out+_outlen) die("FATAL: overflow error %d ", outlen); + outlen *= 4; if(out+outlen >_out+_outlen) die("FATAL: decompress overflow output error %d ", outlen); } for(op = out, out += outlen; op < out; ) { unsigned oplen = out - op; diff --git a/plugins.cc b/plugins.cc index 14695c5..6f598ae 100644 --- a/plugins.cc +++ b/plugins.cc @@ -595,13 +595,13 @@ unsigned char *codcomps(unsigned char *_in, unsigned _n, unsigned char *out, int case TB_PFOR128: x = *in++; --n; VBPUT32(out, x); if(inc) return n == 128?p4d1enc128v32(in, n, out, x):p4d1enc32( in, n, out, x); else return n == 128?p4denc128v32( in, n, out, x):p4denc32( in, n, out, x); - case TB_PFORN128: x = *in++; --n; VBPUT32(out, x); return inc?p4nd1enc128v32( in, n, out, x):p4ndenc128v32(in, n, out, x); + case TB_PFORN128: return out+(inc?p4nd1enc128v32( in, n, out):p4ndenc128v32(in, n, out)); case TB_PACK128V: x = *in++; --n; if(inc) { b = bitd132(in, n, x); VBPUT32(out, x); *out++=b; return n == 128?bitd1pack128v32(in, n, out, x, b):bitd1pack32(in, n, out, x, b); } else { b = bitd32( in, n, x); VBPUT32(out, x); *out++=b; return n == 128?bitdpack128v32( in, n, out, x, b):bitdpack32( in, n, out, x, b); } #ifdef __AVX2__ case TB_PFOR256: x = *in++; bitdelta32( in, --n, pa, x, inc);VBPUT32(out, x); return n == 256?p4enc256v32(pa, n, out ):p4enc32(pa, n, out); - case TB_PFORN256: x = *in++; --n; 
VBPUT32(out, x); return inc?p4nd1enc256v32( in, n, out, x):p4ndenc256v32(in, n, out, x); + case TB_PFORN256: return out+(inc?p4nd1enc256v32( in, n, out):p4ndenc256v32(in, n, out)); /*case TB_PACK256V: x = *in++; --n; if(inc) { b = bitd132(in, n, x); VBPUT32(out, x); *out++=b; return n == 256?bitd1pack256v32(in, n, out, x, b):bitd1pack32(in, n, out, x, b); } else { b = bitd32( in, n, x); VBPUT32(out, x); *out++=b; return n == 256?bitdpack256v32( in, n, out, x, b):bitdpack32( in, n, out, x, b); }*/ @@ -618,7 +618,7 @@ unsigned char *codcomps(unsigned char *_in, unsigned _n, unsigned char *out, int case TB_PACK: x = *in++; --n; if(inc) { b = bitd132(in, n, x); VBPUT32(out, x); *out++=b; return bitd1pack32(in, n, out, x, b); } else { b = bitd32( in, n, x); VBPUT32(out, x); *out++=b; return bitdpack32( in, n, out, x, b); } - case TB_NPACK: x = *in++; --n; VBPUT32(out, x); return inc?bitnd1pack32(in, n, out, x):bitndpack32( in, n, out, x); + case TB_NPACK: return out+(inc?bitnd1pack32(in, n, out):bitndpack32( in, n, out)); #if C_SIMPLE8B case AM_SIMPLE8B: b = bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); if(b>28) die("simple-8b overflow.bits size>28\n"); return vs8benc( pa, n, out); @@ -731,30 +731,26 @@ unsigned char *coddecomps(unsigned char *in, unsigned _n, unsigned char *_out, i case TB_FOR: VBGET32(in, x);*out = x; b = *in++; return inc?bitf1unpack32( in, n-1, out+1, x, b):bitfunpack32( in, n-1, out+1, x, b); case TB_FORDA: VBGET32(in, x);*out = x; b = *in++; return inc?bitf1unpackx32( in, n-1, out+1, x, b):bitfunpackx32( in, n-1, out+1, x, b); case TB_PACK: VBGET32(in, x);*out = x; b = *in++; return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); - case TB_NPACK: VBGET32(in, x);*out = x; return inc?bitnd1unpack32( in, n-1, out+1, x):bitndunpack32( in, n-1, out+1, x); + case TB_NPACK: return in+(inc?bitnd1unpack32( in, n, out):bitndunpack32( in, n, out)); case TB_ELIASFANO:VBGET32(in, x);*out++ = x; --n; if(inc) { return 
efano1dec32( in, n, out, x+1); } else { return efanodec32( in, n, out, x); } #if C_TURBOPFORV case TB_ELIASFANOV:VBGET32(in, x); *out++ = x; --n; - if(inc) { return n==128?efano1dec128v32(in, n, out, x+1 ):efano1dec32(in, n, out, x+1); } - else { return n==128?efanodec128v32( in, n, out, x ):efanodec32( in, n, out, x); } + if(inc) { return n==128?efano1dec128v32(in, n, out, x+1 ):efano1dec32(in, n, out, x+1); } + else { return n==128?efanodec128v32( in, n, out, x ):efanodec32( in, n, out, x); } case TB_PFOR128: VBGET32(in, x); *out++ = x; --n; //__builtin_prefetch(in+256); - if(inc) { return n==128?p4d1dec128v32( in, n, out, x ):p4d1dec32(in, n, out, x); } - else { return n==128?p4ddec128v32( in, n, out, x ):p4ddec32( in, n, out, x); } - case TB_PFORN128: VBGET32(in, x); *out++ = x; --n; //__builtin_prefetch(in+256); - if(inc) { return p4nd1dec128v32( in, n, out, x ); } - else { return p4nddec128v32( in, n, out, x ); } + if(inc) { return n==128?p4d1dec128v32( in, n, out, x ):p4d1dec32(in, n, out, x); } + else { return n==128?p4ddec128v32( in, n, out, x ):p4ddec32( in, n, out, x); } + case TB_PFORN128: return in+(inc?p4nd1dec128v32(in, n, out):p4nddec128v32( in, n, out)); case TB_PACK128V: VBGET32(in, x);*out = x; b = *in++; - if(n <= 128) { return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in,n-1, out+1, x, b); } - else { return inc?bitd1unpack128v32(in,n, out+1, x, b):bitdunpack128v32(in,n, out+1, x, b); } + if(n <= 128) { return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in,n-1, out+1, x, b); } + else { return inc?bitd1unpack128v32( in, n, out+1, x, b):bitdunpack128v32(in,n, out+1, x, b); } #ifdef __AVX2__ case TB_PFOR256: VBGET32(in, x); *out++ = x; --n; //__builtin_prefetch(in+256); if(inc) { return n==256?p4d1dec256v32( in, n, out, x ):p4d1dec32(in, n, out, x); } else { return n==256?p4ddec256v32( in, n, out, x ):p4ddec32( in, n, out, x); } - case TB_PFORN256: VBGET32(in, x); *out++ = x; --n; - if(inc) { return p4nd1dec256v32( in, n, 
out, x ); } - else { return p4nddec256v32( in, n, out, x ); } + case TB_PFORN256: return in+(inc?p4nd1dec256v32(in, n, out ):p4nddec256v32( in, n, out)); /*case TB_PACK256V: VBGET32(in, x);*out = x; b = *in++; if(n <= 256) return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); else { in = bitunpack256v32( in, out+1, b);bitundn32(out, n, -inc, inc); } break;*/ @@ -880,17 +876,17 @@ unsigned char *codcomp(unsigned char *_in, unsigned _n, unsigned char *out, int case TB_FORDA: case TB_PACK128H: case TB_PACK: if(b < 0) { BITSIZE32(in, n, b); *out++ = b; } return bitpack32(in, n, out, b); - case TB_NPACK: return bitnpack32(in, n, out); + case TB_NPACK: return out+bitnpack32(in, n, out); case TB_PFORDA: return p4encx32( in, n, out); #if C_TURBOPFORV case TB_ELIASFANOV:return out; case TB_PFOR128: return n == 128?p4enc128v32(in, n, out):p4enc32(in, n, out); - case TB_PFORN128: return p4nenc128v32(in, n, out); + case TB_PFORN128: return out+p4nenc128v32(in, n, out); case TB_PACK128V: if(b < 0) { BITSIZE32(in, n, b); *out++ = b; } return n != 128?bitpack32(in, n, out, b):bitpack128v32(in, n, out, b); #ifdef __AVX2__ case TB_PFOR256: return n == 256?p4enc256v32(in, n, out):p4enc32(in, n, out); - case TB_PFORN256: return p4nenc256v32(in, n, out); + case TB_PFORN256: return out+p4nenc256v32(in, n, out); case TB_PACK256V: if(b < 0) { BITSIZE32(in, n, b); *out++ = b; } return n != 256?bitpack32(in, n, out, b):bitpack256v32(in, n, out, b); #endif #endif @@ -1060,13 +1056,13 @@ unsigned char *coddecomp(unsigned char *in, unsigned _n, unsigned char *_out, in case TB_FOR: if(b < 0) b = *in++; return bitfunpack32( in, n, out, 0, b); case TB_FORDA: if(b < 0) b = *in++; return _bitunpackx32( in, n, out, b); case TB_PACK: if(b < 0) b = *in++; return bitunpack32( in, n, out, b); - case TB_NPACK: return bitnunpack32( in, n, out); + case TB_NPACK: return in+bitnunpack32( in, n, out); #if C_TURBOPFORV case TB_PFOR128 : __builtin_prefetch(in+256);return n == 
128?p4dec128v32(in, n, out):p4dec32(in, n, out); - case TB_PFORN128 : return p4ndec128v32(in, n, out); + case TB_PFORN128 : return in+p4ndec128v32(in, n, out); #ifdef __AVX2__ case TB_PFOR256 : __builtin_prefetch(in+256);return n == 256?p4dec256v32(in, n, out):p4dec32(in, n, out); - case TB_PFORN256 : return p4ndec256v32(in, n, out); + case TB_PFORN256 : return in+p4ndec256v32(in, n, out); case TB_PACK256V: if(b < 0) b = *in++; return n != 256?bitunpack32(in, n, out, b):bitunpack256v32(in, n, out, b); #endif case TB_ELIASFANOV: return in; diff --git a/vint.h b/vint.h index 76db3ba..1fe75a5 100644 --- a/vint.h +++ b/vint.h @@ -83,10 +83,12 @@ extern unsigned char _vtab32_[]; #define vbxput64(_op_, _x_) { unsigned long long _x = _x_; _vbxput64(_op_, _x, ;); } #define vbxput32(_op_, _x_) { register unsigned _x = _x_; _vbxput32(_op_, _x, ;); } #define vbxput16(_op_, _x_) vbxput32(_op_, _x_) +#define vbxput8( _op_, _x_) (*_op_++ = _x_) #define vbxget64(_ip_, _x_) _vbxget64(_ip_, _x_, ;) #define vbxget32(_ip_, _x_) _vbxget32(_ip_, _x_, ;) #define vbxget16(_ip_, _x_) vbxget32(_ip_,_x_) +#define vbxget8(_ip_, _x_) (_x_ = *_ip_++) //--------------------------------------------------------------------------- #define VB_SIZE 64 #define VB_MAX 254 @@ -159,10 +161,12 @@ static inline unsigned vbvlen64(unsigned x) { return _vbvlen64(x); } #define vbput64(_op_, _x_) { unsigned long long _x = _x_; _vbput64(_op_, _x, ;); } #define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); } #define vbput16(_op_, _x_) vbput32(_op_, _x_) +#define vbput8(_op_, _x_) (*_op_++ = _x_) #define vbget64(_ip_, _x_) _vbget64(_ip_, _x_, ;) #define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;) #define vbget16(_ip_, _x_) vbget32(_ip_,_x_) +#define vbget8(_ip_, _x_) (_x_ = *_ip_++) //----------------------------- Variable byte: array functions ----------------------------------------------------------------------- // Encoding/DEcoding: Return value = end of compressed output/input buffer 
out/in diff --git a/vp4.h b/vp4.h index fc63c6f..b867f95 100644 --- a/vp4.h +++ b/vp4.h @@ -29,10 +29,59 @@ extern "C" { #endif #include +//************************************************ High level API - n unlimited **************************************************** +// Compress integer array with n values to the buffer out. +// Return value = number of bytes written to compressed buffer out +size_t p4nenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); // SIMD (Vertical bitpacking) +size_t p4nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4ndenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4ndenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); + +size_t p4nd1enc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nd1enc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nd1enc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out); +size_t p4nd1enc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); + 
+// Decompress the compressed n values in input buffer in to the integer array out. +// Return value = number of bytes read from the compressed buffer in +size_t p4ndec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); +size_t p4ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); +size_t p4ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4ndec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); + +// Delta minimum = 0 +size_t p4nddec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); +size_t p4nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); +size_t p4nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4nddec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); +// Delta minimum = 1 +size_t p4nd1dec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); +size_t p4nd1dec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); +size_t p4nd1dec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out); +size_t p4nd1dec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); + + +//************** Low level API - n limited to 128/256 *************************************** #define P4D_MAX 256 -//********************************************** TurboPFor: Encode 
***************************************************************************** +// -------------- TurboPFor: Encode //#include // Low level API: Single block n limited //compress integer array with n values to the buffer out. Return value = end of compressed buffer out @@ -82,29 +131,6 @@ ALWAYS_INLINE unsigned _p4bits16( uint16_t *__restrict in, unsigned n, ALWAYS_INLINE unsigned _p4bits32( uint32_t *__restrict in, unsigned n, unsigned *pbx); ALWAYS_INLINE unsigned _p4bits64( uint64_t *__restrict in, unsigned n, unsigned *pbx); -//----------------------- n unlimited ------------------ -// compress integer array with n values to the buffer out. Return value = end of compressed buffer out -unsigned char *p4nenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *p4nenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *p4nenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *p4nenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); // SIMD (Vertical bitpacking) -unsigned char *p4nenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out); -unsigned char *p4nenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out); - -unsigned char *p4ndenc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out, uint8_t start); -unsigned char *p4ndenc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4ndenc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4ndenc128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start); // SIMD (Vertical bitpacking) -unsigned char *p4ndenc256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4ndenc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out, uint64_t start); - -unsigned 
char *p4nd1enc8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out, uint8_t start); -unsigned char *p4nd1enc16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out, uint16_t start); -unsigned char *p4nd1enc32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4nd1enc128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start); // SIMD (Vertical bitpacking) -unsigned char *p4nd1enc256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out, uint32_t start); -unsigned char *p4nd1enc64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out, uint64_t start); - #define P4EB(_b_) (_b_ << 1) #define P4EBX(_b_, _bx_) (_bx_ << 8 | _b_ << 1 | 1) #define P4SAVE(_out_, _b_, _bx_) do { if(!_bx_) *_out_++ = P4EB(_b_);else *(uint16_t *)_out_ = P4EBX(_b_, _bx_), _out_ += 2; } while(0) @@ -162,29 +188,6 @@ unsigned char *p4d1dec128v32( unsigned char *__restrict in, unsigned n, uint32_t unsigned char *p4d1dec256v32( unsigned char *__restrict in, unsigned n, uint32_t *__restrict out, uint32_t start); unsigned char *p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); -//************************************************ n unlimitred ****************************************************************************************** -unsigned char *p4ndec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out); -unsigned char *p4ndec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out); -unsigned char *p4ndec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -unsigned char *p4ndec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); // SIMD (Vertical BitPacking) -unsigned char *p4ndec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out); -unsigned char *p4ndec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out); - -// Delta 
minimum = 0 -unsigned char *p4nddec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out, uint8_t start); -unsigned char *p4nddec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out, uint16_t start); -unsigned char *p4nddec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4nddec128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start); // SIMD (Vertical BitPacking) -unsigned char *p4nddec256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4nddec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out, uint64_t start); -// Delta minimum = 1 -unsigned char *p4nd1dec8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out, uint8_t start); -unsigned char *p4nd1dec16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out, uint16_t start); -unsigned char *p4nd1dec32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4nd1dec128v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start); // SIMD (Vertical BitPacking) -unsigned char *p4nd1dec256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out, uint32_t start); -unsigned char *p4nd1dec64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out, uint64_t start); - //---------------- Direct Access functions to compressed TurboPFor array p4encx16/p4encx32 ------------------------------------------------------- #ifndef NTURBOPFOR_DAC #define P4D_PAD8(_x_) ( (((_x_)+8-1)/8) ) diff --git a/vp4c.c b/vp4c.c index 40db731..1a7b1d1 100644 --- a/vp4c.c +++ b/vp4c.c @@ -290,18 +290,20 @@ unsigned char *TEMPLATE2(P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsign return TEMPLATE2(_P4ENC, USIZE)(in, n, out, b, bx); } -unsigned char *TEMPLATE2(P4NENC, USIZE)(uint_t *__restrict in, size_t n, unsigned char *__restrict out) { +size_t 
TEMPLATE2(P4NENC, USIZE)(uint_t *__restrict in, size_t n, unsigned char *__restrict out) { + if(!n) return 0; + unsigned char *op = out; uint_t *ip; for(ip = in; ip != in+(n&~(CSIZE-1)); ip += CSIZE) { __builtin_prefetch(ip+512); unsigned bx, b = TEMPLATE2(_p4bits, USIZE)(ip, CSIZE, &bx); #if EXCEP > 0 - if(bx <= USIZE) { P4SAVE(out, b, bx); } else *out++= 0x80|b<<1; + if(bx <= USIZE) { P4SAVE(op, b, bx); } else *op++= 0x80|b<<1; #else - P4SAVE(out, b, bx); + P4SAVE(op, b, bx); #endif - out = TEMPLATE2(_P4ENC, USIZE)(ip, CSIZE, out, b, bx); // out = TEMPLATE2(P4ENC, USIZE)(ip, CSIZE, out); + op = TEMPLATE2(_P4ENC, USIZE)(ip, CSIZE, op, b, bx); // op = TEMPLATE2(P4ENC, USIZE)(ip, CSIZE, op); } - return TEMPLATE2(p4enc, USIZE)(ip, n&(CSIZE-1), out); + return TEMPLATE2(p4enc, USIZE)(ip, n&(CSIZE-1), op) - out; } #else ALWAYS_INLINE unsigned char *TEMPLATE2(P4DENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, uint_t start) { if(!n) return out; @@ -310,21 +312,25 @@ ALWAYS_INLINE unsigned char *TEMPLATE2(P4DENC, USIZE)(uint_t *__restrict in, uns return TEMPLATE2(P4ENC, USIZE)(_in, n, out); } -unsigned char *TEMPLATE2(P4NENC, USIZE)(uint_t *__restrict in, size_t n, unsigned char *__restrict out, uint_t start) { - uint_t *ip; - for(ip = in; ip != in+(n&~(CSIZE-1)); ip += CSIZE) { __builtin_prefetch(ip+512); +size_t TEMPLATE2(P4NENC, USIZE)(uint_t *__restrict in, size_t n, unsigned char *__restrict out) { /* delta variant: result is the end pointer returned by P4NENCS minus out, i.e. a byte count */ + if(!n) return 0; /* size_t byte count, not a pointer: empty input writes nothing */ + unsigned char *op = out; + uint_t *ip, start = *in++; + + TEMPLATE2(vbxput, USIZE)(op, start); /* first input value is handed to vbxput with the output cursor; it also seeds bitdelta for the first block */ + for(ip = in, --n; ip != in+(n&~(CSIZE-1)); ip += CSIZE) { __builtin_prefetch(ip+512); uint_t _in[P4D_MAX+8]; TEMPLATE2(bitdelta, USIZE)(ip, CSIZE, _in, start, P4DELTA); unsigned bx, b = TEMPLATE2(_p4bits, USIZE)(_in, CSIZE, &bx); #if EXCEP > 0 - if(bx <= USIZE) { P4SAVE(out, b, bx); } else *out++= 0x80|b<<1; + if(bx <= USIZE) { P4SAVE(op, b, bx); } else *op++= 0x80|b<<1; #else - P4SAVE(out, b, bx); + P4SAVE(op, b, bx); #endif 
- out = TEMPLATE2(_P4ENC, USIZE)(_in, CSIZE, out, b, bx); // out = TEMPLATE2(P4ENC, USIZE)(_in, CSIZE, out); + op = TEMPLATE2(_P4ENC, USIZE)(_in, CSIZE, op, b, bx); // op = TEMPLATE2(P4ENC, USIZE)(_in, CSIZE, op); start = ip[CSIZE-1]; } - return TEMPLATE2(P4NENCS, USIZE)(ip, n&(CSIZE-1), out, start); + return TEMPLATE2(P4NENCS, USIZE)(ip, n&(CSIZE-1), op, start) - out; } #endif diff --git a/vp4d.c b/vp4d.c index 01ffc05..6f1f7a8 100644 --- a/vp4d.c +++ b/vp4d.c @@ -61,6 +61,7 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define P4DELTA(a) #define P4DELTA_(a) +#undef DELTA #define _P4DEC _p4dec #define P4DEC p4dec @@ -87,6 +88,7 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define P4DELTA(a) ,a #define P4DELTA_(a) a +#define DELTA #define _P4DEC _p4ddec //delta0 #define P4DEC p4ddec @@ -139,6 +141,8 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define VSIZE 128 #define P4DELTA(a) #define P4DELTA_(a) +#undef DELTA + #define _P4DEC _p4dec128v #define P4DEC p4dec128v #define P4NDEC p4ndec128v @@ -150,6 +154,8 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define P4DELTA(a) ,a #define P4DELTA_(a) a +#define DELTA + #define _P4DEC _p4ddec128v #define P4DEC p4ddec128v #define P4NDEC p4nddec128v @@ -169,11 +175,13 @@ static ALIGNED(char, shuffles[16][16], 16) = { #include "vp4d.c" #undef BITUNDD #undef P4DELTA +#undef DELTA #endif #ifdef __AVX2__ #define P4DELTA(a) #define P4DELTA_(a) +#undef DELTA #define VSIZE 256 #define _P4DEC _p4dec256v #define P4DEC p4dec256v @@ -186,6 +194,7 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define P4DELTA(a) ,a #define P4DELTA_(a) a +#define DELTA #define _P4DEC _p4ddec256v #define P4DEC p4ddec256v #define P4NDEC p4nddec256v @@ -303,40 +312,48 @@ unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, #define CSIZE 128 #endif -unsigned char *TEMPLATE2(P4NDEC, USIZE)(unsigned char *__restrict in, size_t n, uint_t *__restrict out P4DELTA(uint_t start) ) { - uint_t *op; - for(op = out; op != 
out+(n&~(CSIZE-1)); op += CSIZE) { __builtin_prefetch(in+512); - unsigned b = *in++,bx,i; +size_t TEMPLATE2(P4NDEC, USIZE)(unsigned char *__restrict in, size_t n, uint_t *__restrict out) { /* decodes n values into out; result is the position returned by P4NDECS minus in */ + if(!n) return 0; /* nothing to decode, no input consumed */ + unsigned char *ip = in; + uint_t *op; + #ifdef DELTA + uint_t start; + TEMPLATE2(vbxget, USIZE)(ip, start); /* delta builds only: start comes from the stream (counterpart of the encoder's vbxput) and becomes the first output value */ + *out++ = start; + --n; /* the value just stored counts toward n */ + #endif + for(op = out; op != out+(n&~(CSIZE-1)); op += CSIZE) { __builtin_prefetch(ip+512); + unsigned b = *ip++,bx,i; if(likely(!(b & 0x80))) { if(b & 1) - bx = *in++; - in = TEMPLATE2(_P4DEC, USIZE)(in, CSIZE, op P4DELTA(start), b, bx ); + bx = *ip++; + ip = TEMPLATE2(_P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start), b, bx ); } #if USIZE > 8 else { uint_t ex[P4D_MAX+8]; b = (b & 0x7f)>>1; - bx = *in++; - in = TEMPLATE2(BITUNPACK, USIZE)(in, CSIZE, op, b); - in = TEMPLATE2(vbdec, USIZE)(in, bx, ex); + bx = *ip++; + ip = TEMPLATE2(BITUNPACK, USIZE)(ip, CSIZE, op, b); + ip = TEMPLATE2(vbdec, USIZE)(ip, bx, ex); for(i = 0; i != (bx & ~3); i += 4) { - op[in[i ]] |= ex[i ] << b; - op[in[i+1]] |= ex[i+1] << b; - op[in[i+2]] |= ex[i+2] << b; - op[in[i+3]] |= ex[i+3] << b; + op[ip[i ]] |= ex[i ] << b; + op[ip[i+1]] |= ex[i+1] << b; + op[ip[i+2]] |= ex[i+2] << b; + op[ip[i+3]] |= ex[i+3] << b; } for(;i < bx; i++) - op[in[i]] |= ex[i] << b; - in += bx; + op[ip[i]] |= ex[i] << b; + ip += bx; #ifdef BITUNDD TEMPLATE2(BITUNDD, USIZE)(op, CSIZE, start); #endif - } // in = TEMPLATE2(P4DEC, USIZE)(in, CSIZE, op P4DELTA(start)); + } // ip = TEMPLATE2(P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start)); #endif P4DELTA_(start = op[CSIZE-1]); } - return TEMPLATE2(P4NDECS, USIZE)(in, n&(CSIZE-1), op P4DELTA(start)); + return TEMPLATE2(P4NDECS, USIZE)(ip, n&(CSIZE-1), op P4DELTA(start)) - in; } #ifdef P4DECX