From 66248a2f49ee2f3b0fdfbeede39239f85fda1b18 Mon Sep 17 00:00:00 2001 From: x Date: Fri, 10 Mar 2023 20:28:32 +0100 Subject: [PATCH] TurboPFor: Bit UnPacking --- bitunpack.c | 319 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 247 insertions(+), 72 deletions(-) diff --git a/bitunpack.c b/bitunpack.c index cf3e184..5708ef4 100644 --- a/bitunpack.c +++ b/bitunpack.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2019 + Copyright (C) powturbo 2013-2023 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -22,25 +22,25 @@ - email : powturbo [_AT_] gmail [_DOT_] com **/ // "Integer Compression" Bit Packing -#define BITUTIL_IN -#define VINT_IN -#include "conf.h" -#include "bitutil.h" -#include "bitpack.h" -#include "vint.h" - -#define PAD8(_x_) (((_x_)+7)/8) - #pragma warning( disable : 4005) #pragma warning( disable : 4090) #pragma warning( disable : 4068) +#include "include_/conf.h" +#include "include_/bitpack.h" +#include "include_/bitutil.h" +#include "include_/vlcbyte.h" + +#include "include_/bitutil_.h" + +#define PAD8(_x_) (((_x_)+7)/8) + #pragma GCC push_options #pragma GCC optimize ("align-functions=16") #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunsequenced" -#if !defined(SSE2_ON) && !defined(AVX2_ON) //----------------------------------- Plain ------------------------------------------------------------------------------------------- +#ifndef __AVX2__ //----------------------------------- Plain ------------------------------------------------------------------------------------------- typedef unsigned char *(*BITUNPACK_F8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out); typedef unsigned char *(*BITUNPACK_D8)( const unsigned char *__restrict in, unsigned n, uint8_t *__restrict out, uint8_t start); typedef unsigned char *(*BITUNPACK_F16)(const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out); @@ -58,39 +58,51 @@ typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsi #define OPX(_op_) _op_ += 32 #endif +//-- bitpack ------------- #define OPI(_op_,_nb_,_parm_) OPX(_op_) #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = _w_ #define _BITUNPACK_ bitunpack #include "bitunpack_.h" -#define DELTA +#define DELTA +//-- bitunpack delta ------------- #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += (_w_)) #define _BITUNPACK_ bitdunpack // delta + 0 #include "bitunpack_.h" -#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += TEMPLATE2(zigzagdec, USIZE)(_w_)) +//-- bitunpack zigzag ----------- +#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += T2(zigzagdec, USIZE)(_w_)) #define _BITUNPACK_ bitzunpack // zigzag #include "bitunpack_.h" +//-- bitunpack xor ----------- +#define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ ^= (_w_)) +#define _BITUNPACK_ bitxunpack // xor +#include "bitunpack_.h" + +//-- bitunpack FOR ----------- #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ + (_w_)) #define _BITUNPACK_ bitfunpack // for #include "bitunpack_.h" +//-- bitunpack delta 1 ----------- #define OPI(_op_,_nb_,_parm_) OPX(_op_); _parm_ += 32 #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = (_parm_ += (_w_)) + (_x_+1) #define _BITUNPACK_ bitd1unpack // delta + 1 #include "bitunpack_.h" +//-- bitunpack FOR 1 ----------- #define OUT( _op_, _x_, _w_, _nb_,_parm_) OP(_op_,_x_) = _parm_ + (_w_)+(_x_+1) #define _BITUNPACK_ bitf1unpack // for + 1 #include "bitunpack_.h" #undef OPI +//------------------------------------------------------- bitnunpack ---------------------------------------------------------- #define BITNUNPACK(in, n, out, _csize_, _usize_) {\ unsigned char *ip = in;\ - for(op = out,out+=n; op < out;) { unsigned oplen = out - op,b; if(oplen > _csize_) oplen = _csize_; PREFETCH(ip+512,0);\ - b = *ip++; ip = TEMPLATE2(bitunpacka, _usize_)[b](ip, oplen, op);\ + for(op = out,out+=n; op < out;) { unsigned oplen = out - op,b; if(oplen > _csize_) oplen = _csize_; /*PREFETCH(ip+512,0);*/\ + b = *ip++; ip = T2(bitunpacka, _usize_)[b](ip, oplen, op);\ op += oplen;\ } \ return ip - in;\ @@ -98,10 +110,10 @@ typedef unsigned char *(*BITUNPACK_D64)(const unsigned char *__restrict in, unsi #define BITNDUNPACK(in, n, out, _csize_, _usize_, _bitunpacka_) { if(!n) return 0;\ unsigned char *ip = in;\ - TEMPLATE2(vbxget, _usize_)(ip, start);\ - for(*out++ = start,--n,op = out; op != out+(n&~(_csize_-1)); ) { PREFETCH(ip+512,0);\ - unsigned b = *ip++; ip = TEMPLATE2(_bitunpacka_, _usize_)[b](ip, _csize_, op, start); op += _csize_; start = op[-1];\ - } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(_bitunpacka_, _usize_)[b](ip, n, op, start); }\ + T2(vbxget, _usize_)(ip, start);\ + for(*out++ = start,--n,op = out; op != out+(n&~(_csize_-1)); ) { /*PREFETCH(ip+512,0);*/\ + unsigned b = *ip++; ip = T2(_bitunpacka_, _usize_)[b](ip, _csize_, op, start); op += _csize_; start = op[-1];\ + } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(_bitunpacka_, _usize_)[b](ip, n, op, start); }\ return ip - in;\ } @@ -125,28 +137,33 @@ size_t bitnzunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restri size_t bitnzunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitzunpacka); } size_t bitnzunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitzunpacka); } +size_t bitnxunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op,start; BITNDUNPACK(in, n, out, 128, 8, bitxunpacka); } +size_t bitnxunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitxunpacka); } +size_t bitnxunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitxunpacka); } +size_t bitnxunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitxunpacka); } + size_t bitnfunpack8( unsigned char *__restrict in, size_t n, uint8_t *__restrict out) { uint8_t *op,start; BITNDUNPACK(in, n, out, 128, 8, bitfunpacka); } size_t bitnfunpack16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; BITNDUNPACK(in, n, out, 128, 16, bitfunpacka); } size_t bitnfunpack32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; BITNDUNPACK(in, n, out, 128, 32, bitfunpacka); } size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op,start; BITNDUNPACK(in, n, out, 128, 64, bitfunpacka); } - -#else //-------------------------------------------- SSE/AVX2 --------------------------------------------------------------------------------------- + #endif +//#else //-------------------------------------------- SSE/AVX2 --------------------------------------------------------------------------------------- #define _BITNUNPACKV(in, n, out, _csize_, _usize_, _bitunpackv_) {\ unsigned char *ip = in;\ - for(op = out; op != out+(n&~(_csize_-1)); op += _csize_) { PREFETCH(in+512,0);\ - unsigned b = *ip++; ip = TEMPLATE2(_bitunpackv_, _usize_)(ip, _csize_, op,b);\ - } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(bitunpack, _usize_)(ip, n, op,b); }\ + for(op = out; op != out+(n&~(_csize_-1)); op += _csize_) { /*PREFETCH(in+512,0);*/\ + unsigned b = *ip++; ip = T2(_bitunpackv_, _usize_)(ip, _csize_, op,b);\ + } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(bitunpack, _usize_)(ip, n, op,b); }\ return ip - in;\ } #define _BITNDUNPACKV(in, n, out, _csize_, _usize_, _bitunpackv_, _bitunpack_) { if(!n) return 0;\ unsigned char *ip = in;\ - TEMPLATE2(vbxget, _usize_)(ip, start); \ + T2(vbxget, _usize_)(ip, start); \ *out++ = start;\ - for(--n,op = out; op != out+(n&~(_csize_-1)); ) { PREFETCH(ip+512,0);\ - unsigned b = *ip++; ip = TEMPLATE2(_bitunpackv_, _usize_)(ip, _csize_, op, start,b); op += _csize_; start = op[-1];\ - } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = TEMPLATE2(_bitunpack_, _usize_)(ip, n, op, start,b); }\ + for(--n,op = out; op != out+(n&~(_csize_-1)); ) { /*PREFETCH(ip+512,0);*/\ + unsigned b = *ip++; ip = T2(_bitunpackv_, _usize_)(ip, _csize_, op, start,b); op += _csize_; start = op[-1];\ + } if(n&=(_csize_-1)) { unsigned b = *ip++; ip = T2(_bitunpack_, _usize_)(ip, n, op, start,b); }\ return ip - in;\ } #ifdef __AVX2__ //-------------------------------- AVX2 ---------------------------------------------------------------------------- @@ -471,38 +488,57 @@ unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, } //--------------------------------------- zeromask unpack for TurboPFor vp4d.c -------------------------------------- + +//-- bitunpack used in vp4d.c --------- #define VO32(_op_, _i_, _ov_, _nb_,_parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) )); pex += popcnt32(xm) #define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm) #define BITUNPACK0(_parm_) #include "bitunpack_.h" unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + const unsigned char *ip = in+PAD8(256*b); + unsigned xm; + __m256i zv = _mm256_setzero_si256(), sv, + tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip; } +//-- bitunpack zigzag --------- #define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_) #define VO32(_op_, i, _ov_, _nb_,_sv_) _ov_ = mm256_zzagd_epi32(_ov_); _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_) #include "bitunpack_.h" #define BITUNPACK0(_parm_) unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(256*b); - __m256i sv = _mm256_set1_epi32(start);//, zv = _mm256_setzero_si256(); + __m256i sv = _mm256_set1_epi32(start); //, zv = _mm256_setzero_si256(); BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip; } +//-- bitunpack xor --------- +#define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_) +#define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_xord_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_) +#include "bitunpack_.h" +#define BITUNPACK0(_parm_) +unsigned char *bitxunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv = _mm256_set1_epi32(start); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +//-- bitunpack delta --------- #define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_scan_epi32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_) #include "bitunpack_.h" #define BITUNPACK0(_parm_) unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(256*b); - __m256i sv = _mm256_set1_epi32(start);// zv = _mm256_setzero_si256(); + __m256i sv = _mm256_set1_epi32(start); // zv = _mm256_setzero_si256(); BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip; } +//-- bitunpack FOR --------- #define VO32( _op_, _i_, _ov_, _nb_,_parm_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, sv)) #include "bitunpack_.h" #define BITUNPACK0(_parm_) @@ -512,7 +548,8 @@ unsigned char *bitfunpack256v32( const unsigned char *__restrict in, unsigned n, BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip; } -//----------------------------------------------------------------------------- + +//-- bitunpack delta used in vp4d.c --------- #define VX32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) ); pex += popcnt32(xm) #define VXZ32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm) @@ -527,6 +564,7 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n return (unsigned char *)ip; } +//-- bitunpack zigag used in vp4d.c --------- #define VX32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), _nb_) ); pex += popcnt32(xm) #define VXZ32(_i_, _nb_,_ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm) @@ -553,12 +591,14 @@ unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { const unsigned char *ip = in+PAD8(256*b); unsigned xm; - const __m256i zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + const __m256i zv = _mm256_setzero_si256(), + tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); __m256i sv = _mm256_set1_epi32(start); BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip; } +//-- bitunpack delta 1 ----------------------------- #define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); #define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv) #include "bitunpack_.h" @@ -566,11 +606,13 @@ unsigned char *_bitzunpack256v32( const unsigned char *__restrict in, unsigned n unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(256*b); const __m256i zv = _mm256_setzero_si256(); - __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1); + __m256i sv = _mm256_set1_epi32(start), + cv = _mm256_set_epi32(8,7,6,5,4,3,2,1); BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip; } +//-- bitunpack FOR 1 ----------------------------- #define VO32( _op_, _i_, _ov_, _nb_,_sv_) _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _sv_)); _sv_ = _mm256_add_epi32(_sv_, cv) #define VOZ32(_op_, _i_, ov, _nb_,_sv_) _mm256_storeu_si256(_op_++, _sv_); _sv_ = _mm256_add_epi32(_sv_, cv); #include "bitunpack_.h" @@ -583,24 +625,31 @@ unsigned char *bitf1unpack256v32( const unsigned char *__restrict in, unsigned n return (unsigned char *)ip; } +//-- bitunpack delta 1 for vp4d.c ----------------------------- #define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); #define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) VXZ32(_i_, _nb_,_ov_); _sv_ = mm256_scani_epi32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); #include "bitunpack_.h" #define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128() unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(256*b); unsigned xm; - const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1), zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + const unsigned char *ip = in+PAD8(256*b); + unsigned xm; + const __m256i cv = _mm256_set_epi32(8,7,6,5,4,3,2,1), + zv = _mm256_setzero_si256(), + tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); __m256i sv = _mm256_set1_epi32(start); BITUNPACK256V32(in, b, out, sv); return (unsigned char *)ip; } +//---------------------------------------------------- bitnunpack --------------------------------------------------------------------- size_t bitnunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op; _BITNUNPACKV( in, n, out, 256, 32, bitunpack256v); } size_t bitndunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitdunpack256v, bitdunpack); } size_t bitnd1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitd1unpack256v, bitd1unpack); } //size_t bitns1unpack256v32(unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bits1unpack256v, bitd1unpack); } size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitzunpack256v, bitzunpack); } +size_t bitnxunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitxunpack256v, bitxunpack); } size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitfunpack256v, bitfunpack); } + #elif defined(__SSE2__) || defined(__ARM_NEON) //------------------------------ SSE2/SSSE3 --------------------------------------------------------- #define BITMAX16 16 #define BITMAX32 32 @@ -1001,15 +1050,30 @@ ALIGNED(char, _shuffle_16[256][16],16) = { #define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128() #include "bitunpack_.h" +//--- bitunpack for vp4d.c ------------------------------ unsigned char *_bitunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b, unsigned short *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv; BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; + __m128i sv; + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } + unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv; BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; + __m128i sv; + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; } + unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *_in=in; unsigned *_out=out, m; __m128i sv; - BITUNPACK128V32(in, b, out, sv); out = _out+128; in=_in+PAD8(128*b); + const unsigned char *_in=in; + unsigned *_out=out, m; + __m128i sv; + BITUNPACK128V32(in, b, out, sv); + out = _out+128; + in = _in+PAD8(128*b); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)_in+PAD8(256*b); } @@ -1023,19 +1087,23 @@ unsigned char *_bitunpack256w32( const unsigned char *__restrict in, unsigned n, #include "bitunpack_.h" unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, uint32_t *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i zv = _mm_setzero_si128(); BITUNPACK128V32(in, b, out, 0); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; + __m128i zv = _mm_setzero_si128(); + BITUNPACK128V32(in, b, out, 0); + return (unsigned char *)ip; } -#define BITMAX16 16 -#define BITMAX32 32 - #undef VO32 #undef VOZ32 #undef VO16 #undef VOZ16 #undef BITUNPACK0 +//-------------------------------------------------------------------------------------------------------------------------------------------- +#define BITMAX16 16 +#define BITMAX32 32 -//------------------------------------------------------------------- +//--- bitunpack zigzag -------------------- #define VOZ16(_op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_) #define VOZ32(_op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_) #define VO16( _op_, _i_, _ov_, _nb_,_sv_) _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_) @@ -1043,38 +1111,77 @@ unsigned char *_bitunpack128v64( const unsigned char *__restrict in, unsigned n, #include "bitunpack_.h" #define BITUNPACK0(_parm_) unsigned char *bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; -} -unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi16(start); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } +unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi32(start); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} + +#define VO16( _op_, _i_, _ov_, _nb_,_sv_) _sv_ = mm_xord_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_) +#define VO32( _op_, _i_, _ov_, _nb_,_sv_) _sv_ = mm_xord_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_) +#include "bitunpack_.h" +#define BITUNPACK0(_parm_) +unsigned char *bitxunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi16(start); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; +} + +unsigned char *bitxunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi32(start); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} + +//-- bitunpack delta ------------------------------ #define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_) #define VO16(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_) #include "bitunpack_.h" #define BITUNPACK0(_parm_) unsigned char *bitdunpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi16(start); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } + unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; } +//-- bitunpack FOR ---------------------------- #define VO32( _op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, sv)) #define VO16( _op_, _i_, _ov_, _nb_,_parm_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, sv)) #include "bitunpack_.h" #define BITUNPACK0(_parm_) unsigned char *bitfunpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi16(start); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } + unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi32(start); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; } #if defined(__SSSE3__) || defined(__ARM_NEON) #define BITMAX16 15 #define BITMAX32 31 +//-- bitunpack delta used in vp4d.c --------- #define VX32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8( mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m) #define VXZ32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_32[m])); pex += popcnt32(m) #define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); _sv_ = mm_scan_epi32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); @@ -1087,10 +1194,18 @@ unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, #include "bitunpack_.h" #define BITUNPACK0(_parm_) unsigned char *_bitdunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; + __m128i sv = _mm_set1_epi16(start); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } + unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; __m128i sv = _mm_set1_epi32(start); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; } /* @@ -1100,6 +1215,7 @@ unsigned char *_bitdunpack128v64( const unsigned char *__restrict in, unsigned n const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start),zv = _mm_setzero_si128(); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; }*/ +//-- bitunpack zigzag used in vp4d.c -------------------------- #define VX16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8( mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ) ); pex += popcnt32(m) #define VXZ16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m]) ); pex += popcnt32(m) #define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_); _ov_ = mm_zzagd_epi16(_ov_); _sv_ = mm_scan_epi16(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); @@ -1113,29 +1229,47 @@ unsigned char *_bitdunpack128v64( const unsigned char *__restrict in, unsigned n #include "bitunpack_.h" #define BITUNPACK0(_parm_) unsigned char *_bitzunpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; __m128i sv = _mm_set1_epi16(start); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } unsigned char *_bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; __m128i sv = _mm_set1_epi32(start); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; } #define BITMAX16 16 #define BITMAX32 32 #endif +//-- bitunpack delta 1 ------------------------------ #define VO16(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scani_epi16(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); #define VO32(_op_, i, _ov_, _nb_,_sv_) _sv_ = mm_scani_epi32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); #define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv) #define VOZ32(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv) #include "bitunpack_.h" #define BITUNPACK0(_parm_) _parm_ = _mm_add_epi16(_parm_, cv); cv = _mm_set1_epi16(8) + unsigned char *bitd1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start), cv = _mm_set_epi16(8,7,6,5,4,3,2,1); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; -} -#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4) -unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi16(start), + cv = _mm_set_epi16(8,7,6,5,4,3,2,1); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } +#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4) +unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi32(start), + cv = _mm_set_epi32(4,3,2,1); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} + +//-- bitunpack sub 1 ------------------------------ #define VO16(_op_, i, _ov_, _nb_,_sv_) ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); #define VO32(_op_, i, _ov_, _nb_,_sv_) ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); #define VOZ16(_op_, _i_, ov, _nb_,_parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi16(_parm_, cv) @@ -1143,29 +1277,48 @@ unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n #include "bitunpack_.h" #define BITUNPACK0(_parm_) _parm_ = _mm_add_epi16(_parm_, cv); cv = _mm_set1_epi16(8) unsigned char *bits1unpack128v16( const unsigned char *__restrict in, unsigned n, uint16_t *__restrict out, uint16_t start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; -} -#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4) -unsigned char *bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi16(start), + cv = _mm_set1_epi16(8); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } +#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4) +unsigned char *bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set1_epi32(start), + cv = _mm_set1_epi32(4); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; +} + +//-- bitunpack FOR 1 ------------------ #define VO16( _op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _mm_add_epi16(_ov_, _sv_)); _sv_ = _mm_add_epi16(_sv_, cv) #define VO32( _op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _sv_)); _sv_ = _mm_add_epi32(_sv_, cv) #define VOZ32(_op_, _i_, _ov_, _nb_,_sv_) _mm_storeu_si128(_op_++, _sv_); _sv_ = _mm_add_epi32(_sv_, cv); #include "bitunpack_.h" #define BITUNPACK0(_parm_) unsigned char *bitf1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set_epi16(start+8,start+7,start+6,start+5,start+4,start+3,start+2,start+1), + cv = _mm_set1_epi16(8); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set_epi32(start+4,start+3,start+2,start+1), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + __m128i sv = _mm_set_epi32(start+4,start+3,start+2,start+1), + cv = _mm_set1_epi32(4); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; } #if defined(__SSSE3__) || defined(__ARM_NEON) #define BITMAX16 15 #define BITMAX32 31 +//-- bitunpack delta 1 for vp4d.c ----------------------- #define VX16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_add_epi16(_ov_, _mm_shuffle_epi8( mm_slli_epi16(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_16[m]))); pex += popcnt32(m) #define VX32(_i_, _nb_,_ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8( mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), _nb_), _mm_loadu_si128((__m128i*)_shuffle_32[m]))); pex += popcnt32(m) #define VXZ16(_i_, _nb_,_ov_) m = *bb++; _ov_ = _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_loadu_si128((__m128i*)_shuffle_16[m])); pex += popcnt32(m) @@ -1179,13 +1332,20 @@ unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n #include "bitunpack_.h" #define BITUNPACK0(_parm_) mv = _mm_setzero_si128() //_parm_ = _mm_setzero_si128() unsigned char *_bitd1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start), cv = _mm_set_epi16(8,7,6,5,4,3,2,1); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; + __m128i sv = _mm_set1_epi16(start), + cv = _mm_set_epi16(8,7,6,5,4,3,2,1); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } + #define BITUNPACK0(_parm_) mv = _mm_setzero_si128() unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32( 4,3,2,1); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; } +//-- bitunpack sub 1 ----------------------- #define VO16( _op_, _i_, _ov_, _nb_,_sv_) VX16( _i_, _nb_,_ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); #define VOZ16(_op_, _i_, _ov_, _nb_,_sv_) VXZ16( _i_, _nb_,_ov_); ADDI16x8(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); #define VO32( _op_, _i_, _ov_, _nb_,_sv_) VX32( _i_, _nb_,_ov_); ADDI32x4(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); @@ -1194,16 +1354,28 @@ unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned #include "bitunpack_.h" #define BITUNPACK0(_parm_) mv = _mm_setzero_si128() //_parm_ = _mm_setzero_si128() unsigned char *_bits1unpack128v16( const unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start, unsigned b, unsigned short *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITUNPACK128V16(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; + __m128i sv = _mm_set1_epi16(start), + cv = _mm_set1_epi16(8); + BITUNPACK128V16(in, b, out, sv); + return (unsigned char *)ip; } + #define BITUNPACK0(_parm_) mv = _mm_setzero_si128() unsigned char *_bits1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; + const unsigned char *ip = in+PAD8(128*b); + unsigned m; + __m128i sv = _mm_set1_epi32(start), + cv = _mm_set1_epi32(4); + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; } #define BITMAX16 16 #define BITMAX32 32 #endif +//--------------------------------------------------- bitnunpack ------------------------------------------------------------------------------------------------------------------ size_t bitnunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op; _BITNUNPACKV( in, n, out, 128, 16, bitunpack128v); } size_t bitnunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op; _BITNUNPACKV( in, n, out, 128, 32, bitunpack128v); } size_t bitnunpack128v64( unsigned char *__restrict in, size_t n, uint64_t *__restrict out) { uint64_t *op; _BITNUNPACKV( in, n, out, 128, 64, bitunpack128v); } @@ -1221,11 +1393,14 @@ size_t bitns1unpack128v32(unsigned char *__restrict in, size_t n, uint32_t *__re size_t bitnzunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitzunpack128v, bitzunpack); } size_t bitnzunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitzunpack128v, bitzunpack); } +size_t bitnxunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitxunpack128v, bitxunpack); } +size_t bitnxunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitxunpack128v, bitxunpack); } + size_t bitnfunpack128v16( unsigned char *__restrict in, size_t n, uint16_t *__restrict out) { uint16_t *op,start; _BITNDUNPACKV(in, n, out, 128, 16, bitfunpack128v, bitfunpack); } size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 128, 32, bitfunpack128v, bitfunpack); } #endif -#endif +//#endif #pragma clang diagnostic pop #pragma GCC pop_options