diff --git a/vp4dc.c b/vp4dc.c deleted file mode 100644 index 895d7f7..0000000 --- a/vp4dc.c +++ /dev/null @@ -1,183 +0,0 @@ -/** - Copyright (C) powturbo 2013-2016 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Integer Compression" Turbo PforDelta - #ifndef USIZE -#include - -#include "conf.h" -#include "bitpack.h" -#include "vp4dc.h" - -#define PAD8(__x) ( (((__x)+8-1)/8) ) -//------------------------------------------ -#define P4DSIZE 128 //64 // - -#define P4D p4d -#define P4DENC p4denc -#define P4DNENC p4dnenc -#define P4DE p4de - -#define BITPACK bitpack -#define USIZE 32 -#include __FILE__ -#undef USIZE - -#define USIZE 16 -#include __FILE__ -#undef USIZE - -#define USIZE 64 -#include __FILE__ -#undef USIZE - -#undef BITPACK -#undef P4DENC -#undef P4DNENC -#undef P4DE -#undef P4D -//------------------------------------------ -#define P4DENC p4dencv -#define P4DNENC p4dnencv -#define P4DE p4dev -#define BITPACK bitpackv -#define USIZE 32 -#include __FILE__ -#undef USIZE - - #else -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wparentheses" - -#define uint_t TEMPLATE3(uint, USIZE, _t) 
-#define P4DN (P4DSIZE/64) - - #ifdef P4D -//#define MINCALC(_i_) { l = PAD8(n*_i_) + PAD8(x*(bx-_i_)); x += cnt[_i_]; if(unlikely(l < ml)) b = _i_, ml = l; } -#define MINCALC(_i_) { l = PAD8(n*_i_) + PAD8(x*(bx-_i_)); x += cnt[_i_]; unlikely(l < ml)?(ml=l,b=_i_):(ml=ml,b=b); } -/* P4D<USIZE>: choose the bit width b for the bit-packed low parts of in[0..n-1]. Builds a histogram cnt[] of bsr<USIZE>() over the input, takes the width of the OR of all values as bx, then lets MINCALC try every smaller width i: cost l = PAD8(n*i) bytes for the low parts plus PAD8(x*(bx-i)) bytes for the x values counted so far with a larger bsr (the exceptions). Stores the exception width (bx - b) in *pbx, 0 meaning no exceptions, and returns b. */ unsigned TEMPLATE2(P4D, USIZE)(uint_t *__restrict in, unsigned n, unsigned *pbx) { - uint_t *ip,b=0; int i,ml,l; unsigned x, bx, cnt[USIZE+1] = {0}; - - for(ip = in; ip != in+(n&~3); ) { - ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++; - ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++; - ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++; - ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++; - } - while(ip != in+n) ++cnt[TEMPLATE2(bsr, USIZE)(*ip)], b |= *ip++; - b = TEMPLATE2(bsr, USIZE)(b); - - bx = b; ml = PAD8(n*b)+1-2-P4DN*8; x = cnt[b]; /* NOTE(review): the +1-2-P4DN*8 bias presumably accounts for the larger header and the P4DN bitmap words of the exception format - confirm */ - #if USIZE > 32 - for(i = b-1; i >= 0; --i) MINCALC(i); // l = PAD8(n*i) + PAD8(x*(bx-i)); x += cnt[i]; if(unlikely(l < ml)) b = i, ml = l; - #else - switch(b-1) { /* intentional fall-through: entering at b-1 evaluates every width from b-1 down to 0 */ - case 31: MINCALC(31); - case 30: MINCALC(30); - case 29: MINCALC(29); - case 28: MINCALC(28); - case 27: MINCALC(27); - case 26: MINCALC(26); - case 25: MINCALC(25); - case 24: MINCALC(24); - case 23: MINCALC(23); - case 22: MINCALC(22); - case 21: MINCALC(21); - case 20: MINCALC(20); - - case 19: MINCALC(19); - case 18: MINCALC(18); - case 17: MINCALC(17); - case 16: MINCALC(16); - case 15: MINCALC(15); - case 14: MINCALC(14); - case 13: MINCALC(13); - case 12: MINCALC(12); - case 11: MINCALC(11); - case 10: MINCALC(10); - - case 9: MINCALC( 9); - case 8: MINCALC( 8); - case 7: MINCALC( 7); - case 6: MINCALC( 6); - case 5: MINCALC( 5); - case 4: MINCALC( 4); - case 3: MINCALC( 3); - case 2: MINCALC( 2); - case 1: MINCALC( 1); - case 0: MINCALC( 0); - } - #endif - *pbx = bx - b; - return b; -} -#endif - -/* P4DE<USIZE>: encode n values (n must not exceed P4DSIZE, the size of the local buffers) with base width b and exception width bx. With bx == 0 or b == USIZE the block is plain BITPACK output. Otherwise writes P4DN 64-bit bitmap words marking the positions whose value does not fit in b bits, then the high parts (in[c] >> b) bit-packed at bx bits, then all low b-bit parts bit-packed at b bits. Returns the end of the written output. */ unsigned char *TEMPLATE2(P4DE, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx) { unsigned i, xn, c; - if(!bx || b==USIZE) return TEMPLATE2(BITPACK, 
USIZE)(in, n, out, b); - - uint_t msk = (1ull << b)-1,_in[P4DSIZE], inx[P4DSIZE*2]; - unsigned long long xmap[P4DN]; - unsigned miss[P4DSIZE]; - #if P4DN == 2 - xmap[0] = xmap[1] = 0; - #else - for(i = 0; i < P4DN; i++) xmap[i] = 0; - #endif - for(xn = i = 0; i != (n&~3); ) { /* parenthesised: != binds tighter than &, so the former "i != n&~3" was always 0 and this unrolled loop never ran */ - miss[xn] = i; xn += in[i] > msk; _in[i] = in[i] & msk; i++; - miss[xn] = i; xn += in[i] > msk; _in[i] = in[i] & msk; i++; - miss[xn] = i; xn += in[i] > msk; _in[i] = in[i] & msk; i++; - miss[xn] = i; xn += in[i] > msk; _in[i] = in[i] & msk; i++; - } - while(i != n) { miss[xn] = i; xn += in[i] > msk; _in[i] = in[i] & msk; i++; } - for(i = 0; i != xn; ++i) { - c = miss[i]; - xmap[c>>6] |= (1ull << (c&0x3f)); - inx[i] = in[c] >> b; - } - #if P4DN == 2 - *(unsigned long long *)out = xmap[0]; out += 8; - *(unsigned long long *)out = xmap[1]; out += 8; - #else - for(i=0;i < P4DN; i++) { *(unsigned long long *)out = xmap[i]; out += 8; } - #endif - out = TEMPLATE2(bitpack, USIZE)(inx, xn, out, bx); - return TEMPLATE2(BITPACK, USIZE)(_in, n, out, b); -} - -/* P4DENC<USIZE>: pick b and bx with p4d<USIZE>, write them with P4DSAVE, then encode the block with P4DE. Returns the end of the written output. */ unsigned char *TEMPLATE2(P4DENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out) { - unsigned bx,b = TEMPLATE2(p4d, USIZE)(in, n, &bx); - P4DSAVE(out, b, bx); - return TEMPLATE2(P4DE, USIZE)(in, n, out, b, bx); -} - -/* P4DNENC<USIZE>: encode n values as consecutive P4DENC blocks of at most P4DSIZE values each. Returns the end of the written output. */ unsigned char *TEMPLATE2(P4DNENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out) { - uint_t *ip; - for(ip = in; ip < in+n; ip += P4DSIZE) { unsigned l = (in+n) - ip; l = min(l, P4DSIZE); - out = TEMPLATE2(P4DENC, USIZE)(ip, l, out); - } - return out; -} -#pragma clang diagnostic pop - #endif diff --git a/vp4dc.h b/vp4dc.h deleted file mode 100644 index 96e2cac..0000000 --- a/vp4dc.h +++ /dev/null @@ -1,57 +0,0 @@ -/** - Copyright (C) powturbo 2013-2015 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your 
option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// vp4dc.h - "Integer Compression" TurboPfor (see vp4dd.h for decompression) - -// Block header: one byte (b << 1) when bx == 0, otherwise a 16-bit word (bx << 8 | b << 1 | 1) stored through an unsigned short pointer. Macro arguments are parenthesised so that expression arguments expand correctly. -#define P4DEB(__b) ((__b) << 1) -#define P4DEBX(__b, __bx) ((__bx) << 8 | (__b) << 1 | 1) -#define P4DSAVE(__out, __b, __bx) do { if(!(__bx)) *(__out)++ = P4DEB(__b);else *(unsigned short *)(__out) = P4DEBX(__b, __bx), (__out) += 2; } while(0) - -#ifdef __cplusplus -extern "C" { -#endif - -// compress integer array with n values to the buffer out. Return value = end of compressed buffer out -unsigned char *p4denc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4denc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *p4denc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); - -// SIMD compress integer array with n values to the buffer out. Return value = end of compressed buffer out -unsigned char *p4dencv32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out); -//unsigned char *p4denc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); - -// calculate the best bit size b and bx. 
-unsigned p4d32(unsigned *__restrict in, unsigned n, unsigned *pbx); -unsigned p4d16(unsigned short *__restrict in, unsigned n, unsigned *pbx); -unsigned p4d64(uint64_t *__restrict in, unsigned n, unsigned *pbx); - -// same as p4denc, but with b and bx as parameters. Call after p4d32/p4d16 -unsigned char *p4de32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -unsigned char *p4de16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); -unsigned char *p4de64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); - -unsigned char *p4dev32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); - -#ifdef __cplusplus -} -#endif diff --git a/vp4dd.c b/vp4dd.c deleted file mode 100644 index abccd6a..0000000 --- a/vp4dd.c +++ /dev/null @@ -1,590 +0,0 @@ -/** - Copyright (C) powturbo 2013-2016 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// vp4dd.c - "Integer Compression" Turbo PforDelta - #ifndef USIZE -#include - -#include "conf.h" -#include "bitunpack.h" -#include "bitutil.h" -#include "vp4dd.h" - -#define PAD8(__x) ( (((__x)+8-1)/8) ) - -//#define __AVX2__ON // disabled (tested w/ notebook cpu i5-4200u and skylake i6700-3.7GHz, but slower than SSE3) - - #if defined(__AVX2__) && defined(__AVX2__ON) -#include -static ALIGNED(unsigned char, shuffles[256][8], 32) = { - { 0,0,0,0,0,0,0,0 }, - { 0,1,1,1,1,1,1,1 }, - { 1,0,1,1,1,1,1,1 }, - { 0,1,2,2,2,2,2,2 }, - { 1,1,0,1,1,1,1,1 }, - { 0,2,1,2,2,2,2,2 }, - { 2,0,1,2,2,2,2,2 }, - { 0,1,2,3,3,3,3,3 }, - { 1,1,1,0,1,1,1,1 }, - { 0,2,2,1,2,2,2,2 }, - { 2,0,2,1,2,2,2,2 }, - { 0,1,3,2,3,3,3,3 }, - { 2,2,0,1,2,2,2,2 }, - { 0,3,1,2,3,3,3,3 }, - { 3,0,1,2,3,3,3,3 }, - { 0,1,2,3,4,4,4,4 }, - { 1,1,1,1,0,1,1,1 }, - { 0,2,2,2,1,2,2,2 }, - { 2,0,2,2,1,2,2,2 }, - { 0,1,3,3,2,3,3,3 }, - { 2,2,0,2,1,2,2,2 }, - { 0,3,1,3,2,3,3,3 }, - { 3,0,1,3,2,3,3,3 }, - { 0,1,2,4,3,4,4,4 }, - { 2,2,2,0,1,2,2,2 }, - { 0,3,3,1,2,3,3,3 }, - { 3,0,3,1,2,3,3,3 }, - { 0,1,4,2,3,4,4,4 }, - { 3,3,0,1,2,3,3,3 }, - { 0,4,1,2,3,4,4,4 }, - { 4,0,1,2,3,4,4,4 }, - { 0,1,2,3,4,5,5,5 }, - { 1,1,1,1,1,0,1,1 }, - { 0,2,2,2,2,1,2,2 }, - { 2,0,2,2,2,1,2,2 }, - { 0,1,3,3,3,2,3,3 }, - { 2,2,0,2,2,1,2,2 }, - { 0,3,1,3,3,2,3,3 }, - { 3,0,1,3,3,2,3,3 }, - { 0,1,2,4,4,3,4,4 }, - { 2,2,2,0,2,1,2,2 }, - { 0,3,3,1,3,2,3,3 }, - { 3,0,3,1,3,2,3,3 }, - { 0,1,4,2,4,3,4,4 }, - { 3,3,0,1,3,2,3,3 }, - { 0,4,1,2,4,3,4,4 }, - { 4,0,1,2,4,3,4,4 }, - { 0,1,2,3,5,4,5,5 }, - { 2,2,2,2,0,1,2,2 }, - { 0,3,3,3,1,2,3,3 }, - { 3,0,3,3,1,2,3,3 }, - { 0,1,4,4,2,3,4,4 }, - { 3,3,0,3,1,2,3,3 }, - { 0,4,1,4,2,3,4,4 }, - { 4,0,1,4,2,3,4,4 }, - { 0,1,2,5,3,4,5,5 }, - { 3,3,3,0,1,2,3,3 }, - { 0,4,4,1,2,3,4,4 }, - { 4,0,4,1,2,3,4,4 }, - { 
0,1,5,2,3,4,5,5 }, - { 4,4,0,1,2,3,4,4 }, - { 0,5,1,2,3,4,5,5 }, - { 5,0,1,2,3,4,5,5 }, - { 0,1,2,3,4,5,6,6 }, - { 1,1,1,1,1,1,0,1 }, - { 0,2,2,2,2,2,1,2 }, - { 2,0,2,2,2,2,1,2 }, - { 0,1,3,3,3,3,2,3 }, - { 2,2,0,2,2,2,1,2 }, - { 0,3,1,3,3,3,2,3 }, - { 3,0,1,3,3,3,2,3 }, - { 0,1,2,4,4,4,3,4 }, - { 2,2,2,0,2,2,1,2 }, - { 0,3,3,1,3,3,2,3 }, - { 3,0,3,1,3,3,2,3 }, - { 0,1,4,2,4,4,3,4 }, - { 3,3,0,1,3,3,2,3 }, - { 0,4,1,2,4,4,3,4 }, - { 4,0,1,2,4,4,3,4 }, - { 0,1,2,3,5,5,4,5 }, - { 2,2,2,2,0,2,1,2 }, - { 0,3,3,3,1,3,2,3 }, - { 3,0,3,3,1,3,2,3 }, - { 0,1,4,4,2,4,3,4 }, - { 3,3,0,3,1,3,2,3 }, - { 0,4,1,4,2,4,3,4 }, - { 4,0,1,4,2,4,3,4 }, - { 0,1,2,5,3,5,4,5 }, - { 3,3,3,0,1,3,2,3 }, - { 0,4,4,1,2,4,3,4 }, - { 4,0,4,1,2,4,3,4 }, - { 0,1,5,2,3,5,4,5 }, - { 4,4,0,1,2,4,3,4 }, - { 0,5,1,2,3,5,4,5 }, - { 5,0,1,2,3,5,4,5 }, - { 0,1,2,3,4,6,5,6 }, - { 2,2,2,2,2,0,1,2 }, - { 0,3,3,3,3,1,2,3 }, - { 3,0,3,3,3,1,2,3 }, - { 0,1,4,4,4,2,3,4 }, - { 3,3,0,3,3,1,2,3 }, - { 0,4,1,4,4,2,3,4 }, - { 4,0,1,4,4,2,3,4 }, - { 0,1,2,5,5,3,4,5 }, - { 3,3,3,0,3,1,2,3 }, - { 0,4,4,1,4,2,3,4 }, - { 4,0,4,1,4,2,3,4 }, - { 0,1,5,2,5,3,4,5 }, - { 4,4,0,1,4,2,3,4 }, - { 0,5,1,2,5,3,4,5 }, - { 5,0,1,2,5,3,4,5 }, - { 0,1,2,3,6,4,5,6 }, - { 3,3,3,3,0,1,2,3 }, - { 0,4,4,4,1,2,3,4 }, - { 4,0,4,4,1,2,3,4 }, - { 0,1,5,5,2,3,4,5 }, - { 4,4,0,4,1,2,3,4 }, - { 0,5,1,5,2,3,4,5 }, - { 5,0,1,5,2,3,4,5 }, - { 0,1,2,6,3,4,5,6 }, - { 4,4,4,0,1,2,3,4 }, - { 0,5,5,1,2,3,4,5 }, - { 5,0,5,1,2,3,4,5 }, - { 0,1,6,2,3,4,5,6 }, - { 5,5,0,1,2,3,4,5 }, - { 0,6,1,2,3,4,5,6 }, - { 6,0,1,2,3,4,5,6 }, - { 0,1,2,3,4,5,6,7 }, - { 1,1,1,1,1,1,1,0 }, - { 0,2,2,2,2,2,2,1 }, - { 2,0,2,2,2,2,2,1 }, - { 0,1,3,3,3,3,3,2 }, - { 2,2,0,2,2,2,2,1 }, - { 0,3,1,3,3,3,3,2 }, - { 3,0,1,3,3,3,3,2 }, - { 0,1,2,4,4,4,4,3 }, - { 2,2,2,0,2,2,2,1 }, - { 0,3,3,1,3,3,3,2 }, - { 3,0,3,1,3,3,3,2 }, - { 0,1,4,2,4,4,4,3 }, - { 3,3,0,1,3,3,3,2 }, - { 0,4,1,2,4,4,4,3 }, - { 4,0,1,2,4,4,4,3 }, - { 0,1,2,3,5,5,5,4 }, - { 2,2,2,2,0,2,2,1 }, - { 0,3,3,3,1,3,3,2 }, - 
{ 3,0,3,3,1,3,3,2 }, - { 0,1,4,4,2,4,4,3 }, - { 3,3,0,3,1,3,3,2 }, - { 0,4,1,4,2,4,4,3 }, - { 4,0,1,4,2,4,4,3 }, - { 0,1,2,5,3,5,5,4 }, - { 3,3,3,0,1,3,3,2 }, - { 0,4,4,1,2,4,4,3 }, - { 4,0,4,1,2,4,4,3 }, - { 0,1,5,2,3,5,5,4 }, - { 4,4,0,1,2,4,4,3 }, - { 0,5,1,2,3,5,5,4 }, - { 5,0,1,2,3,5,5,4 }, - { 0,1,2,3,4,6,6,5 }, - { 2,2,2,2,2,0,2,1 }, - { 0,3,3,3,3,1,3,2 }, - { 3,0,3,3,3,1,3,2 }, - { 0,1,4,4,4,2,4,3 }, - { 3,3,0,3,3,1,3,2 }, - { 0,4,1,4,4,2,4,3 }, - { 4,0,1,4,4,2,4,3 }, - { 0,1,2,5,5,3,5,4 }, - { 3,3,3,0,3,1,3,2 }, - { 0,4,4,1,4,2,4,3 }, - { 4,0,4,1,4,2,4,3 }, - { 0,1,5,2,5,3,5,4 }, - { 4,4,0,1,4,2,4,3 }, - { 0,5,1,2,5,3,5,4 }, - { 5,0,1,2,5,3,5,4 }, - { 0,1,2,3,6,4,6,5 }, - { 3,3,3,3,0,1,3,2 }, - { 0,4,4,4,1,2,4,3 }, - { 4,0,4,4,1,2,4,3 }, - { 0,1,5,5,2,3,5,4 }, - { 4,4,0,4,1,2,4,3 }, - { 0,5,1,5,2,3,5,4 }, - { 5,0,1,5,2,3,5,4 }, - { 0,1,2,6,3,4,6,5 }, - { 4,4,4,0,1,2,4,3 }, - { 0,5,5,1,2,3,5,4 }, - { 5,0,5,1,2,3,5,4 }, - { 0,1,6,2,3,4,6,5 }, - { 5,5,0,1,2,3,5,4 }, - { 0,6,1,2,3,4,6,5 }, - { 6,0,1,2,3,4,6,5 }, - { 0,1,2,3,4,5,7,6 }, - { 2,2,2,2,2,2,0,1 }, - { 0,3,3,3,3,3,1,2 }, - { 3,0,3,3,3,3,1,2 }, - { 0,1,4,4,4,4,2,3 }, - { 3,3,0,3,3,3,1,2 }, - { 0,4,1,4,4,4,2,3 }, - { 4,0,1,4,4,4,2,3 }, - { 0,1,2,5,5,5,3,4 }, - { 3,3,3,0,3,3,1,2 }, - { 0,4,4,1,4,4,2,3 }, - { 4,0,4,1,4,4,2,3 }, - { 0,1,5,2,5,5,3,4 }, - { 4,4,0,1,4,4,2,3 }, - { 0,5,1,2,5,5,3,4 }, - { 5,0,1,2,5,5,3,4 }, - { 0,1,2,3,6,6,4,5 }, - { 3,3,3,3,0,3,1,2 }, - { 0,4,4,4,1,4,2,3 }, - { 4,0,4,4,1,4,2,3 }, - { 0,1,5,5,2,5,3,4 }, - { 4,4,0,4,1,4,2,3 }, - { 0,5,1,5,2,5,3,4 }, - { 5,0,1,5,2,5,3,4 }, - { 0,1,2,6,3,6,4,5 }, - { 4,4,4,0,1,4,2,3 }, - { 0,5,5,1,2,5,3,4 }, - { 5,0,5,1,2,5,3,4 }, - { 0,1,6,2,3,6,4,5 }, - { 5,5,0,1,2,5,3,4 }, - { 0,6,1,2,3,6,4,5 }, - { 6,0,1,2,3,6,4,5 }, - { 0,1,2,3,4,7,5,6 }, - { 3,3,3,3,3,0,1,2 }, - { 0,4,4,4,4,1,2,3 }, - { 4,0,4,4,4,1,2,3 }, - { 0,1,5,5,5,2,3,4 }, - { 4,4,0,4,4,1,2,3 }, - { 0,5,1,5,5,2,3,4 }, - { 5,0,1,5,5,2,3,4 }, - { 0,1,2,6,6,3,4,5 }, - { 4,4,4,0,4,1,2,3 }, 
- { 0,5,5,1,5,2,3,4 }, - { 5,0,5,1,5,2,3,4 }, - { 0,1,6,2,6,3,4,5 }, - { 5,5,0,1,5,2,3,4 }, - { 0,6,1,2,6,3,4,5 }, - { 6,0,1,2,6,3,4,5 }, - { 0,1,2,3,7,4,5,6 }, - { 4,4,4,4,0,1,2,3 }, - { 0,5,5,5,1,2,3,4 }, - { 5,0,5,5,1,2,3,4 }, - { 0,1,6,6,2,3,4,5 }, - { 5,5,0,5,1,2,3,4 }, - { 0,6,1,6,2,3,4,5 }, - { 6,0,1,6,2,3,4,5 }, - { 0,1,2,7,3,4,5,6 }, - { 5,5,5,0,1,2,3,4 }, - { 0,6,6,1,2,3,4,5 }, - { 6,0,6,1,2,3,4,5 }, - { 0,1,7,2,3,4,5,6 }, - { 6,6,0,1,2,3,4,5 }, - { 0,7,1,2,3,4,5,6 }, - { 7,0,1,2,3,4,5,6 }, - { 0,1,2,3,4,5,6,7, } - }; - #elif defined(__SSSE3__) -#include -static ALIGNED(char, shuffles[16][16], 16) = { - #define _ 0x80 - { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ }, - { 0,1,2,3, _,_,_,_, _,_, _, _, _, _, _,_ }, - { _,_,_,_, 0,1,2,3, _,_, _, _, _, _, _,_ }, - { 0,1,2,3, 4,5,6,7, _,_, _, _, _, _, _,_ }, - { _,_,_,_, _,_,_,_, 0,1, 2, 3, _, _, _,_ }, - { 0,1,2,3, _,_,_,_, 4,5, 6, 7, _, _, _,_ }, - { _,_,_,_, 0,1,2,3, 4,5, 6, 7, _, _, _,_ }, - { 0,1,2,3, 4,5,6,7, 8,9,10,11, _, _, _,_ }, - { _,_,_,_, _,_,_,_, _,_,_,_, 0, 1, 2, 3 }, - { 0,1,2,3, _,_,_,_, _,_,_, _, 4, 5, 6, 7 }, - { _,_,_,_, 0,1,2,3, _,_,_, _, 4, 5, 6, 7 }, - { 0,1,2,3, 4,5,6,7, _,_, _, _, 8, 9,10,11 }, - { _,_,_,_, _,_,_,_, 0,1, 2, 3, 4, 5, 6, 7 }, - { 0,1,2,3, _,_,_,_, 4,5, 6, 7, 8, 9,10,11 }, - { _,_,_,_, 0,1,2,3, 4,5, 6, 7, 8, 9,10,11 }, - { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, - #undef _ -}; - #endif -//---------------------- 32 bits --------------------- -#define USIZE 32 - -//----- scalar ----------- -#define P4DD p4dd -#define P4DDEC p4ddec -#define BITUNPACK bitunpack - -#define P4DDECX - //delta 1 -#define P4DDD p4dd1d -#define P4DDECD p4dd1dec -#define BITUNPACKD bitd1unpack -#define BITUNDD bitund1 -#include __FILE__ - -#undef P4DD -#undef P4DDEC - -#undef P4DDECX - -#undef P4DDD -#undef P4DDECD -#undef BITUNPACKD -#undef BITUNDD - //delta 0 -#define P4DDD p4ddd -#define P4DDECD p4dddec -#define BITUNPACKD bitdunpack -#define BITUNDD bitund -#include __FILE__ - -#undef P4DDD 
-#undef P4DDECD -#undef BITUNPACKD -#undef BITUNDD -#undef BITUNPACK - -//------ SIMD ------------- -#define P4SIMD -#define P4DD p4ddv -#define P4DDEC p4ddecv -#define BITUNPACK bitunpackv - -//#define P4DDECX - -#define P4DDD p4dd1dv -#define P4DDECD p4dd1decv -#define BITUNPACKD bitd1unpackv -#define BITUNPACKD_ _bitd1unpackv -#define BITUNDD bitund1 -#include __FILE__ - -#undef P4DDEC -//#undef BITUNPACK -//#undef P4DDECX - -#undef P4DDD -#undef P4DDECD -#undef BITUNPACKD -#undef BITUNPACKD_ -#undef BITUNDD - -#define P4DDD p4dddv -#define P4DDECD p4dddecv -#define BITUNPACKD bitdunpackv -#define BITUNPACKD_ _bitdunpackv -#define BITUNDD bitund -#include __FILE__ - -#undef P4DDD -#undef P4DDECD -#undef BITUNPACKD -#undef BITUNDD - -#undef P4DD -#undef BITUNPACK -#undef USIZE -#undef P4SIMD -//---------------------------------- -#define USIZE 64 -#define P4DD p4dd -#define P4DDEC p4ddec -#define BITUNPACK bitunpack -#include __FILE__ -#undef P4DD -#undef P4DDEC -#undef BITUNPACK - - #else -#define uint_t TEMPLATE3(uint, USIZE, _t) - - #ifdef P4DDEC -unsigned char *_bitunpackv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb); - -unsigned char *TEMPLATE2(P4DD, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, unsigned b, unsigned bx) { - uint_t ex[0x100+8]; - - if(!(b & 1)) return TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b>>1); - b >>= 1; - #if defined(P4SIMD) && defined(__SSE3__) && USIZE == 32 && P4DN == 2 - unsigned char *pb = in; - in = TEMPLATE2(bitunpack, USIZE)(in+16, popcnt64(*(unsigned long long *)in) + popcnt64(*(unsigned long long *)(in+8)), ex, bx); - return _bitunpackv32(in, n, out, b, ex, pb); - #else - #if P4DN == 2 - unsigned long long bb[P4DN]; unsigned num=0,i; - bb[0] = *(unsigned long long *)in; in += 8; - bb[1] = *(unsigned long long *)in; in += 8; - in = TEMPLATE2(bitunpack, USIZE)(in, popcnt64(bb[0]) + popcnt64(bb[1]), ex, bx); - #else 
- unsigned long long bb[P4DN]; unsigned num=0; - for(i = 0; i < P4DN; i++) { bb[i] = *(unsigned long long *)in; in += 8; num += popcnt64(bb[i]); } - in = TEMPLATE2(bitunpack, USIZE)(in, num, ex, bx); - #endif - - in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b); - #ifdef __AVX2__ON - uint_t *op,*pex = ex; - #if P4DN == 2 - for(op = out; b0; b0 >>= 8,op += 8) { unsigned m = (unsigned char)b0, mc = popcnt32(m), s = pex[mc]; pex[mc]=0; - _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; - } - for(op = out+64; b1; b1 >>= 8,op += 8) { unsigned m = (unsigned char)b1, mc=popcnt32(m), s = pex[mc]; pex[mc]=0; - _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; - } - #else - for(i = 0; i < P4DN; i++) { - for(op = out; bb[i]; bb[i] >>= 8,op += 8) { unsigned m = (unsigned char)bb[i], mc=popcnt32(m), s = pex[mc]; pex[mc]=0; - _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; - } out += 64; - } - #endif - #elif defined(__SSSE3__) && USIZE == 32 - uint_t *op,*pex = ex; - #if P4DN == 2 - for(op = out; bb[0]; bb[0] >>= 4,op+=4) { const unsigned m = bb[0]&0xf; - _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); - } - for(op=out+64; bb[1]; bb[1] >>= 4,op+=4) { const 
unsigned m = bb[1]&0xf; - _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); - } - #else - for(i = 0; i < P4DN; i++) { // Loop unrolling - for(op = out; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf; - _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); - } out+=64; - } - #endif - #else - unsigned k = 0; - #if P4DN == 2 - while(bb[0]) { unsigned x = ctz64(bb[0]); out[x] += ex[k++]<>1); - b >>= 1; - - #if defined(P4SIMD) && defined(__SSE3__) && USIZE == 32 && P4DN == 2 - unsigned char *pb = in; - in = TEMPLATE2(bitunpack, USIZE)(in+16, popcnt64(*(unsigned long long *)in) + popcnt64(*(unsigned long long *)(in+8)), ex, bx); - return TEMPLATE2(BITUNPACKD_, USIZE)(in, n, out, start, b, ex, pb); - #else - unsigned long long bb[P4DN]; unsigned num=0,i; - for(i = 0; i < P4DN; i++) { bb[i] = *(unsigned long long *)in; in += 8; num += popcnt64(bb[i]); } - in = TEMPLATE2(bitunpack, USIZE)(in, num, ex, bx); - - in = TEMPLATE2(BITUNPACK, USIZE)(in, n, out, b); - #ifdef __AVX2__ON - uint_t *op,*pex = ex; - for(i = 0; i < P4DN; i++) { - for(op = out; bb[i]; bb[i] >>= 8,op += 8) { unsigned m = (unsigned char)bb[i], mc=popcnt32(m), s = pex[mc]; pex[mc]=0; - _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; - } out += 64; - } - #elif defined(__SSSE3__) - uint_t *op,*pex = ex; - #if P4DN == 2 - for(op = out; bb[0]; bb[0] >>= 4,op+=4) { const unsigned m = bb[0]&0xf; - _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), 
_mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); - } - for(op=out+64; bb[1]; bb[1] >>= 4,op+=4) { const unsigned m = bb[1]&0xf; - _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); - } - #else - for(i = 0; i < P4DN; i++) { // Loop unrolling - for(op = out; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf; - _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); - } out+=64; - } - #endif - #else - unsigned k = 0; - for(i = 0; i < P4DN; i++) { - while(bb[i]) { unsigned x = ctz64(bb[i]); out[x] += ex[k++]< -#define P4DSIZE 128 //64 // -#define P4DN (P4DSIZE/64) - -//---------------- Bulk decompress of TurboPFor compressed integer array ------------------------------------------------------- -// decompress a previously (with p4denc32) 32 bits packed array. 
Return value = end of packed buffer in -unsigned char *p4ddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); -unsigned char *p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out); - -unsigned char *p4dd32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned bx); -unsigned char *p4dd64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx); - -unsigned char *p4ddecv32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); // SIMD - -//-- delta min = 0 -unsigned char *p4dddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *p4dddecv32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -//-- delta min = 1 -unsigned char *p4dd1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *p4dd1decv32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -// same as above, but b and bx are not stored within the compressed stream header (see idxcr.c/idxqry.c for an example) -unsigned char *p4ddv32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned bx); - -unsigned char *p4ddd32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); -unsigned char *p4dddv32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); - -unsigned char *p4dd1d32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); -unsigned char *p4dd1dv32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); - -//---------------- Direct Access functions to compressed TurboPFor array ------------------------------------------------------- -#define 
P4D_PAD8(__x) ( (((__x)+8-1)/8) ) -#define P4D_B(__x) (((__x) >> 1) & 0x3f) -#define P4D_XB(__x) (((__x) & 1)?((__x) >> 8):0) -#define P4D_ININC(__in, __x) __in += 1+(__x & 1) - -/* Read the 16-bit block header at in: returns the base bit width (header bits 1..6) and stores the exception bit width in *bx (header >> 8 when bit 0 is set, otherwise 0). */ static inline unsigned vp4dbits(unsigned char *__restrict in, int *bx) { unsigned i = *(unsigned short *)in; *bx = P4D_XB(i); return P4D_B(i); } - -/* Direct-access state for one encoded block, filled by p4dini: xmap = exception bitmap words, ex = start of the bit-packed exceptions, i = raw header word, bx = exception bit width, cum[k] = number of bits set in xmap[0..k-1], oval/idx = running value and index used by vp4dgeq. */ struct p4d { - unsigned long long *xmap; - unsigned char *ex; - unsigned i,bx,cum[P4DN+1]; - int oval,idx; -}; - -// prepare direct access usage: parses the header at *pin, fills *p4d and *b, and advances *pin past the exception bitmap and the packed exceptions (n is not used) -static inline void p4dini(struct p4d *p4d, unsigned char *__restrict *pin, unsigned n, unsigned *b) { unsigned char *in = *pin; - static unsigned long long xmap[P4DN+1] = { 0 }; - - unsigned i = *(unsigned short *)in; - p4d->i = i; - *b = P4D_B(i); - p4d->bx = P4D_XB(i); - P4D_ININC(in, i); - - p4d->xmap = (i&1)?(unsigned long long *)in:xmap; - p4d->ex = in + ((i&1)?8*P4DN:0); - for(p4d->cum[0] = 0, i=1; i < P4DN; i++) p4d->cum[i] = p4d->cum[i-1] + popcnt64(p4d->xmap[i-1]); - *pin = p4d->ex + P4D_PAD8((p4d->cum[P4DN-1] + popcnt64(p4d->xmap[P4DN-1]))*p4d->bx); - p4d->oval = p4d->idx = -1; -} - -// Get a single value with index "idx" from a p4denc32 packed array -static ALWAYS_INLINE unsigned vp4dget32(struct p4d *p4d, unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bi, cl, u = _bitgetx32(in, b, idx*b); - if(unlikely(p4d->xmap[bi = idx>>6] & (1ull<<(cl = (idx & 0x3f))))) u |= _bitgetx32(p4d->ex, p4d->bx, (p4d->cum[bi] + popcnt64(p4d->xmap[bi] & ~((~0ull)<bx ) << b; - return u; -} - -// like vp4dget32 but for 16 bits packed array (with p4denc16) -static ALWAYS_INLINE unsigned vp4dget16(struct p4d *p4d, unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bi, cl, u = _bitgetx16(in, b, idx*b); - if(unlikely(p4d->xmap[bi = idx>>6] & (1ull<<(cl = (idx & 0x3f))))) u |= _bitgetx32(p4d->ex, p4d->bx, (p4d->cum[bi] + popcnt64(p4d->xmap[bi] & ~((~0ull)<bx ) << b; - return u; -} - -// Get the next single value greater or equal to val (each decoded entry adds value+1 to p4d->oval) -static ALWAYS_INLINE int vp4dgeq(struct p4d 
*p4d, unsigned char *__restrict in, unsigned b, int val) { do p4d->oval += vp4dget32(p4d, in, b, ++p4d->idx)+1; while(p4d->oval < val); return p4d->oval; } - -/* like p4ddec32 but using direct access. This is only a demo showing direct access usage. Use p4ddec32 instead for decompressing entire blocks */ -unsigned char *p4ddecx32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); -unsigned char *p4dfdecx32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *p4df0decx32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); - -#define P4DSIZE 128 //64 // -#define P4DN (P4DSIZE/64) -#ifdef __cplusplus -} -#endif