TurboPFor: Bit Packing

This commit is contained in:
x
2019-07-15 10:32:54 +02:00
parent 2e7012dc38
commit 4aed815098
5 changed files with 54 additions and 777 deletions

89
bitpack.c Normal file → Executable file
View File

@ -1,5 +1,5 @@
/**
Copyright (C) powturbo 2013-2018
Copyright (C) powturbo 2013-2019
GPL v2 License
This program is free software; you can redistribute it and/or modify
@ -25,10 +25,11 @@
#include <stdio.h>
#include "conf.h"
#include "bitpack.h"
#include "bitutil.h"
#include "vint.h"
#include "bitpack.h"
#define PAD8(_x_) ( (((_x_)+8-1)/8) )
#define PREFETCH(_ip_) __builtin_prefetch(_ip_+768,0)//#define PREFETCH(ip)
#pragma warning( disable : 4005)
#pragma warning( disable : 4090)
@ -45,7 +46,6 @@ typedef unsigned char *(*BITPACK_D32)(uint32_t *__restrict out, unsigned n, cons
typedef unsigned char *(*BITPACK_F64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in);
typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, const unsigned char *__restrict in, uint64_t start);
#define PREFETCH(_ip_) __builtin_prefetch(_ip_+768,0)//#define PREFETCH(ip)
#if 1 //def _MSC_VER
#define VX (v=x)
@ -192,6 +192,7 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
size_t bitnpack8( uint8_t *__restrict in, size_t n, unsigned char *__restrict out) { uint8_t *ip,start; BITNPACK(in, n, out, 128, 8); }
size_t bitnpack16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; BITNPACK(in, n, out, 128, 16); }
size_t bitnpack32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; BITNPACK(in, n, out, 128, 32); }
size_t bitnpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip,start; BITNPACK(in, n, out, 128, 64); }
@ -225,38 +226,40 @@ size_t bitnfpack64( uint64_t *__restrict in, size_t n, unsigned char *__restrict
return op - out;\
}
#define _BITNDPACKV(in, n, out, _csize_, _usize_, _bitd_, _bitpackv_, _bitpack_) { if(!n) return 0;\
#define _BITNDPACKV(in, n, out, _csize_, _usize_, _bitdv_, _bitpackv_, _bitd_, _bitpack_) { if(!n) return 0;\
unsigned char *op = out; \
start = *in++; \
TEMPLATE2(vbxput, _usize_)(op, start);\
for(n--,ip = in; ip != in + (n&~(_csize_-1)); ) { PREFETCH(ip+512);\
unsigned b = TEMPLATE2(_bitd_, _usize_)(ip, _csize_, start); *op++ = b; op = TEMPLATE2(_bitpackv_, _usize_)(ip, _csize_, op, start, b); ip += _csize_; start = ip[-1];\
} if(n&=(_csize_-1)) { unsigned b = TEMPLATE2(_bitd_, _usize_)(ip, n, start); *op++ = b; op = TEMPLATE2(_bitpack_, _usize_)(ip, n, op, start, b); }\
unsigned b = TEMPLATE2(_bitdv_, _usize_)(ip, _csize_, start); *op++ = b; op = TEMPLATE2(_bitpackv_, _usize_)(ip, _csize_, op, start, b); ip += _csize_; start = ip[-1];\
} if(n&=(_csize_-1)) { unsigned b = TEMPLATE2(_bitd_, _usize_)(ip, n, start); *op++ = b; op = TEMPLATE2(_bitpack_, _usize_)(ip, n, op, start, b); }\
return op - out;\
}
#if defined(__SSE2__) && defined(SSE2_ON)
#include <emmintrin.h>
#if (defined(__SSE2__) || defined(__ARM_NEON)) && defined(SSE2_ON)
#define OPPE(__op)
#define IPPE(__op)
#define VI16(ip, i, iv, parm)
#define VI32(ip, i, iv, parm)
#define IP16(ip, i, iv) _mm_loadu_si128(ip++)
#define IP32(ip, i, iv) _mm_loadu_si128(ip++)
#include "bitpack_.h"
#define IP16(_ip_, i, iv) _mm_loadu_si128(_ip_++)
#define IP32(_ip_, i, iv) _mm_loadu_si128(_ip_++)
#include "bitpack_.h"
unsigned char *bitpack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(128*b); BITPACK128V16(in, b, out, 0); return pout; }
unsigned char *bitpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return pout; }
unsigned char *bitpack256w32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *_out=out; unsigned *_in=in;
BITPACK128V32(in, b, out, 0); in = _in+128; out = _out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return _out+PAD8(256*b); }
#define IP32(ip, i, iv) _mm_or_si128(_mm_shuffle_epi32(_mm_loadu_si128(ip++),_MM_SHUFFLE(2, 0, 3, 1)), _mm_shuffle_epi32(_mm_loadu_si128(ip++),_MM_SHUFFLE(3, 1, 2, 0)) )
#ifdef __ARM_NEON
//#define IP32(_ip_, i, iv) _mm_or_si128(_mm_shuffle_epi32( _mm_loadu_si128(_ip_++),_MM_SHUFFLE(3, 1, 2, 0)), _mm_shuffle_epi32( _mm_loadu_si128(_ip_++),_MM_SHUFFLE(2, 0, 3, 1)) )
#define IP32(_ip_, _i_, _iv_) _mm_or_si128(mm_shuffle_3120_epi32(_mm_loadu_si128(_ip_++) ), mm_shuffle_2031_epi32(_mm_loadu_si128(_ip++) ) ) // optimized shuffle
#else
#define IP32(_ip_, i, iv) _mm_or_si128(_mm_shuffle_epi32( _mm_loadu_si128(_ip_++),_MM_SHUFFLE(2, 0, 3, 1)), _mm_shuffle_epi32( _mm_loadu_si128(_ip_++),_MM_SHUFFLE(3, 1, 2, 0)) )
#endif
#include "bitpack_.h"
unsigned char *bitpack128v64(uint64_t *__restrict _in, unsigned n, unsigned char *__restrict out, unsigned b) {
if(b>32) return bitpack64(_in,n,out,b);
else { unsigned char *pout = out+PAD8(128*b); uint32_t *in = _in; BITPACK128V32(in, b, out, 0); return pout; }
}
/* Bit-pack 128 64-bit values into out using b bits each.
 * For b <= 32 the 64-bit lanes are packed through the 128x32 vector
 * kernel (the IP32 load macro above merges the low/high 32-bit halves
 * of each pair of loads); for b > 32 it falls back to the scalar
 * bitpack64 path.
 * NOTE(review): `in` is uint64_t* but is handed to BITPACK128V32
 * unchanged (the older revision cast via a uint32_t* local) — assumes
 * the macro only loads through _mm_loadu_si128; confirm no
 * pointer-type warning/aliasing issue on the target compiler. */
unsigned char *bitpack128v64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) {
if(b<=32) { unsigned char *pout = out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return pout; } else return bitpack64(in,n,out,b);
}
#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = DELTA128x16(v,_sv_); _sv_ = v
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = DELTA128x32(v,_sv_); _sv_ = v
@ -270,7 +273,7 @@ unsigned char *bitdpack128v32(unsigned *__restrict in, unsigned n, unsigne
__m128i v,sv = _mm_set1_epi32(start); BITPACK128V32(in, b, out, sv); return pout;
}
#define VI16(_ip_, _i_, _iv_, _sv_)
#define VI16(_ip_, _i_, _iv_, _sv_)
#define VI32(_ip_, _i_, _iv_, _sv_)
#define IP16(_ip_, i, _iv_) _mm_sub_epi16(_mm_loadu_si128(_ip_++),sv)
#define IP32(_ip_, i, _iv_) _mm_sub_epi32(_mm_loadu_si128(_ip_++),sv)
@ -281,18 +284,30 @@ unsigned char *bitfpack128v16(unsigned short *__restrict in, unsigned n, unsigne
unsigned char *bitfpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
__m128i v, sv = _mm_set1_epi32(start); BITPACK128V32(in, b, out, sv); return pout;
}
#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi16(DELTA128x16(v,_sv_),cv); _sv_ = v
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi32(DELTA128x32(v,_sv_),cv); _sv_ = v
#define IP16(ip, i, _iv_) _iv_
#define IP32(ip, i, _iv_) _iv_
unsigned char *bitd1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
__m128i v, sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(1); BITPACK128V16(in, b, out, sv); return pout;
__m128i sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(1), v; BITPACK128V16(in, b, out, sv); return pout;
}
unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
__m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); BITPACK128V32(in, b, out, sv); return pout;
}
#define VI16(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi16(SUBI128x16(v,_sv_),cv); _sv_ = v
#define VI32(_ip_, _i_, _iv_, _sv_) v = _mm_loadu_si128(_ip_++); _iv_ = _mm_sub_epi32(SUBI128x32(v,_sv_),cv); _sv_ = v
#define IP16(ip, i, _iv_) _iv_
#define IP32(ip, i, _iv_) _iv_
unsigned char *bits1pack128v16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
__m128i v, sv = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); BITPACK128V16(in, b, out, sv); return pout;
}
unsigned char *bits1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b);
__m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); BITPACK128V32(in, b, out, sv); return pout;
}
#define VI16(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi16(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi16(_sv_,cv);
#define VI32(_ip_, _i_, _iv_, _sv_) _iv_ = _mm_sub_epi32(_mm_loadu_si128(_ip_++),_sv_); _sv_ = _mm_add_epi32(_sv_,cv);
#define IP16(ip, i, _iv_) _iv_
@ -316,18 +331,22 @@ unsigned char *bitzpack128v32(unsigned *__restrict in, unsigned n, unsigne
size_t bitnpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip; _BITNPACKV( in, n, out, 128, 16, bitpack128v); }
size_t bitnpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; _BITNPACKV( in, n, out, 128, 32, bitpack128v); }
size_t bitnpack128v64( uint64_t *__restrict in, size_t n, unsigned char *__restrict out) { uint64_t *ip; _BITNPACKV( in, n, out, 128, 64, bitpack128v); }
size_t bitnpack256w32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; _BITNPACKV( in, n, out, 256, 32, bitpack256w); }
size_t bitndpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitd, bitdpack128v, bitdpack); }
size_t bitndpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitd, bitdpack128v, bitdpack); }
size_t bitndpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitd, bitdpack128v, bitd, bitdpack); }
size_t bitndpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitd, bitdpack128v, bitd, bitdpack); }
size_t bitnd1pack128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitd1, bitd1pack128v, bitd1pack); }
size_t bitnd1pack128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitd1, bitd1pack128v, bitd1pack); }
size_t bitnd1pack128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitd1, bitd1pack128v, bitd1, bitd1pack); }
size_t bitnd1pack128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitd1, bitd1pack128v, bitd1, bitd1pack); }
size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitz, bitzpack128v, bitzpack); }
size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitz, bitzpack128v, bitzpack); }
size_t bitns1pack128v16(uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bits128v, bits1pack128v, bitd1, bitd1pack); }
size_t bitns1pack128v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bits128v, bits1pack128v, bitd1, bitd1pack); }
size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitf, bitfpack128v, bitfpack); }
size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitf, bitfpack128v, bitfpack); }
size_t bitnzpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitz, bitzpack128v, bitz, bitzpack); }
size_t bitnzpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitz, bitzpack128v, bitz, bitzpack); }
size_t bitnfpack128v16( uint16_t *__restrict in, size_t n, unsigned char *__restrict out) { uint16_t *ip,start; _BITNDPACKV(in, n, out, 128, 16, bitf, bitfpack128v, bitf, bitfpack); }
size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 128, 32, bitf, bitfpack128v, bitf, bitfpack); }
#endif
#if defined(__AVX2__) && defined(AVX2_ON)
@ -342,8 +361,7 @@ size_t bitnfpack128v32( uint32_t *__restrict in, size_t n, unsigned char *__rest
#define VI32(ip, i, iv, parm)
#define IP32(ip, i, iv) _mm256_loadu_si256(ip++)
#include "bitpack_.h"
unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; }
#undef VI32
#undef IP32
@ -389,11 +407,12 @@ unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned n, unsigne
}
size_t bitnpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip; _BITNPACKV( in, n, out, 256, 32, bitpack256v); }
size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd, bitdpack256v, bitdpack); }
size_t bitnd1pack256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd1, bitd1pack256v, bitd1pack); }
size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitz, bitzpack256v, bitzpack); }
size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitfpack); }
size_t bitndpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd, bitdpack256v, bitd, bitdpack); }
size_t bitnd1pack256v32(uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitd1, bitd1pack256v,bitd1, bitd1pack); }
size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitz, bitzpack256v, bitz, bitzpack); }
size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitf, bitfpack); }
#endif
#pragma clang diagnostic pop

View File

@ -1,72 +0,0 @@
/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding https://github.com/powturbo/TurboRLE"
**/
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
// RLE with specified escape char
unsigned _srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
unsigned _srled8( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint8_t e);
unsigned _srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
unsigned _srled16(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint16_t e);
unsigned _srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
unsigned _srled32(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint32_t e);
unsigned _srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
unsigned _srled64(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint64_t e);
// functions w/ overflow handling
unsigned srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
unsigned srled8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint8_t e);
unsigned srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
unsigned srled16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint16_t e);
unsigned srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
unsigned srled32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint32_t e);
unsigned srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
unsigned srled64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint64_t e);
// RLE w. automatic escape char determination
unsigned srlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
unsigned _srled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
unsigned srled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
// Turbo RLE
unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
unsigned _trled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
unsigned trled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
#ifdef __cplusplus
}
#endif

View File

@ -1,62 +0,0 @@
/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
//------------------------- Variable Byte from https://github.com/powturbo/TurboPFor -----------------------------------------------------
#include "../conf.h"
#define VB_SIZE 64
#define VB_MAX 254
#define VB_B2 6
#define VB_B3 3
#define VB_BA3 (VB_MAX - (VB_SIZE/8 - 3))
#define VB_BA2 (VB_BA3 - (1<<VB_B3))
#define VB_OFS1 (VB_BA2 - (1<<VB_B2))
#define VB_OFS2 (VB_OFS1 + (1 << (8+VB_B2)))
#define VB_OFS3 (VB_OFS2 + (1 << (16+VB_B3)))
#define _vblen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_OFS2?2:((_x_) < VB_OFS3)?3:(bsr32(_x_)+7)/8+1))
#define _vbvlen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_BA2?2:((_x_) < VB_BA3)?3:(_x_-VB_BA3)))
#define _vbput32(_op_, _x_, _act_) {\
if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\
else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\
else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\
else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\
}
#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(likely(_x_ < VB_OFS1)) { _act_ ;}\
else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \
else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\
else { unsigned _b = _x_-VB_BA3; _x_ = ctou32(_ip_) & ((1u << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\
} while(0)
#define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); }
#define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;)
#define vbzput(_op_, _x_, _m_, _emap_) do { if(unlikely((_x_) < _m_)) *_op_++ = _emap_[_x_]; else { unsigned _xi = (_x_) - _m_; *_op_++ = _emap_[_m_]; vbput32(_op_, _xi); } } while(0)
#define vbzget(_ip_, _x_, _m_, _e_) { _x_ = _e_; if(unlikely(_x_ == _m_)) { vbget32(_ip_,_x_); _x_+=_m_; } }
#define TMIN 3

View File

@ -1,349 +0,0 @@
/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
#ifndef USIZE
#include <string.h>
#ifdef __SSE__
#include <emmintrin.h>
#endif
#include "trle_.h"
#include "trle.h"
//------------------------------------- Histogram ---------------------------------------------------------
/* Byte histogram: adds the counts of in[0..inlen-1] into cc[0..255].
 * Eight separate counter arrays c0..c7 are used so back-to-back
 * increments of the same byte value do not stall on store-forwarding;
 * they are summed into cc at the end ("Optimized for x86").
 * Returns the highest used byte value + 1 (minimum 1), i.e. the a
 * such that cc[a-1] is the last non-zero bucket.
 * NOTE(review): the initial `*(unsigned *)in` load reads 4 bytes
 * unconditionally and the word loads are unaligned — assumes
 * inlen >= 4 and a target tolerating unaligned access; confirm. */
static inline unsigned hist(const unsigned char *__restrict in, unsigned inlen, unsigned *cc) { // Optimized for x86
unsigned c0[256+8]={0},c1[256+8]={0},c2[256+8]={0},c3[256+8]={0},c4[256+8]={0},c5[256+8]={0},c6[256+8]={0},c7[256+8]={0};
const unsigned char *ip;
unsigned cp = *(unsigned *)in,a;
int i;
for(ip = in; ip != in+(inlen&~(16-1));) { // 16 bytes per iteration, two 8-byte halves
unsigned c = cp, d = *(unsigned *)(ip+=4); cp = *(unsigned *)(ip+=4);
c0[(unsigned char) c ]++;
c1[(unsigned char) d ]++;
c2[(unsigned char)(c>>8)]++; c>>=16;
c3[(unsigned char)(d>>8)]++; d>>=16;
c4[(unsigned char) c ]++;
c5[(unsigned char) d ]++;
c6[ c>>8 ]++;
c7[ d>>8 ]++;
c = cp; d = *(unsigned *)(ip+=4); cp = *(unsigned *)(ip+=4);
c0[(unsigned char) c ]++;
c1[(unsigned char) d ]++;
c2[(unsigned char)(c>>8)]++; c>>=16;
c3[(unsigned char)(d>>8)]++; d>>=16;
c4[(unsigned char) c ]++;
c5[(unsigned char) d ]++;
c6[ c>>8 ]++;
c7[ d>>8 ]++;
}
while(ip < in+inlen) c0[*ip++]++; // tail: remaining < 16 bytes, scalar
for(i = 0; i < 256; i++)
cc[i] += c0[i]+c1[i]+c2[i]+c3[i]+c4[i]+c5[i]+c6[i]+c7[i];
a = 256;
while(a > 1 && !cc[a-1]) a--; // shrink to last non-zero bucket
return a;
}
//------------------------------------- RLE with Escape char ------------------------------------------------------------------
#define SRLE8 32
#define USIZE 8
#include "trlec.c"
#if SRLE8
#define SRLEC8(pp, ip, op, e) do {\
unsigned i = ip - pp;\
if(i > 3) { *op++ = e; i -= 3; vbput32(op, i); *op++ = c; }\
else if(c == e) {\
while(i--) { *op++ = e; vbput32(op, 0); }\
} else while(i--) *op++ = c;\
} while(0)
/* 8-bit RLE encode with caller-chosen escape byte e.
 * Scans for runs; each run boundary is flushed through the SRLEC8
 * macro above (run > 3 -> escape + varint length + byte; occurrences
 * of e itself are escaped as zero-length runs). Run detection XORs
 * adjacent word-sized windows (ctou64/ctou32) and uses ctz to locate
 * the first differing byte. Returns the number of bytes written.
 * NOTE(review): the tail loop compares *ip with ip[1] up to
 * ip == in+inlen-1, so it reads in[inlen] — one byte past the input;
 * presumably the buffer is over-allocated, verify against callers. */
unsigned _srlec8(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e) {
const uint8_t *ip = in, *pp = in - 1;
uint8_t *op = out,c;
if(inlen > SRLE8)
while(ip < in+(inlen-1-SRLE8)) {
#if 0 //def __SSE__ // SSE slower than scalar
__m128i cv = _mm_set1_epi8(*ip);
unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(ip+1)), cv)); if(mask != 0xffffu) goto a; ip += 16;
mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(ip+1)), cv)); if(mask != 0xffffu) goto a; ip += 16;
continue;
a: c = *ip;
ip += __builtin_ctz((unsigned short)(~mask));
SRLEC8(pp, ip, op, e);
pp = ip++;
#elif __WORDSIZE == 64
{unsigned long long z;
/* z != 0 at the first 8-byte window containing a byte change */
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#if SRLE8 >= 32
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#endif
__builtin_prefetch(ip +256, 0);
continue;
a: c = *ip;
ip += ctz64(z)>>3; /* advance to the last byte of the run */
SRLEC8(pp, ip, op, e);
pp = ip++;
}
#else
{ unsigned z; /* 32-bit word variant of the same scan */
if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
#if SRLE8 >= 16
if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
#endif
__builtin_prefetch(ip +256, 0);
continue;
a: c = *ip;
ip += ctz32(z)>>3;
SRLEC8(pp, ip, op, e);
pp = ip++;
}
#endif
}
for(;ip < in+inlen; ip++) /* byte-at-a-time tail */
if(*ip != ip[1]) {
c = *ip;
SRLEC8(pp,ip, op, e);
pp = ip;
}
c = *ip;
SRLEC8(pp, ip, op, e); /* flush the final run */
return op - out;
}
#endif
/* RLE encode with automatic escape-byte selection.
 * Histograms the input, special-cases an all-identical buffer
 * (1-byte output), otherwise picks the least frequent byte value as
 * the escape (ties resolve to the highest value), stores it as the
 * first output byte and encodes with _srlec8. If the encoded form is
 * not smaller than the input, the input is stored verbatim.
 * Returns the output length (0 for empty input). */
unsigned srlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out) {
  unsigned cnt[256] = {0};
  unsigned best = 0xffffffffu, esc = 0, v, used;
  size_t clen;

  if(inlen < 1)
    return 0;

  used = hist(in, inlen, cnt);
  if(cnt[used-1] == inlen) {          /* all bytes identical */
    *out = *in;
    return 1;
  }

  /* least frequent byte -> escape; <= keeps the highest value on ties */
  for(v = 0; v < 256; v++)
    if(cnt[v] <= best) {
      best = cnt[v];
      esc  = v;
    }

  *out = esc;                         /* escape byte leads the stream */
  clen = _srlec8(in, inlen, out + 1, esc) + 1;
  if(clen >= inlen) {                 /* incompressible: store verbatim */
    memcpy(out, in, inlen);
    return inlen;
  }
  return clen;
}
//------------------------------------------------- TurboRLE ------------------------------------------
struct u { unsigned c,i; };
#define PUTC(op, x) *op++ = x
#define TRLEC(pp, ip, op, _goto_) do {\
unsigned _i = ip - pp;\
if(_i >= TMIN) {\
unsigned char *q = op; \
vbzput(op, _i-TMIN, m, rmap); \
if((op-q) + 1 < _i) { *op++ = c; _goto_; } op=q;\
} while(_i--) PUTC(op,c);\
} while(0)
#define TRLEC0(pp, ip, op, _goto_) do { unsigned _i = ip - pp;\
if(_i >= TMIN) { vbzput(op, _i-TMIN, m, rmap); *op++ = c; } else while(_i--) PUTC(op,c);\
} while(0)
/* TurboRLE encode: maps run lengths onto byte values that do not
 * occur in the input. Output format: flag byte (1 = bitmap mode,
 * 0 = escape mode), then for bitmap mode a 32-byte bitmap of the
 * unused byte values used as run markers, then the encoded stream.
 * Falls back to single-escape _srlec8 when every byte value occurs,
 * and to a verbatim copy when encoding does not shrink the input. */
unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out) {
int m,i;
unsigned b[256] = {0}, rmap[256],a;
struct u u[256],*v; // sort
unsigned char *op;
const unsigned char *ip,*pp;
uint8_t c;
if(inlen < 1) return 0;
a = hist(in,inlen,b);
if(b[a-1] == inlen) { // all bytes identical: 1-byte output
*out = *in;
return 1;
}
for(i = 0; i < 256; i++) u[i].c = b[i], u[i].i = i,b[i]=0;
/* insertion sort of (count, value) pairs by ascending count */
for(v = u + 1; v < u + 256; ++v)
if(v->c < v[-1].c) {
struct u *w, tmp = *v;
for(w = v; w > u && tmp.c < w[-1].c; --w) *w = w[-1];
*w = tmp;
}
/* mark all byte values with zero count; m = (number unused) - 1 */
for(m = -1,i = 0; i < 256 && !u[i].c; i++)
b[u[i].i]++, ++m;
op = out;
if(m < 0) { // no unused bytes found
size_t l;
*op++ = 0; /* flag 0: escape mode, rarest byte as escape */
*op++ = u[0].i;
l = _srlec8(in, inlen, op, u[0].i)+2;
if(l < inlen) return l;
memcpy(out, in, inlen);
return inlen;
}
*op++ = 1; /* flag 1: bitmap mode */
memset(op, 0, 32);
/* emit 256-bit bitmap of unused values; rmap maps marker index->value */
for(m = -1,i = 0; i < 256; i++)
if(b[i]) {
op[i>>3] |= 1<<(i&7);
rmap[++m] = i;
}
op += 32;
ip = in; pp=in-1;
if(inlen > SRLE8)
while(ip < in+(inlen-1-SRLE8)) {
unsigned long long z;
/* z != 0 at the first 8-byte window containing a byte change */
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#if SRLE8 >= 32
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#endif
__builtin_prefetch(ip +256, 0);
continue;
a: c = *ip;
ip += ctz64(z)>>3; /* advance to the last byte of the run */
TRLEC(pp, ip, op, goto laba); /* TRLEC jumps to the label when it emits a coded run */
laba:pp = ip++;
}
for(;ip < in+inlen; ip++) { /* byte-at-a-time tail */
if(*ip != *(ip+1)) {
c = *ip;
TRLEC(pp, ip, op, goto labb);
labb:pp = ip;
}
}
c = *ip;
TRLEC(pp,ip, op, goto labc); /* flush the final run */
labc:
if(op - out < inlen)
return op - out;
memcpy(out, in, inlen); /* incompressible: store verbatim */
return inlen;
}
#undef USIZE
#undef SRLE8
#define USIZE 16
#include "trlec.c"
#undef USIZE
#define USIZE 32
#include "trlec.c"
#undef USIZE
#define USIZE 64
#include "trlec.c"
#undef USIZE
#else
#define uint_t TEMPLATE3(uint, USIZE, _t)
#define SRLEC(pp, ip, op, e) do {\
unsigned i = ip - pp;\
if(i > 3) { *(uint_t *)op = e; op+=sizeof(uint_t); i -= 3; vbput32(op, i); *(uint_t *)op = c; op+=sizeof(uint_t); }\
else if(c == e) {\
while(i--) { *(uint_t *)op = e; op+=sizeof(uint_t); vbput32(op, 0); }\
} else while(i--) { *(uint_t *)op = c; op+=sizeof(uint_t); }\
} while(0)
#if !SRLE8
unsigned TEMPLATE2(_srlec, USIZE)(const unsigned char *__restrict cin, unsigned inlen, unsigned char *__restrict out, uint_t e) {
unsigned char *op = out;
uint_t *in = (uint_t *)cin, *pp = in-1, *ip=in,c;
unsigned n = inlen/sizeof(uint_t);
unsigned char *p;
if(n > 4)
for(; ip < in+(n-1-4);) {
#if 0
if(* ip == ip[1])
if(*++ip == ip[1])
if(*++ip == ip[1])
if(*++ip == ip[1]) {
ip++; __builtin_prefetch(ip +256, 0);
continue;
}
#else
if(*ip != ip[1]) goto a; ++ip;
if(*ip != ip[1]) goto a; ++ip;
if(*ip != ip[1]) goto a; ++ip;
if(*ip != ip[1]) goto a; ++ip; __builtin_prefetch(ip +256, 0);
continue;
a:;
#endif
c = *ip;
SRLEC(pp,ip, op, e);
pp = ip++;
}
for(;ip < in+n; ip++)
if(*ip != ip[1]) {
c = *ip;
SRLEC(pp,ip, op, e);
pp = ip;
}
c = *ip;
SRLEC(pp, ip, op, e);
#if USIZE > 8
p = (unsigned char *)ip;
while(p < cin+inlen)
*op++ = *p++;
#endif
return op - out;
}
#endif
#undef SRLEC
unsigned TEMPLATE2(srlec, USIZE)(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint_t e) {
size_t l = TEMPLATE2(_srlec, USIZE)(in, inlen, out, e);
if(l < inlen)
return l;
memcpy(out, in, inlen);
return inlen;
}
#endif

View File

@ -1,259 +0,0 @@
/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
#ifndef USIZE
#include <string.h>
#ifdef __SSE__
#include <emmintrin.h>
#endif
#include "trle.h"
#include "trle_.h"
//------------------------------------- RLE with Escape char ------------------------------------------------------------------
//#define MEMSAFE
#define SRLE8 32 // 16//
#define USIZE 8
#include "trled.c"
#if SRLE8
/* 8-bit RLE decode with escape byte e: literals are copied through
 * until e is seen, then a varint run length (vbget32) follows; a
 * zero length encodes a literal e. Decodes until outlen bytes are
 * produced and returns the number of input bytes consumed.
 * The SSE path copies/scans 16 bytes at a time and locates e with
 * movemask+ctz; the unrolled loop may overwrite up to SRLE8 bytes
 * past the current output position (bounded by the loop guard). */
unsigned _srled8(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, unsigned char e) {
const uint8_t *ip = in;
uint8_t *op = out, c;
uint32_t i;
#ifdef __SSE__
__m128i ev = _mm_set1_epi8(e);
#endif
if(outlen >= SRLE8)
while(op < out+(outlen-SRLE8)) {
#ifdef __SSE__ // TODO: test _mm_cmpestrm/_mm_cmpestri on sse4
uint32_t mask;
__m128i u,v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, ev)); if(mask) goto a; op += 16; ip += 16;
#if SRLE8 >= 32
u = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, u); mask = _mm_movemask_epi8(_mm_cmpeq_epi8(u, ev)); if(mask) goto a; op += 16; ip += 16;
#endif
__builtin_prefetch(ip+512, 0);
continue;
a: i = ctz32(mask); /* position of e within the 16-byte block */
op += i; ip += i+1; /* ip now points one past the escape byte */
{
#else
if(likely((c = *(uint8_t *)ip) != e)) {
ip++;
*op++ = c;
} else {
/* NOTE(review): unlike the tail loop below, this scalar path does
 * not appear to advance ip past the escape before vbget32 — looks
 * like a dropped `ip++` in this diff extract; verify upstream. */
#endif
vbget32(ip, i);
if(likely(i)) {
uint8_t c = *ip++; /* run byte follows the length */
i += TMIN;
rmemset(op, c, i);
} else
*op++ = e; /* zero-length run = literal escape byte */
}
}
#define rmemset8(_op_, _c_, _i_) while(_i_--) *_op_++ = _c_
while(op < out+outlen) /* exact (non-overwriting) tail loop */
if(likely((c = *ip) != e)) {
ip++;
*op++ = c;
} else {
int i;
ip++;
vbget32(ip, i);
if(likely(i)) {
c = *ip++;
i += TMIN;
rmemset8(op, c, i);
} else
*op++ = e;
}
return ip - in;
}
#endif
// Decode a serial-RLE block whose first input byte is the escape code header.
unsigned _srled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen) {
  const unsigned char escape = *in;  // header byte: escape code chosen by the encoder
  return _srled8(in + 1, out, outlen, escape);
}
// Top-level escape-char RLE decode with header-mode handling:
//   inlen == 1      -> whole output is a single repeated byte
//   inlen == outlen -> incompressible block stored verbatim
//   otherwise       -> in[0] is the escape code, in[1..] the encoded stream
unsigned srled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen) {
  if(inlen == 1) {
    memset(out, in[0], outlen);
    return inlen;
  }
  if(inlen == outlen) {
    memcpy(out, in, outlen);
    return inlen;
  }
  return _srled8(in + 1, out, outlen, *in);
}
//------------------------------------- TurboRLE ------------------------------------------
// Decode a TurboRLE block.
// Layout: in[0] is a mode byte; 0 selects the escape-char fallback decoder
// (in[1] is then the escape byte).  Otherwise in[0..31] form a 256-bit map:
// bit v set means byte value v acts as an RLE marker.  b[v] stores rank+1 of
// marker v among all markers (0 = ordinary literal byte); m = marker count - 1.
// A marker byte is followed by a run length (decoded by vbzget, parameterized
// by the marker's rank) and the byte value to repeat; stored lengths are
// biased by -3 since shorter runs are never marker-coded.
// NOTE(review): exact vbzget semantics live in trle_.h - not visible here.
unsigned _trled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen) {
  uint8_t b[256] = {0},*op = out;
  const uint8_t *ip;
  int m = -1, i, c;
  if(outlen < 1)
    return 0;
  if(!*in++)                                  // mode 0: escape-char RLE fallback
    return _srled8(in+1, out, outlen, *in)+2; // +2 input bytes: mode + escape header
  for(ip = in; ip < in+32; ip++)              // build marker-rank table from the 256-bit map
    for(i = 0; i < 8; ++i)
      if(((*ip) >> i) & 1)
        b[(ip-in)<<3 | i] = ++m+1;            // rank+1 so that 0 means "literal"
  // Fast loop: rmemset may overshoot, so keep a 32-byte safety margin.
  if(outlen >= 32)
    while(op < out+(outlen-32)) {
      // 8x unrolled literal copy; any marker byte jumps to the run decoder.
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      __builtin_prefetch(ip+256, 0);
      continue;
      a:
      c = b[*ip++];          // marker found: c = rank+1
      vbzget(ip, i, m, c-1); // decode run length for this marker rank
      c = *ip++;             // byte value to repeat
      i += 3;                // undo the -3 length bias
      rmemset(op,c,i);
    }
  // Byte-exact tail loop: never writes past out+outlen.
  while(op < out+outlen) {
    if(likely(!(c = b[*ip])))
      *op++ = *ip++;
    else {
      ip++;
      vbzget(ip, i, m, c-1);
      c = *ip++;
      i += 3;
      rmemset8(op,c,i);
    }
  }
  return ip - in;
}
// Top-level TurboRLE decode with header-mode handling:
//   inlen == 1      -> whole output is a single repeated byte
//   inlen == outlen -> incompressible block stored verbatim
//   otherwise       -> full TurboRLE stream, handled by _trled
unsigned trled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen) {
  if(inlen == 1) {
    memset(out, in[0], outlen);
    return inlen;
  }
  if(inlen == outlen) {
    memcpy(out, in, outlen);
    return inlen;
  }
  return _trled(in, out, outlen);
}
#undef USIZE
#undef rmemset
#undef SRLE8
// Re-include this file to instantiate the generic decoder (the #else branch
// below) for the remaining element widths: 16, 32 and 64 bits.
#define USIZE 16
#include "trled.c"
#undef rmemset
#undef USIZE
#undef runcpy
#define USIZE 32
#include "trled.c"
#undef rmemset
#undef USIZE
#undef runcpy
#define USIZE 64
#include "trled.c"
#undef rmemset
#undef USIZE
#else
//----- Second include pass: instantiate the USIZE-bit decoder (USIZE = 8/16/32/64) -----
// rmemset(op, c, i): fill i elements of value c at op, advancing op by i.
#ifdef MEMSAFE
// Exact fill: writes exactly i elements, never past the run end.
#define rmemset(_op_, _c_, _i_) while(_i_--) *_op_++ = _c_
#elif defined(__SSE__) && USIZE < 64
// SSE fill: stores 32 bytes per iteration and may overshoot the run end;
// callers of the fast loops reserve slack for this.
#define rmemset(_op_, _c_, _i_) do { \
 __m128i *_up = (__m128i *)_op_, cv = TEMPLATE2(_mm_set1_epi, USIZE)(_c_);\
  _op_ += _i_;\
  do { _mm_storeu_si128( _up, cv); _mm_storeu_si128(_up+1, cv); _up+=2; } while(_up < (__m128i *)_op_);\
} while(0)
#else
// Scalar fill: _csetN replicates c into a 64-bit pattern _cc.
// NOTE(review): _cc is computed but the stores below write _c_ one element at
// a time, leaving _cc unused - likely the intent was ctou64(_up) = _cc for
// 8-byte stores.  Behavior is correct as written; confirm before changing.
#define _cset64(_cc,_c_) _cc = _c_
#define _cset32(_cc,_c_) _cc = _c_; _cc = _cc<<32|_cc
#define _cset16(_cc,_c_) _cc = _c_; _cc = _cc<<48|_cc<<32|_cc<<16|_cc
#define _cset8( _cc,_c_) _cc = (uint32_t)_c_<<24 | (uint32_t)_c_<<16 | (uint32_t)_c_<<8 | (uint32_t)_c_; _cc = _cc<<32|_cc
#define rmemset(_op_, _c_, _i_) do { uint64_t _cc; uint8_t *_up = (uint8_t *)_op_; _op_ +=_i_;\
  TEMPLATE2(_cset, USIZE)(_cc,_c_);\
  do {\
    TEMPLATE2(ctou, USIZE)(_up) = _c_; _up += USIZE/8;\
    TEMPLATE2(ctou, USIZE)(_up) = _c_; _up += USIZE/8;\
  } while(_up < (uint8_t *)_op_);\
} while(0)
#endif
// Element type for the current instantiation (uint8_t/uint16_t/uint32_t/uint64_t).
#define uint_t TEMPLATE3(uint, USIZE, _t)
#if !SRLE8
// Generic USIZE-bit escape-RLE decoder (instantiated via the self-include for
// 16/32/64 bits; the 8-bit case uses the hand-tuned _srled8 above instead).
// Scans whole uint_t words: a word equal to the escape value e is followed by
// a varint run length (0 = literal escape word) and the word value to repeat.
// Returns the number of input bytes consumed.
// NOTE(review): *(uint_t *)ip loads are unaligned - assumed tolerated by the
// targeted compilers/platforms; confirm for new ports.
unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned char *__restrict cout, unsigned outlen, uint_t e) {
  uint_t *out = (uint_t *)cout, *op = out, c;
  const unsigned char *ip = in;
  while(op < out+outlen/sizeof(uint_t)) { __builtin_prefetch(ip +384, 0);
    if(likely((c = *(uint_t *)ip) != e)) { // literal word: copy through
      ip += sizeof(uint_t);
      *op++ = c;
    } else {
      int i;
      ip += sizeof(uint_t);                // skip the escape word
      vbget32(ip, i);                      // varint run length
      if(likely(i)) {
        c = *(uint_t *)ip;                 // word value to repeat
        ip += sizeof(uint_t);
        i += 3;                            // undo the -3 length bias
        rmemset(op, c, i);
      } else
        *op++ = e;                         // length 0 encodes a literal escape word
    }
  }
#if USIZE > 8
  // Copy the trailing bytes that do not fill a whole uint_t word.
  { unsigned char *p = (unsigned char *)op;
    while(p < cout+outlen) *p++ = *ip++;
  }
#endif
  return ip - in;
}
#endif
// Top-level USIZE-bit escape-RLE decode with header-mode handling:
//   inlen == 1      -> whole output is a single repeated byte
//   inlen == outlen -> incompressible block stored verbatim
//   otherwise       -> escape-coded stream, handled by _srledUSIZE
unsigned TEMPLATE2(srled, USIZE)(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint_t e) {
  if(inlen == 1) {
    memset(out, in[0], outlen);
    return inlen;
  }
  if(inlen == outlen) {
    memcpy(out, in, outlen);
    return inlen;
  }
  return TEMPLATE2(_srled, USIZE)(in, out, outlen, e);
}
#endif