diff --git a/ext/FastPFor b/ext/FastPFor index 285bcd1..71d54a9 160000 --- a/ext/FastPFor +++ b/ext/FastPFor @@ -1 +1 @@ -Subproject commit 285bcd192d1d04ee579c8c6853289ea4a975292f +Subproject commit 71d54a9793245ae90e69c86a425d4ee1ee6543d8 diff --git a/ext/SPDP_10.c b/ext/SPDP_10.c new file mode 100644 index 0000000..a0e04b2 --- /dev/null +++ b/ext/SPDP_10.c @@ -0,0 +1,238 @@ +/* +SPDP code: SPDP is a unified compression/decompression algorithm that works +well on both binary 32-bit single-precision (float) and binary 64-bit double- +precision (double) floating-point data. + +Copyright (c) 2016, Texas State University. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted for academic, research, experimental, or personal use provided +that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions, and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions, and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Texas State University nor the names of its + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +For all other uses, please contact the Office for Commercialization and Industry +Relations at Texas State University . + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_TABLE_SIZE (1 << 18)

typedef unsigned char byte_t;
typedef unsigned int word_t;

/*
 * Number of entries used in the position table for a compression level:
 * 2^(level + 9), capped at MAX_TABLE_SIZE.  Every level >= 9 maps to the cap;
 * testing that first keeps the shift count in range for any byte_t value.
 */
static size_t spdp_table_size(const byte_t level)
{
  return (level >= 9) ? (size_t)MAX_TABLE_SIZE : ((size_t)1 << (level + 9));
}

/* Non-zero when the six bytes preceding position a equal the six bytes
 * preceding position b.  Both positions must be >= 6. */
static int spdp_context_matches(const byte_t* const buf, const size_t a, const size_t b)
{
  return memcmp(buf + a - 6, buf + b - 6, 6) == 0;
}

/*
 * Compress `length` bytes from buf1 into buf2 and return the compressed size.
 *
 * level  - selects the position-table size (see spdp_table_size).
 * buf1   - input; it is overwritten, because the second stage stores its
 *          result back into buf1.
 * buf2   - output; also used as scratch for the first stage.  Each input byte
 *          produces at most two output bytes (an optional count byte plus a
 *          literal), so buf2 needs room for 2 * length bytes.
 *
 * The position table is a 1 MiB automatic array, so callers need a stack that
 * can hold it.
 */
static size_t compress(const byte_t level, const size_t length, byte_t* const buf1, byte_t* const buf2)
{
  const size_t len = length / sizeof(word_t);
  size_t pos;

  /* Stage 1 (buf1 -> buf2): replace each word by its difference to the word
   * two positions earlier.  memcpy is used for the word accesses so the byte
   * buffers need no particular alignment. */
  word_t prev2 = 0;
  word_t prev1 = 0;
  for (pos = 0; pos < len; pos++) {
    word_t curr;
    word_t diff;
    memcpy(&curr, buf1 + pos * sizeof(word_t), sizeof(curr));
    diff = curr - prev2;
    memcpy(buf2 + pos * sizeof(word_t), &diff, sizeof(diff));
    prev2 = prev1;
    prev1 = curr;
  }
  /* bytes that do not fill a whole word are copied unchanged */
  for (pos = len * sizeof(word_t); pos < length; pos++) {
    buf2[pos] = buf1[pos];
  }

  /* Stage 2 (buf2 -> buf1): gather the bytes with stride 8 and store the
   * difference between consecutive gathered bytes. */
  byte_t prev = 0;
  size_t wpos = 0;
  size_t d;
  for (d = 0; d < 8; d++) {
    size_t src;
    for (src = d; src < length; src += 8) {
      const byte_t curr = buf2[src];
      buf1[wpos] = (byte_t)(curr - prev);
      prev = curr;
      wpos++;
    }
  }

  /* Stage 3 (buf1 -> buf2): lastpos[hist] remembers where the current hash of
   * recent bytes was last seen.  When the six bytes before that position match
   * the six bytes before the current one, emit a count of how many following
   * bytes also match, then always emit one literal byte. */
  const size_t mask = spdp_table_size(level) - 1;
  unsigned int lastpos[MAX_TABLE_SIZE];
  memset(lastpos, 0, (mask + 1) * sizeof(lastpos[0]));

  size_t rpos = 0;
  unsigned int hist = 0;
  wpos = 0;
  while (rpos < length) {
    byte_t val = buf1[rpos];
    size_t lpos = lastpos[hist];
    if ((lpos >= 6) && spdp_context_matches(buf1, lpos, rpos)) {
      byte_t cnt = 0;
      /* rpos stops one short of the end so the final byte is always a literal */
      while ((val == buf1[lpos]) && (cnt < 255) && (rpos < (length - 1))) {
        lastpos[hist] = (unsigned int)rpos;
        hist = (unsigned int)(((hist << 2) ^ val) & mask);
        rpos++;
        lpos++;
        cnt++;
        val = buf1[rpos];
      }
      buf2[wpos] = cnt;
      wpos++;
    }
    buf2[wpos] = val;
    wpos++;
    lastpos[hist] = (unsigned int)rpos;
    hist = (unsigned int)(((hist << 2) ^ val) & mask);
    rpos++;
  }

  return wpos;
}

/*
 * Bounds-checked decoder behind decompress().
 *
 * Reads `length` compressed bytes from buf2 and reconstructs the original data
 * in buf1.  buf2 is reused as scratch and is overwritten with as many bytes as
 * are produced, so both buffers must be able to hold `limit` bytes.
 *
 * Returns 0 on success and stores the number of bytes produced in *usize_out
 * (when non-NULL).  Returns -1, before the later stages run, if the stream ends
 * right after a count byte or would produce more than `limit` bytes.
 */
static int spdp_decode(const byte_t level, const size_t length, byte_t* const buf2,
                       byte_t* const buf1, const size_t limit, size_t* const usize_out)
{
  const unsigned int mask = (unsigned int)(spdp_table_size(level) - 1);
  unsigned int lastpos[MAX_TABLE_SIZE];
  memset(lastpos, 0, ((size_t)mask + 1) * sizeof(lastpos[0]));

  /* Undo stage 3 (buf2 -> buf1): mirror the encoder's table updates. */
  size_t rpos = 0;
  size_t wpos = 0;
  unsigned int hist = 0;
  while (rpos < length) {
    size_t lpos = lastpos[hist];
    if ((lpos >= 6) && spdp_context_matches(buf1, lpos, wpos)) {
      const size_t cnt = buf2[rpos];
      size_t j;
      rpos++;
      /* a count is always followed by a literal; the run plus that literal must fit */
      if ((rpos >= length) || (cnt >= limit - wpos)) return -1;
      /* copied byte by byte: the source may run into bytes written by this loop */
      for (j = 0; j < cnt; j++) {
        const byte_t val = buf1[wpos] = buf1[lpos];
        lastpos[hist] = (unsigned int)wpos;
        hist = ((hist << 2) ^ val) & mask;
        wpos++;
        lpos++;
      }
    }
    if (wpos >= limit) return -1;
    const byte_t val = buf1[wpos] = buf2[rpos];
    lastpos[hist] = (unsigned int)wpos;
    hist = ((hist << 2) ^ val) & mask;
    wpos++;
    rpos++;
  }
  const size_t usize = wpos;

  /* Undo stage 2 (buf1 -> buf2): running byte sum, scattered with stride 8. */
  byte_t val = 0;
  size_t d;
  rpos = 0;
  for (d = 0; d < 8; d++) {
    size_t dst;
    for (dst = d; dst < usize; dst += 8) {
      val = (byte_t)(val + buf1[rpos]);
      buf2[dst] = val;
      rpos++;
    }
  }

  /* Undo stage 1 (buf2 -> buf1): add back the word two positions earlier. */
  const size_t len = usize / sizeof(word_t);
  word_t prev2 = 0;
  word_t prev1 = 0;
  size_t pos;
  for (pos = 0; pos < len; pos++) {
    word_t curr;
    memcpy(&curr, buf2 + pos * sizeof(word_t), sizeof(curr));
    curr += prev2;
    memcpy(buf1 + pos * sizeof(word_t), &curr, sizeof(curr));
    prev2 = prev1;
    prev1 = curr;
  }
  for (pos = len * sizeof(word_t); pos < usize; pos++) {
    buf1[pos] = buf2[pos];
  }

  if (usize_out != NULL) *usize_out = usize;
  return 0;
}

/*
 * Decompress `length` bytes from buf2 into buf1.
 *
 * `level` must be the level the data was compressed with.  buf2 is overwritten.
 * The output size is not reported and not bounded here: the caller must know
 * the original size and provide buffers of at least that size.  A stream that
 * ends right after a count byte leaves buf1 only partially written.
 */
static void decompress(const byte_t level, const size_t length, byte_t* const buf2, byte_t* const buf1)
{
  (void)spdp_decode(level, length, buf2, buf1, (size_t)-1, NULL);
}
#ifndef NMAIN
#define BUFFER_SIZE (1 << 23)
static byte_t buffer1[BUFFER_SIZE];
static byte_t buffer2[BUFFER_SIZE * 2 + 9];

/* Write `size` bytes to stdout; non-zero on success. */
static int spdp_write(const void* const data, const size_t size)
{
  return (size == 0) || (fwrite(data, 1, size, stdout) == size);
}

/*
 * Command-line filter.  With one argument (the level) stdin is compressed to
 * stdout; with no argument stdin is decompressed to stdout.
 *
 * Stream layout: one level byte, then per chunk an int original length, an int
 * compressed size and the compressed bytes.
 *
 * Returns 0 on success, -1 on bad usage, -2 when the input is not a valid
 * compressed stream, -3 when writing to stdout fails.
 */
int main(int argc, char *argv[])
{
  fprintf(stderr, "SPDP Floating-Point Compressor v1.0\n");
  fprintf(stderr, "Copyright (c) 2016 Texas State University\n\n");

  if ((argc != 1) && (argc != 2)) {
    fprintf(stderr, "compression usage: %s level < uncompressed_file > compressed_file\n", argv[0]);
    fprintf(stderr, "decompression usage: %s < compressed_file > decompressed_file\n", argv[0]);
    return -1;
  }

  if (argc == 2) {  // compression
    /* clamp as a signed value first; converting straight to byte_t would turn
     * a negative level into a large one */
    const long requested = strtol(argv[1], NULL, 10);
    const byte_t level = (byte_t)((requested < 0) ? 0 : (requested > 9) ? 9 : requested);
    if (!spdp_write(&level, sizeof(byte_t))) goto write_failed;

    size_t got;
    while ((got = fread(buffer1, sizeof(byte_t), BUFFER_SIZE, stdin)) > 0) {
      const int length = (int)got;
      const int csize = (int)compress(level, got, buffer1, buffer2);
      if (!spdp_write(&length, sizeof(int)) ||
          !spdp_write(&csize, sizeof(int)) ||
          !spdp_write(buffer2, (size_t)csize)) goto write_failed;
    }
  } else {  // decompression
    byte_t level = 10;
    if ((fread(&level, sizeof(byte_t), 1, stdin) != 1) || (level > 9)) {
      fprintf(stderr, "incorrect input file type\n");
      return -2;
    }

    int length;
    while (fread(&length, sizeof(int), 1, stdin) == 1) {
      int csize;
      size_t usize = 0;
      /* sizes come from the file: check them against the static buffers */
      if ((fread(&csize, sizeof(int), 1, stdin) != 1) ||
          (length <= 0) || (length > BUFFER_SIZE) ||
          (csize <= 0) || ((size_t)csize > sizeof(buffer2)) ||
          (fread(buffer2, sizeof(byte_t), (size_t)csize, stdin) != (size_t)csize) ||
          (spdp_decode(level, (size_t)csize, buffer2, buffer1, BUFFER_SIZE, &usize) != 0) ||
          (usize != (size_t)length)) {
        fprintf(stderr, "corrupt or truncated input\n");
        return -2;
      }
      if (!spdp_write(buffer1, (size_t)length)) goto write_failed;
    }
  }

  if (fflush(stdout) != 0) goto write_failed;
  return 0;

write_failed:
  fprintf(stderr, "error writing output\n");
  return -3;
}
#endif
b/ext/lz4 index f76ee4e..b5233d3 160000 --- a/ext/lz4 +++ b/ext/lz4 @@ -1 +1 @@ -Subproject commit f76ee4e267e567bbce611aecd91f41b5de3b44d5 +Subproject commit b5233d3726b416b1176c71483d20b4c543851c6f diff --git a/ext/streamvbyte b/ext/streamvbyte index 5249a2f..a0bc621 160000 --- a/ext/streamvbyte +++ b/ext/streamvbyte @@ -1 +1 @@ -Subproject commit 5249a2f11eafc23a583e321937f84466d7cb62d6 +Subproject commit a0bc6210ca9e4cd684d2e1d7649614f1cd384897 diff --git a/ext/trle.h b/ext/trle.h new file mode 100644 index 0000000..4740665 --- /dev/null +++ b/ext/trle.h @@ -0,0 +1,72 @@ +/** + Copyright (C) powturbo 2015-2018 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
#ifndef TRLE_H_
#define TRLE_H_

#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif
// RLE with specified escape char
// _srlecN: encode inlen bytes of in as N-bit elements using escape value e; returns the number of bytes written to out.
// _srledN: decode until outlen bytes of out are produced; returns the number of input bytes consumed.
unsigned _srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
unsigned _srled8( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint8_t e);

unsigned _srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
unsigned _srled16(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint16_t e);

unsigned _srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
unsigned _srled32(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint32_t e);

unsigned _srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
unsigned _srled64(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint64_t e);

// functions w/ overflow handling
// srlecN: like _srlecN, but when the encoded form is not smaller than inlen the input is copied to out unchanged and inlen is returned.
// srledN: inlen == outlen means "stored unchanged" (plain copy); inlen == 1 fills out with in[0]; otherwise _srledN is used.
unsigned srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
unsigned srled8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint8_t e);

unsigned srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
unsigned srled16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint16_t e);

unsigned srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
unsigned srled32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint32_t e);

unsigned srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
unsigned srled64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint64_t e);

// RLE w. automatic escape char determination
// srlec stores the chosen escape byte (a least frequent byte value) as the first output byte;
// _srled/srled read it back from in[0].
unsigned srlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
unsigned _srled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
unsigned srled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);

// Turbo RLE
// trlec uses byte values that do not occur in the input as run-length codes and falls back to
// the escape scheme when every byte value occurs.
unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
unsigned _trled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
unsigned trled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
#ifdef __cplusplus
}
#endif

#endif // TRLE_H_
+ + - email : powturbo [AT] gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + TurboRLE - "Most efficient and fastest Run Length Encoding" +**/ +//------------------------- Variable Byte from https://github.com/powturbo/TurboPFor ----------------------------------------------------- +#include "../conf.h" +#define VB_SIZE 64 +#define VB_MAX 254 +#define VB_B2 6 +#define VB_B3 3 +#define VB_BA3 (VB_MAX - (VB_SIZE/8 - 3)) +#define VB_BA2 (VB_BA3 - (1<> 8); *_op_++ = (_x_);*/ _act_; }\ + else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\ + else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\ +} + +#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ + if(likely(_x_ < VB_OFS1)) { _act_ ;}\ + else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \ + else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\ + else { unsigned _b = _x_-VB_BA3; _x_ = ctou32(_ip_) & ((1u << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\ +} while(0) + +#define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); } +#define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;) + +#define vbzput(_op_, _x_, _m_, _emap_) do { if(unlikely((_x_) < _m_)) *_op_++ = _emap_[_x_]; else { unsigned _xi = (_x_) - _m_; *_op_++ = _emap_[_m_]; vbput32(_op_, _xi); } } while(0) +#define vbzget(_ip_, _x_, _m_, _e_) { _x_ = _e_; if(unlikely(_x_ == _m_)) { vbget32(_ip_,_x_); _x_+=_m_; } } + +#define TMIN 3 diff --git a/ext/trlec.c b/ext/trlec.c new file mode 100644 index 0000000..631ae77 --- /dev/null +++ b/ext/trlec.c @@ -0,0 +1,349 @@ +/** + Copyright (C) powturbo 2015-2018 + GPL v2 License + + This program is free software; you can redistribute it and/or 
//------------------------------------- Histogram ---------------------------------------------------------
// Add the byte counts of in[0..inlen) to cc[0..255] (cc is accumulated into, not cleared).
// Returns the highest byte value that has a non-zero count in cc, plus one; 1 if only cc[0]
// (or nothing) is set.
// Only bytes inside in[0..inlen) are read, and they are read as single bytes, so short inputs
// and unaligned buffers are fine.
static inline unsigned hist(const unsigned char *__restrict in, unsigned inlen, unsigned *cc) {
  // Eight separate tables, as in the original 8-way layout; presumably so that neighbouring
  // equal bytes do not update the same counter back to back.
  unsigned lane[8][256] = {{0}};
  const unsigned char *ip = in;
  const unsigned char *const blocks_end = in + (inlen & ~(16u-1));
  const unsigned char *const end = in + inlen;
  unsigned i, k, a;

  for(; ip != blocks_end; ip += 16)                  // whole 16-byte blocks
    for(k = 0; k < 16; k++)
      lane[k & 7][ip[k]]++;

  while(ip < end) lane[0][*ip++]++;                  // remaining 0..15 bytes

  for(i = 0; i < 256; i++) {
    unsigned sum = 0;
    for(k = 0; k < 8; k++) sum += lane[k][i];
    cc[i] += sum;
  }
  a = 256;
  while(a > 1 && !cc[a-1]) a--;
  return a;
}
// Escape-based RLE with automatic escape selection.
// Output layout: out[0] = escape byte, followed by the _srlec8 stream.
// Special cases: returns 0 for empty input; if the input is one byte value repeated, only that
// byte is written and 1 is returned; if the encoded form is not smaller than inlen, the input is
// copied unchanged and inlen is returned.
// The encoder runs before the size check, so out may receive more than inlen bytes before the
// fallback copy.
unsigned srlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out) {
  unsigned m = 0xffffffffu, mi = 0, i, b[256] = {0},a;
  size_t l;
  if(inlen < 1) return 0;

  a = hist(in,inlen,b);
  if(b[a-1] == inlen) {                 // the highest byte value present accounts for every byte: single-value input
    *out = *in;
    return 1;
  }

  for(i = 0; i < 256; i++)              // escape = byte value with the smallest count (last one on ties, because of <=)
    if(b[i] <= m)
      m = b[i],mi = i;
  *out = mi;
  l = _srlec8(in, inlen, out+1, mi)+1;  // +1 for the escape byte stored in out[0]
  if(l < inlen)
    return l;
  memcpy(out, in, inlen);
  return inlen;
}

//------------------------------------------------- TurboRLE ------------------------------------------
// Sort record: c = occurrence count, i = byte value.
struct u { unsigned c,i; };

#define PUTC(op, x) *op++ = x
// Emit the run of byte c that ends at ip (length ip - pp). Uses the caller's locals c, m and rmap.
// Runs of at least TMIN are written as vbzput(length - TMIN) followed by c, but only when that is
// shorter than the run itself; in that case _goto_ is executed to skip the literal path.
// Otherwise op is rewound and the run is written as plain bytes.
#define TRLEC(pp, ip, op, _goto_) do {\
  unsigned _i = ip - pp;\
  if(_i >= TMIN) {\
    unsigned char *q = op; \
    vbzput(op, _i-TMIN, m, rmap); \
    if((op-q) + 1 < _i) { *op++ = c; _goto_; } op=q;\
  } while(_i--) PUTC(op,c);\
} while(0)

// Variant without the size comparison: every run of at least TMIN is coded. Not used in the code below.
#define TRLEC0(pp, ip, op, _goto_) do { unsigned _i = ip - pp;\
  if(_i >= TMIN) { vbzput(op, _i-TMIN, m, rmap); *op++ = c; } else while(_i--) PUTC(op,c);\
} while(0)

// TurboRLE encoder: byte values that never occur in the input are used as run-length codes.
// Output layout:
//   out[0] == 0 : every byte value occurs; out[1] = escape byte, followed by the _srlec8 stream
//   out[0] == 1 : 32-byte bitmap of the unused byte values, followed by the coded data
// Special cases are the same as for srlec (empty input, single-value input, fallback copy when
// the result is not smaller than inlen).
unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out) {
  int m,i;
  unsigned b[256] = {0}, rmap[256],a;
  struct u u[256],*v; // sort
  unsigned char *op;
  const unsigned char *ip,*pp;
  uint8_t c;
  if(inlen < 1) return 0;

  a = hist(in,inlen,b);
  if(b[a-1] == inlen) {                 // single-value input: store just that byte
    *out = *in;
    return 1;
  }

  // Move the counts into u[] and clear b[]; b[] is reused below as an "unused byte value" flag.
  for(i = 0; i < 256; i++) u[i].c = b[i], u[i].i = i,b[i]=0;
  // Insertion sort of u[] by ascending count.
  for(v = u + 1; v < u + 256; ++v)
    if(v->c < v[-1].c) {
      struct u *w, tmp = *v;
      for(w = v; w > u && tmp.c < w[-1].c; --w) *w = w[-1];
      *w = tmp;
    }

  // The leading zero-count entries are the unused byte values; m ends as their number minus one.
  for(m = -1,i = 0; i < 256 && !u[i].c; i++)
    b[u[i].i]++, ++m;

  op = out;

  if(m < 0) { // no unused bytes found
    size_t l;
    *op++ = 0;
    *op++ = u[0].i;                               // least frequent byte value becomes the escape
    l = _srlec8(in, inlen, op, u[0].i)+2;         // +2 for the two header bytes
    if(l < inlen) return l;
    memcpy(out, in, inlen);
    return inlen;
  }

  *op++ = 1;
  memset(op, 0, 32);
  // Bitmap of unused byte values; rmap[k] is the k-th unused value in ascending order and
  // m is recomputed as the highest valid rmap index.
  for(m = -1,i = 0; i < 256; i++)
    if(b[i]) {
      op[i>>3] |= 1<<(i&7);
      rmap[++m] = i;
    }
  op += 32;

  // pp marks the last byte of the previous run and starts one before the input.
  ip = in; pp=in-1;
  if(inlen > SRLE8)
    while(ip < in+(inlen-1-SRLE8)) {
      unsigned long long z;
      // XOR of the 8 bytes at ip with the 8 bytes at ip+1 is zero while the run continues.
      // NOTE(review): ctou64/ctz64 come from conf.h (not visible here); presumably an unaligned
      // 64-bit load and count-trailing-zeros, and the >>3 below assumes little-endian order - confirm.
      if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
      if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
        #if SRLE8 >= 32
      if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
      if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
        #endif
      __builtin_prefetch(ip +256, 0);
      continue;
      a: c = *ip;
      ip += ctz64(z)>>3;                           // advance to the last byte of the run
      TRLEC(pp, ip, op, goto laba);
      laba:pp = ip++;
    }

  // Byte-wise tail. NOTE(review): for the last byte this compares against in[inlen], one past the input.
  for(;ip < in+inlen; ip++) {
    if(*ip != *(ip+1)) {
      c = *ip;
      TRLEC(pp, ip, op, goto labb);
      labb:pp = ip;
    }
  }

  // NOTE(review): ip == in+inlen here, so c is read from in[inlen] and the final TRLEC covers one
  // position past the input; presumably callers provide readable padding and the decoder stops
  // at outlen - confirm against callers.
  c = *ip;
  TRLEC(pp,ip, op, goto labc);
  labc:
  if(op - out < inlen)
    return op - out;
  memcpy(out, in, inlen);
  return inlen;
}
// Size-checked wrapper generated for each USIZE: encodes with _srlec<USIZE> and escape value e.
// Returns the encoded length when it is smaller than inlen; otherwise the input is copied to out
// unchanged and inlen is returned.
// The encoder runs before the check, so out may receive more than inlen bytes before the
// fallback copy.
unsigned TEMPLATE2(srlec, USIZE)(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint_t e) {
  size_t l = TEMPLATE2(_srlec, USIZE)(in, inlen, out, e);

  if(l < inlen)
    return l;
  memcpy(out, in, inlen);
  return inlen;
}
+ + - email : powturbo [AT] gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + TurboRLE - "Most efficient and fastest Run Length Encoding" +**/ + #ifndef USIZE +#include + #ifdef __SSE__ +#include + #endif + +#include "trle.h" +#include "trle_.h" + +//------------------------------------- RLE with Escape char ------------------------------------------------------------------ +//#define MEMSAFE +#define SRLE8 32 // 16// +#define USIZE 8 +#include "trled.c" + + #if SRLE8 +unsigned _srled8(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, unsigned char e) { + const uint8_t *ip = in; + uint8_t *op = out, c; + uint32_t i; + #ifdef __SSE__ + __m128i ev = _mm_set1_epi8(e); + #endif + if(outlen >= SRLE8) + while(op < out+(outlen-SRLE8)) { + + #ifdef __SSE__ // TODO: test _mm_cmpestrm/_mm_cmpestri on sse4 + uint32_t mask; + __m128i u,v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, ev)); if(mask) goto a; op += 16; ip += 16; + #if SRLE8 >= 32 + u = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, u); mask = _mm_movemask_epi8(_mm_cmpeq_epi8(u, ev)); if(mask) goto a; op += 16; ip += 16; + #endif + __builtin_prefetch(ip+512, 0); + continue; + a: i = ctz32(mask); + op += i; ip += i+1; + { + #else + if(likely((c = *(uint8_t *)ip) != e)) { + ip++; + *op++ = c; + } else { + #endif + vbget32(ip, i); + if(likely(i)) { + uint8_t c = *ip++; + i += TMIN; + rmemset(op, c, i); + } else + *op++ = e; + } + } + + #define rmemset8(_op_, _c_, _i_) while(_i_--) *_op_++ = _c_ + while(op < out+outlen) + if(likely((c = *ip) != e)) { + ip++; + *op++ = c; + } else { + int i; + ip++; + vbget32(ip, i); + if(likely(i)) { + c = *ip++; + i += TMIN; + rmemset8(op, c, i); + } else + *op++ = e; + } + return ip - in; +} + #endif + +unsigned _srled(const unsigned char *__restrict in, unsigned char 
// Decoder for srlec output with the storage modes handled:
//   inlen == outlen : data was stored unchanged, plain copy
//   inlen == 1      : single byte value, out is filled with in[0]
//   otherwise       : in[0] is the escape byte, the rest is an _srled8 stream
// Returns inlen for the first two modes; for the third it returns the _srled8 result, which does
// not count the escape byte in in[0].
unsigned srled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen) {
  if(inlen == outlen)
    memcpy(out, in, outlen);
  else if(inlen == 1)
    memset(out, in[0], outlen);
  else
    return _srled8(in+1, out, outlen, *in);
  return inlen;
}
//------------------------------------- TurboRLE ------------------------------------------
// Decode a trlec stream until outlen bytes are produced.
//   in[0] == 0 : in[1] is the escape byte and an _srled8 stream follows
//   otherwise  : a 32-byte bitmap of the byte values used as run-length codes follows in[0]
// Returns 0 when outlen is 0. In the first mode the return value is the _srled8 result plus the
// two header bytes; in the second it is counted from in+1 (in has already been advanced past the
// mode byte).
unsigned _trled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen) {
  uint8_t b[256] = {0},*op = out;
  const uint8_t *ip;
  int m = -1, i, c;

  if(outlen < 1)
    return 0;

  if(!*in++)
    return _srled8(in+1, out, outlen, *in)+2;

  // Rebuild the code table from the bitmap: b[v] = (rank of v among the code bytes) + 1, and 0
  // for ordinary bytes. m ends as the highest rank. ip is left pointing just past the bitmap.
  for(ip = in; ip < in+32; ip++)
    for(i = 0; i < 8; ++i)
      if(((*ip) >> i) & 1)
        b[(ip-in)<<3 | i] = ++m+1;

  // Fast loop: copies literals eight at a time and stops 32 bytes before the end of out.
  // NOTE(review): rmemset can store past the end of a run; presumably out needs slack beyond
  // outlen - confirm against callers.
  if(outlen >= 32)
    while(op < out+(outlen-32)) {
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      if(b[*ip]) goto a; *op++ = *ip++;
      __builtin_prefetch(ip+256, 0);
      continue;
      a:
      c = b[*ip++];            // rank + 1 of the code byte
      vbzget(ip, i, m, c-1);   // rank below m is the length itself; rank m means an extended length follows
      c = *ip++;               // byte to repeat
      i += 3;                  // stored lengths are offset by 3 (the encoder subtracts TMIN)
      rmemset(op,c,i);
    }
  // Tail loop: same decoding with the byte-exact rmemset8.
  while(op < out+outlen) {
    if(likely(!(c = b[*ip])))
      *op++ = *ip++;
    else {
      ip++;
      vbzget(ip, i, m, c-1);
      c = *ip++;
      i += 3;
      rmemset8(op,c,i);
    }
  }
  return ip - in;
}

// Decoder for trlec output with the storage modes handled:
//   inlen == outlen : data was stored unchanged, plain copy
//   inlen == 1      : single byte value, out is filled with in[0]
//   otherwise       : _trled
// Returns inlen for the first two modes, otherwise the _trled result.
unsigned trled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen) {
  if(inlen == outlen)
    memcpy(out, in, outlen);
  else if(inlen == 1)
    memset(out, in[0], outlen);
  else
    return _trled(in, out, outlen);
  return inlen;
}
  #else
// Template branch: compiled once per "#include" of this file with USIZE set to the element width
// in bits.
//
// rmemset(op, c, n): store n copies of element c at op and advance op by n elements.
// Only the MEMSAFE variant is exact. The other two store two units per iteration, so they can
// write past the end of the run.
    #ifdef MEMSAFE
#define rmemset(_op_, _c_, _i_) while(_i_--) *_op_++ = _c_
    #elif defined(__SSE__) && USIZE < 64
// SSE variant: two 16-byte stores per iteration, at least one iteration.
#define rmemset(_op_, _c_, _i_) do { \
  __m128i *_up = (__m128i *)_op_, cv = TEMPLATE2(_mm_set1_epi, USIZE)(_c_);\
  _op_ += _i_;\
  do { _mm_storeu_si128( _up, cv); _mm_storeu_si128(_up+1, cv); _up+=2; } while(_up < (__m128i *)_op_);\
} while(0)
    #else
// _csetN replicate an N-bit value across a 64-bit word.
#define _cset64(_cc,_c_) _cc = _c_
#define _cset32(_cc,_c_) _cc = _c_; _cc = _cc<<32|_cc
#define _cset16(_cc,_c_) _cc = _c_; _cc = _cc<<48|_cc<<32|_cc<<16|_cc
#define _cset8( _cc,_c_) _cc = (uint32_t)_c_<<24 | (uint32_t)_c_<<16 | (uint32_t)_c_<<8 | (uint32_t)_c_; _cc = _cc<<32|_cc

// Scalar variant: two USIZE-bit stores per iteration, at least one iteration.
// NOTE(review): _cc is computed but the stores below write _c_, not _cc.
#define rmemset(_op_, _c_, _i_) do { uint64_t _cc; uint8_t *_up = (uint8_t *)_op_; _op_ +=_i_;\
  TEMPLATE2(_cset, USIZE)(_cc,_c_);\
  do {\
    TEMPLATE2(ctou, USIZE)(_up) = _c_; _up += USIZE/8;\
    TEMPLATE2(ctou, USIZE)(_up) = _c_; _up += USIZE/8;\
  } while(_up < (uint8_t *)_op_);\
} while(0)
    #endif

#define uint_t TEMPLATE3(uint, USIZE, _t)

// Generated only while SRLE8 is not defined as non-zero, i.e. not for the USIZE 8 pass, where
// the hand-written _srled8 is used instead.
  #if !SRLE8
// Decode an escape-coded stream of USIZE-bit elements until outlen bytes are produced.
//   in     - encoded data
//   cout   - destination for outlen decoded bytes
//   e      - escape element
// Returns the number of input bytes consumed.
// Stream format: an element other than e is a literal. e is followed by a vbget32 value n:
// n == 0 stands for one literal e, otherwise the next element is repeated n + 3 times.
// Elements are read and written through uint_t pointer casts.
unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned char *__restrict cout, unsigned outlen, uint_t e) {
  uint_t *out = (uint_t *)cout, *op = out, c;
  const unsigned char *ip = in;

  while(op < out+outlen/sizeof(uint_t)) { __builtin_prefetch(ip +384, 0);
    if(likely((c = *(uint_t *)ip) != e)) {
      ip += sizeof(uint_t);
      *op++ = c;
    } else {
      int i;
      ip += sizeof(uint_t);          // step over the escape element
      vbget32(ip, i);
      if(likely(i)) {
        c = *(uint_t *)ip;
        ip += sizeof(uint_t);
        i += 3;
        rmemset(op, c, i);
      } else
        *op++ = e;
    }
  }
    #if USIZE > 8
  // Remaining bytes up to cout+outlen are copied verbatim from the input.
  { unsigned char *p = (unsigned char *)op;
    while(p < cout+outlen) *p++ = *ip++;
  }
    #endif
  return ip - in;
}
  #endif

// Decoder with the storage modes handled:
//   inlen == outlen : data was stored unchanged, plain copy
//   inlen == 1      : out is filled with the byte in[0]
//   otherwise       : _srled<USIZE> with escape element e
// Returns inlen for the first two modes, otherwise the _srled<USIZE> result.
unsigned TEMPLATE2(srled, USIZE)(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint_t e) {
  if(inlen == outlen)
    memcpy(out, in, outlen);
  else if(inlen == 1)
    memset(out, in[0], outlen);
  else
    return TEMPLATE2(_srled, USIZE)(in, out, outlen, e);
  return inlen;
}
  #endif