IcBench App: External libs

This commit is contained in:
x
2018-03-02 17:11:31 +01:00
parent f10802cbfe
commit 17d099f318
9 changed files with 984 additions and 4 deletions

238
ext/SPDP_10.c Normal file
View File

@ -0,0 +1,238 @@
/*
SPDP code: SPDP is a unified compression/decompression algorithm that works
well on both binary 32-bit single-precision (float) and binary 64-bit double-
precision (double) floating-point data.
Copyright (c) 2016, Texas State University. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted for academic, research, experimental, or personal use provided
that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions, and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions, and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Texas State University nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
For all other uses, please contact the Office for Commercialization and Industry
Relations at Texas State University <http://www.txstate.edu/ocir/>.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Authors: Martin Burtscher and Steven Claggett
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define MAX_TABLE_SIZE (1 << 18)
typedef unsigned char byte_t;
typedef unsigned int word_t;
/*
 * Compress `length` bytes held in buf1 and leave the encoded stream in buf2.
 *
 * level  - selects the prediction table size: 2^(level + 9) entries, capped at
 *          MAX_TABLE_SIZE (which is reached at level 9; larger values behave
 *          like level 9).
 * length - number of input bytes in buf1.
 * buf1   - input data; it is OVERWRITTEN (used as scratch space for stage 2).
 * buf2   - output buffer; also used as scratch space for stage 1. Every input
 *          byte can produce a count byte plus a literal byte, so the buffer
 *          must provide at least 2 * length bytes.
 *
 * Returns the number of bytes written to buf2.
 *
 * The encoder runs three stages:
 *   1. word-level delta against the word two positions earlier,
 *   2. byte shuffle with stride 8 combined with a byte-level delta,
 *   3. a hash-predicted match coder: when the 6 bytes preceding the predicted
 *      position equal the 6 bytes preceding the current position, a match
 *      count (0..255) is emitted, followed by one literal byte.
 *
 * Words are moved with memcpy so that neither buffer has to be word-aligned
 * and no byte array is accessed through an incompatible pointer type.
 */
static size_t compress(const byte_t level, const size_t length, byte_t* const buf1, byte_t* const buf2)
{
  /* Stage 1: buf1 -> buf2, out[i] = in[i] - in[i - 2] on whole words. */
  const size_t len = length / sizeof(word_t);
  word_t prev2 = 0;
  word_t prev1 = 0;
  size_t pos;
  for (pos = 0; pos < len; pos++) {
    word_t curr;
    word_t diff;
    memcpy(&curr, buf1 + pos * sizeof(word_t), sizeof(word_t));
    diff = curr - prev2;
    memcpy(buf2 + pos * sizeof(word_t), &diff, sizeof(word_t));
    prev2 = prev1;
    prev1 = curr;
  }
  /* Trailing bytes that do not fill a whole word are copied verbatim. */
  for (pos = len * sizeof(word_t); pos < length; pos++) {
    buf2[pos] = buf1[pos];
  }

  /* Stage 2: buf2 -> buf1, gather every 8th byte and delta-code the bytes. */
  byte_t prev = 0;
  size_t wpos = 0;
  size_t d;
  for (d = 0; d < 8; d++) {
    size_t rpos;
    for (rpos = d; rpos < length; rpos += 8) {
      byte_t curr = buf2[rpos];
      buf1[wpos] = curr - prev;
      prev = curr;
      wpos++;
    }
  }

  /* Stage 3: buf1 -> buf2, hash-predicted match coding. */
  /* level >= 9 already saturates the table; testing it first also avoids an
     out-of-range shift for large level values. */
  const size_t predtabsize = (level >= 9) ? (size_t)MAX_TABLE_SIZE : ((size_t)1 << (level + 9));
  const size_t predtabsizem1 = predtabsize - 1;
  unsigned int lastpos[MAX_TABLE_SIZE];
  memset(lastpos, 0, predtabsize * sizeof(unsigned int));
  size_t rpos = 0;
  wpos = 0;
  unsigned int hist = 0;
  while (rpos < length) {
    byte_t val = buf1[rpos];
    unsigned int lpos = lastpos[hist];
    /* lpos is an earlier value of rpos, so rpos > lpos >= 6 keeps the
       look-behind indices below in range. */
    if (lpos >= 6) {
      if ((buf1[lpos - 6] == buf1[rpos - 6]) && (buf1[lpos - 5] == buf1[rpos - 5]) &&
          (buf1[lpos - 4] == buf1[rpos - 4]) && (buf1[lpos - 3] == buf1[rpos - 3]) &&
          (buf1[lpos - 2] == buf1[rpos - 2]) && (buf1[lpos - 1] == buf1[rpos - 1])) {
        byte_t cnt = 0;
        /* The last input byte is never folded into the match so that a
           literal always follows the count. */
        while ((val == buf1[lpos]) && (cnt < 255) && (rpos < (length - 1))) {
          lastpos[hist] = rpos;
          hist = ((hist << 2) ^ val) & predtabsizem1;
          rpos++;
          lpos++;
          cnt++;
          val = buf1[rpos];
        }
        buf2[wpos] = cnt;
        wpos++;
      }
    }
    buf2[wpos] = val;
    wpos++;
    lastpos[hist] = rpos;
    hist = ((hist << 2) ^ val) & predtabsizem1;
    rpos++;
  }
  return wpos;
}
/*
 * Inverse of compress(): decode `length` encoded bytes from buf2 and leave the
 * reconstructed data in buf1.
 *
 * level  - must be the level that was used for compression (it fixes the
 *          prediction table size, 2^(level + 9) capped at MAX_TABLE_SIZE).
 * length - number of encoded bytes in buf2.
 * buf2   - encoded input; it is OVERWRITTEN (used as scratch for the inverse
 *          byte shuffle).
 * buf1   - output buffer for the decoded data.
 *
 * The decoded size is not returned; the caller is expected to know it.
 * NOTE(review): the output is not bounded here, so a corrupt stream can write
 * past the end of buf1 - the signature offers no way to pass a capacity.
 *
 * Words are moved with memcpy so that neither buffer has to be word-aligned.
 */
static void decompress(const byte_t level, const size_t length, byte_t* const buf2, byte_t* const buf1)
{
  /* level >= 9 already saturates the table; testing it first also avoids an
     out-of-range shift for large level values. */
  const unsigned int predtabsize = (level >= 9) ? (unsigned int)MAX_TABLE_SIZE : (1u << (level + 9));
  const unsigned int predtabsizem1 = predtabsize - 1;
  unsigned int lastpos[MAX_TABLE_SIZE];
  memset(lastpos, 0, predtabsize * sizeof(unsigned int));

  /* Undo stage 3: buf2 -> buf1, replay matches and literals. */
  size_t rpos = 0;
  size_t wpos = 0;
  unsigned int hist = 0;
  while (rpos < length) {
    unsigned int lpos = lastpos[hist];
    if (lpos >= 6) {
      /* Same context test as the encoder: a count byte is present only when
         the 6 bytes before both positions agree. */
      if ((buf1[lpos - 6] == buf1[wpos - 6]) && (buf1[lpos - 5] == buf1[wpos - 5]) &&
          (buf1[lpos - 4] == buf1[wpos - 4]) && (buf1[lpos - 3] == buf1[wpos - 3]) &&
          (buf1[lpos - 2] == buf1[wpos - 2]) && (buf1[lpos - 1] == buf1[wpos - 1])) {
        const byte_t cnt = buf2[rpos];
        byte_t j;
        rpos++;
        for (j = 0; j < cnt; j++) {
          const byte_t copied = buf1[wpos] = buf1[lpos];
          lastpos[hist] = wpos;
          hist = ((hist << 2) ^ copied) & predtabsizem1;
          wpos++;
          lpos++;
        }
        /* The encoder always emits a literal after a count; if it is missing
           the stream is truncated, so stop instead of reading past the input. */
        if (rpos >= length) break;
      }
    }
    const byte_t lit = buf1[wpos] = buf2[rpos];
    lastpos[hist] = wpos;
    hist = ((hist << 2) ^ lit) & predtabsizem1;
    wpos++;
    rpos++;
  }

  /* Undo stage 2: buf1 -> buf2, running byte sum scattered with stride 8. */
  const size_t usize = wpos;
  byte_t acc = 0;
  size_t d;
  rpos = 0;
  for (d = 0; d < 8; d++) {
    size_t dst;
    for (dst = d; dst < usize; dst += 8) {
      acc += buf1[rpos];
      buf2[dst] = acc;
      rpos++;
    }
  }

  /* Undo stage 1: buf2 -> buf1, word[i] = delta[i] + word[i - 2]. */
  const size_t len = usize / sizeof(word_t);
  word_t prev2 = 0;
  word_t prev1 = 0;
  size_t pos;
  for (pos = 0; pos < len; pos++) {
    word_t delta;
    word_t curr;
    memcpy(&delta, buf2 + pos * sizeof(word_t), sizeof(word_t));
    curr = delta + prev2;
    memcpy(buf1 + pos * sizeof(word_t), &curr, sizeof(word_t));
    prev2 = prev1;
    prev1 = curr;
  }
  /* Trailing bytes that do not fill a whole word were stored verbatim. */
  for (pos = len * sizeof(word_t); pos < usize; pos++) {
    buf1[pos] = buf2[pos];
  }
}
#ifndef NMAIN
#define BUFFER_SIZE (1 << 23)
static byte_t buffer1[BUFFER_SIZE];
static byte_t buffer2[BUFFER_SIZE * 2 + 9];
int main(int argc, char *argv[])
{
fprintf(stderr, "SPDP Floating-Point Compressor v1.0\n");
fprintf(stderr, "Copyright (c) 2016 Texas State University\n\n");
if ((argc != 1) && (argc != 2)) {
fprintf(stderr, "compression usage: %s level < uncompressed_file > compressed_file\n", argv[0]);
fprintf(stderr, "decompression usage: %s < compressed_file > decompressed_file\n", argv[0]);
return -1;
}
if (argc == 2) { // compression
byte_t level = atoi(argv[1]);
if (level < 0) level = 0;
if (level > 9) level = 9;
fwrite(&level, sizeof(byte_t), 1, stdout);
int length = fread(buffer1, sizeof(byte_t), BUFFER_SIZE, stdin);
while (length > 0) {
fwrite(&length, sizeof(int), 1, stdout);
int csize = compress(level, length, buffer1, buffer2);
fwrite(&csize, sizeof(int), 1, stdout);
fwrite(buffer2, sizeof(byte_t), csize, stdout);
length = fread(buffer1, sizeof(byte_t), BUFFER_SIZE, stdin);
}
} else { // decompression
byte_t level = 10;
fread(&level, sizeof(byte_t), 1, stdin);
if ((level < 0) || (level > 9)) {
fprintf(stderr, "incorrect input file type\n");
return -2;
}
int length;
while (fread(&length, sizeof(int), 1, stdin) > 0) {
int csize;
fread(&csize, sizeof(int), 1, stdin);
fread(buffer2, sizeof(byte_t), csize, stdin);
decompress(level, csize, buffer2, buffer1);
fwrite(buffer1, sizeof(byte_t), length, stdout);
}
}
return 0;
}
#endif

Submodule ext/lz4 updated: f76ee4e267...b5233d3726

72
ext/trle.h Normal file
View File

@ -0,0 +1,72 @@
/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
#if defined(_MSC_VER) && _MSC_VER < 1600
#include "vs/stdint.h"
#else
#include <stdint.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
// RLE with specified escape char
// Raw encoders (_srlecN) and decoders (_srledN) for N-bit symbols. `e` is the escape symbol that
// introduces an encoded run. In the definitions shown in trlec.c/trled.c the encoders return the
// number of bytes written to `out`, and the decoders produce `outlen` bytes and return the number
// of bytes consumed from `in`.
unsigned _srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
unsigned _srled8( const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint8_t e);
unsigned _srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
unsigned _srled16(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint16_t e);
unsigned _srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
unsigned _srled32(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint32_t e);
unsigned _srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
unsigned _srled64(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, uint64_t e);
// functions w/ overflow handling
// The encoders store the input verbatim (and return inlen) when encoding would not make it smaller.
// The decoders treat inlen == outlen as "stored verbatim" and inlen == 1 as "one byte repeated
// outlen times"; any other input is passed to the raw decoder.
unsigned srlec8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e);
unsigned srled8( const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint8_t e);
unsigned srlec16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint16_t e);
unsigned srled16(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint16_t e);
unsigned srlec32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint32_t e);
unsigned srled32(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint32_t e);
unsigned srlec64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint64_t e);
unsigned srled64(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint64_t e);
// RLE w. automatic escape char determination
// srlec picks a least frequent byte value as escape and stores it as the first output byte;
// _srled/srled read the escape back from in[0].
unsigned srlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
unsigned _srled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
unsigned srled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
// Turbo RLE
// trlec uses byte values that do not occur in the input as run markers and falls back to
// escape-based RLE when every byte value is in use; _trled/trled decode that format.
unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out);
unsigned _trled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen);
unsigned trled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen);
#ifdef __cplusplus
}
#endif

62
ext/trle_.h Normal file
View File

@ -0,0 +1,62 @@
/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
//------------------------- Variable Byte from https://github.com/powturbo/TurboPFor -----------------------------------------------------
// Variable-length encoding of 32-bit values: the first byte selects the total length.
// likely/unlikely, ctou16/ctou32, bswap16 and bsr32 are not defined in this file; presumably they
// come from conf.h (unaligned load/store, byte swap and bit-scan helpers) - confirm there.
#include "../conf.h"
// First-byte layout (values derived from the constants below):
//   VB_BA3  = 254 - (64/8 - 3) = 249 : first bytes >= 249 introduce the long (3+ byte payload) form
//   VB_BA2  = 249 - 8          = 241 : first bytes 241..248 introduce a 3-byte encoding
//   VB_OFS1 = 241 - 64         = 177 : first bytes 177..240 introduce a 2-byte encoding,
//                                      values below 177 are stored in a single byte
//   VB_OFS2 = 177 + 2^14 : first value that needs 3 bytes
//   VB_OFS3 = VB_OFS2 + 2^19 : first value that needs the long form
#define VB_SIZE 64
#define VB_MAX 254
#define VB_B2 6
#define VB_B3 3
#define VB_BA3 (VB_MAX - (VB_SIZE/8 - 3))
#define VB_BA2 (VB_BA3 - (1<<VB_B3))
#define VB_OFS1 (VB_BA2 - (1<<VB_B2))
#define VB_OFS2 (VB_OFS1 + (1 << (8+VB_B2)))
#define VB_OFS3 (VB_OFS2 + (1 << (16+VB_B3)))
// _vblen32: encoded length in bytes of value _x_.
// _vbvlen32: same classification applied to a first byte instead of a value.
// NOTE(review): in the last branch _vbvlen32 yields (_x_ - VB_BA3) while _vbget32 advances by
// 3 + (_x_ - VB_BA3) payload bytes - the two look inconsistent; confirm before relying on _vbvlen32.
#define _vblen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_OFS2?2:((_x_) < VB_OFS3)?3:(bsr32(_x_)+7)/8+1))
#define _vbvlen32(_x_) ((_x_) < VB_OFS1?1:((_x_) < VB_BA2?2:((_x_) < VB_BA3)?3:(_x_-VB_BA3)))
// _vbput32: append the encoding of _x_ at _op_ (advancing _op_), then execute _act_.
// _x_ must be a modifiable lvalue: the 3-byte branch subtracts VB_OFS2 from it in place.
// The long branch stores a full 32-bit word and then advances by only _b bytes, so up to 4 bytes
// at _op_ are written regardless of the encoded length.
#define _vbput32(_op_, _x_, _act_) {\
if(likely((_x_) < VB_OFS1)){ *_op_++ = (_x_); _act_;}\
else if ((_x_) < VB_OFS2) { ctou16(_op_) = bswap16((VB_OFS1<<8)+((_x_)-VB_OFS1)); _op_ += 2; /*(_x_) -= VB_OFS1; *_op_++ = VB_OFS1 + ((_x_) >> 8); *_op_++ = (_x_);*/ _act_; }\
else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\
else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\
}
// _vbget32: decode one value from _ip_ into _x_ (advancing _ip_), then execute _act_.
// The long branch loads a full 32-bit word and masks it down to 3 + _b bytes.
#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\
if(likely(_x_ < VB_OFS1)) { _act_ ;}\
else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \
else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\
else { unsigned _b = _x_-VB_BA3; _x_ = ctou32(_ip_) & ((1u << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\
} while(0)
// Convenience forms without a follow-up action; vbput32 copies _x_ into a local first so the
// caller's expression is not modified.
#define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); }
#define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;)
// vbzput: values below _m_ are written as the single mapped byte _emap_[_x_]; larger values are
// written as _emap_[_m_] followed by the variable-byte encoding of (_x_ - _m_).
// vbzget: inverse, where _e_ is the already mapped-back index of the marker byte.
#define vbzput(_op_, _x_, _m_, _emap_) do { if(unlikely((_x_) < _m_)) *_op_++ = _emap_[_x_]; else { unsigned _xi = (_x_) - _m_; *_op_++ = _emap_[_m_]; vbput32(_op_, _xi); } } while(0)
#define vbzget(_ip_, _x_, _m_, _e_) { _x_ = _e_; if(unlikely(_x_ == _m_)) { vbget32(_ip_,_x_); _x_+=_m_; } }
// Minimum run length that TurboRLE encodes as a run.
#define TMIN 3

349
ext/trlec.c Normal file
View File

@ -0,0 +1,349 @@
/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
#ifndef USIZE
#include <string.h>
#ifdef __SSE__
#include <emmintrin.h>
#endif
#include "trle_.h"
#include "trle.h"
//------------------------------------- Histogram ---------------------------------------------------------
/*
 * Byte histogram.
 *
 * Adds the number of occurrences of every byte value in in[0 .. inlen) to
 * cc[0 .. 255] (cc is accumulated into, not cleared) and returns the index of
 * the highest non-zero entry of cc plus one, with a minimum of 1.
 *
 * Input is consumed in 16-byte blocks and counted into eight separate tables
 * so that consecutive increments do not hit the same table; the tail is
 * counted byte by byte. Words are loaded with memcpy, and only bytes inside
 * in[0 .. inlen) are ever read, so the buffer needs no padding or alignment.
 */
static inline unsigned hist(const unsigned char *__restrict in, unsigned inlen, unsigned *cc) {
  unsigned c0[256]={0},c1[256]={0},c2[256]={0},c3[256]={0},c4[256]={0},c5[256]={0},c6[256]={0},c7[256]={0};
  const unsigned char *ip = in;
  const unsigned char *const blk_end = in + (inlen & ~(16u-1));
  const unsigned char *const end = in + inlen;
  unsigned a;
  int i, k;

  for(; ip != blk_end; ip += 16) {
    for(k = 0; k < 16; k += 8) {
      uint32_t c, d;
      memcpy(&c, ip + k, sizeof c);
      memcpy(&d, ip + k + 4, sizeof d);
      /* Which table a byte lands in is irrelevant: all tables are summed below. */
      c0[(unsigned char) c      ]++;
      c1[(unsigned char) d      ]++;
      c2[(unsigned char)(c>>8)  ]++;
      c3[(unsigned char)(d>>8)  ]++;
      c4[(unsigned char)(c>>16) ]++;
      c5[(unsigned char)(d>>16) ]++;
      c6[                c>>24  ]++;
      c7[                d>>24  ]++;
    }
  }
  while(ip < end) c0[*ip++]++;

  for(i = 0; i < 256; i++)
    cc[i] += c0[i]+c1[i]+c2[i]+c3[i]+c4[i]+c5[i]+c6[i]+c7[i];

  a = 256;
  while(a > 1 && !cc[a-1]) a--;
  return a;
}
//------------------------------------- RLE with Escape char ------------------------------------------------------------------
// SRLE8 is the number of input bytes the fast loop of the 8-bit coder inspects per iteration.
// Defining USIZE and re-including this file expands the templated section at the bottom of the
// file; with SRLE8 non-zero only the srlec8 wrapper is generated there and _srlec8 is the
// hand-written version below.
#define SRLE8 32
#define USIZE 8
#include "trlec.c"
#if SRLE8
// Emit the run of symbol `c` covering (pp, ip], i.e. i = ip - pp bytes:
//  - more than 3 bytes : escape, variable-byte (i - 3), symbol
//  - c equals the escape: every occurrence is written as escape followed by a zero count
//  - otherwise          : the bytes are copied literally
// `c` is taken from the enclosing scope.
#define SRLEC8(pp, ip, op, e) do {\
unsigned i = ip - pp;\
if(i > 3) { *op++ = e; i -= 3; vbput32(op, i); *op++ = c; }\
else if(c == e) {\
while(i--) { *op++ = e; vbput32(op, 0); }\
} else while(i--) *op++ = c;\
} while(0)
// Escape-based RLE encoder for bytes.
// in/inlen: input; out: output buffer; e: escape byte. Returns the number of bytes written.
// pp points at the last byte of the previously emitted run (in-1 before any output), ip scans
// forward until the byte at ip differs from its successor.
// NOTE(review): the final loop compares *ip with ip[1] up to ip == in+inlen-1 and the code after
// it reads *ip at in+inlen, so one byte beyond the input is read; presumably callers provide
// readable slack after `in` - confirm.
unsigned _srlec8(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e) {
const uint8_t *ip = in, *pp = in - 1;
uint8_t *op = out,c;
// Fast path: scan several bytes at a time while at least SRLE8+1 bytes remain.
if(inlen > SRLE8)
while(ip < in+(inlen-1-SRLE8)) {
#if 0 //def __SSE__ // SSE slower than scalar
__m128i cv = _mm_set1_epi8(*ip);
unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(ip+1)), cv)); if(mask != 0xffffu) goto a; ip += 16;
mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(ip+1)), cv)); if(mask != 0xffffu) goto a; ip += 16;
continue;
a: c = *ip;
ip += __builtin_ctz((unsigned short)(~mask));
SRLEC8(pp, ip, op, e);
pp = ip++;
#elif __WORDSIZE == 64
// XOR of the word at ip with the word at ip+1 is zero exactly when all bytes in the window
// equal their successor; a non-zero result marks where the run ends.
{unsigned long long z;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#if SRLE8 >= 32
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#endif
__builtin_prefetch(ip +256, 0);
continue;
a: c = *ip;
// ctz64(z)>>3: index of the first differing byte (presumably assumes little-endian loads).
ip += ctz64(z)>>3;
SRLEC8(pp, ip, op, e);
pp = ip++;
}
#else
// Same scheme with 32-bit words.
{ unsigned z;
if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
#if SRLE8 >= 16
if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
#endif
__builtin_prefetch(ip +256, 0);
continue;
a: c = *ip;
ip += ctz32(z)>>3;
SRLEC8(pp, ip, op, e);
pp = ip++;
}
#endif
}
// Tail: byte-by-byte scan of the remaining input.
for(;ip < in+inlen; ip++)
if(*ip != ip[1]) {
c = *ip;
SRLEC8(pp,ip, op, e);
pp = ip;
}
// Flush whatever lies between pp and the end position.
c = *ip;
SRLEC8(pp, ip, op, e);
return op - out;
}
#endif
// Escape RLE with automatic escape selection.
// Output layout: a single byte when the whole input is one repeated value; otherwise the chosen
// escape byte followed by the _srlec8 stream; or a verbatim copy of the input when encoding
// would not be smaller. Returns the number of bytes written (0 for empty input).
unsigned srlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out) {
  unsigned freq[256] = {0};
  unsigned rarest_cnt = 0xffffffffu, rarest = 0, sym, top;
  size_t packed;

  if(inlen < 1)
    return 0;

  top = hist(in, inlen, freq);
  if(freq[top-1] == inlen) {          // every input byte has the same value
    out[0] = in[0];
    return 1;
  }

  // Least frequent byte value becomes the escape; on ties the highest value wins.
  for(sym = 0; sym < 256; sym++) {
    if(freq[sym] > rarest_cnt)
      continue;
    rarest_cnt = freq[sym];
    rarest     = sym;
  }

  out[0] = rarest;
  packed = _srlec8(in, inlen, out + 1, rarest) + 1;
  if(packed >= inlen) {               // no gain: store the input unchanged
    memcpy(out, in, inlen);
    return inlen;
  }
  return packed;
}
//------------------------------------------------- TurboRLE ------------------------------------------
// Pair of (occurrence count, byte value); trlec sorts an array of these by count.
struct u { unsigned c,i; };
// Append one byte to the output.
#define PUTC(op, x) *op++ = x
// Emit the run of symbol `c` covering (pp, ip] (_i = ip - pp bytes). Runs of at least TMIN bytes
// are first written as vbzput(_i - TMIN) using the marker map `rmap` and marker count `m`; when
// that encoding plus the symbol byte is shorter than the run, the symbol is appended and control
// leaves through _goto_. Otherwise the output pointer is rolled back and the run is written
// literally. `c`, `m` and `rmap` are taken from the enclosing scope.
#define TRLEC(pp, ip, op, _goto_) do {\
unsigned _i = ip - pp;\
if(_i >= TMIN) {\
unsigned char *q = op; \
vbzput(op, _i-TMIN, m, rmap); \
if((op-q) + 1 < _i) { *op++ = c; _goto_; } op=q;\
} while(_i--) PUTC(op,c);\
} while(0)
// Variant without the size check and rollback: runs of at least TMIN bytes are always encoded.
// It is not used in the code shown in this file.
#define TRLEC0(pp, ip, op, _goto_) do { unsigned _i = ip - pp;\
if(_i >= TMIN) { vbzput(op, _i-TMIN, m, rmap); *op++ = c; } else while(_i--) PUTC(op,c);\
} while(0)
// TurboRLE encoder: uses byte values that never occur in the input as run markers.
// Returns the number of bytes written to `out` (0 for empty input).
// Output layouts produced below:
//   - 1 byte                    : the whole input is one repeated value
//   - 0, escape, _srlec8 stream : no unused byte value exists (escape-RLE fallback)
//   - 1, 32-byte bitmap, stream : bitmap bit i set <=> byte value i is a run marker
//   - verbatim copy of the input when the encoded form would not be smaller
// NOTE(review): like _srlec8, the scan loops compare each byte with its successor up to the last
// input byte and read *ip at in+inlen afterwards, so one byte past the input is read - confirm
// that callers provide readable slack.
unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out) {
int m,i;
unsigned b[256] = {0}, rmap[256],a;
struct u u[256],*v; // sort
unsigned char *op;
const unsigned char *ip,*pp;
uint8_t c;
if(inlen < 1) return 0;
a = hist(in,inlen,b);
// The highest occurring byte value accounts for the whole input: single repeated byte.
if(b[a-1] == inlen) {
*out = *in;
return 1;
}
// Move the counts into u[] (tagged with their byte value) and clear b[] for reuse as a flag array.
for(i = 0; i < 256; i++) u[i].c = b[i], u[i].i = i,b[i]=0;
// Insertion sort of u[] by ascending count.
for(v = u + 1; v < u + 256; ++v)
if(v->c < v[-1].c) {
struct u *w, tmp = *v;
for(w = v; w > u && tmp.c < w[-1].c; --w) *w = w[-1];
*w = tmp;
}
// Flag every byte value with a zero count; m ends up as (number of unused values - 1).
for(m = -1,i = 0; i < 256 && !u[i].c; i++)
b[u[i].i]++, ++m;
op = out;
if(m < 0) { // no unused bytes found
// Fall back to escape RLE with the least frequent byte (u[0] after sorting) as escape.
size_t l;
*op++ = 0;
*op++ = u[0].i;
l = _srlec8(in, inlen, op, u[0].i)+2;
if(l < inlen) return l;
memcpy(out, in, inlen);
return inlen;
}
// Header: flag byte 1 followed by a 256-bit bitmap of the marker bytes; rmap[k] is the k-th
// marker in ascending byte order and m is the index of the last one.
*op++ = 1;
memset(op, 0, 32);
for(m = -1,i = 0; i < 256; i++)
if(b[i]) {
op[i>>3] |= 1<<(i&7);
rmap[++m] = i;
}
op += 32;
ip = in; pp=in-1;
// Fast path: locate run ends 8 bytes at a time by XORing the word at ip with the word at ip+1.
if(inlen > SRLE8)
while(ip < in+(inlen-1-SRLE8)) {
unsigned long long z;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#if SRLE8 >= 32
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#endif
__builtin_prefetch(ip +256, 0);
continue;
a: c = *ip;
ip += ctz64(z)>>3;
// TRLEC jumps to the label when the run was encoded; falling through means it was written
// literally. Both paths continue at the label.
TRLEC(pp, ip, op, goto laba);
laba:pp = ip++;
}
// Tail: byte-by-byte scan of the remaining input.
for(;ip < in+inlen; ip++) {
if(*ip != *(ip+1)) {
c = *ip;
TRLEC(pp, ip, op, goto labb);
labb:pp = ip;
}
}
// Flush whatever lies between pp and the end position.
c = *ip;
TRLEC(pp,ip, op, goto labc);
labc:
// Keep the encoded form only when it is smaller than the input.
if(op - out < inlen)
return op - out;
memcpy(out, in, inlen);
return inlen;
}
// Instantiate the templated section of this file (the #else branch below) for 16-, 32- and
// 64-bit symbols by re-including the file with USIZE set. SRLE8 is undefined first so that the
// `#if !SRLE8` guard there also generates the raw _srlecN encoders.
#undef USIZE
#undef SRLE8
#define USIZE 16
#include "trlec.c"
#undef USIZE
#define USIZE 32
#include "trlec.c"
#undef USIZE
#define USIZE 64
#include "trlec.c"
#undef USIZE
#else
// Templated section: compiled once per USIZE. TEMPLATE2/TEMPLATE3 are not defined in this file;
// presumably they are token-pasting helpers from conf.h, making uint_t expand to uintN_t.
#define uint_t TEMPLATE3(uint, USIZE, _t)
// Emit the run of symbol `c` covering (pp, ip] (i = ip - pp symbols), same scheme as the 8-bit
// coder but with symbols of sizeof(uint_t) bytes:
//  - more than 3 symbols: escape, variable-byte (i - 3), symbol
//  - c equals the escape : every occurrence is written as escape followed by a zero count
//  - otherwise           : the symbols are copied literally
// `c` is taken from the enclosing scope; symbols are stored through uint_t pointer casts on `op`.
#define SRLEC(pp, ip, op, e) do {\
unsigned i = ip - pp;\
if(i > 3) { *(uint_t *)op = e; op+=sizeof(uint_t); i -= 3; vbput32(op, i); *(uint_t *)op = c; op+=sizeof(uint_t); }\
else if(c == e) {\
while(i--) { *(uint_t *)op = e; op+=sizeof(uint_t); vbput32(op, 0); }\
} else while(i--) { *(uint_t *)op = c; op+=sizeof(uint_t); }\
} while(0)
#if !SRLE8
// Escape-based RLE encoder for USIZE-bit symbols.
// cin/inlen: input bytes; out: output buffer; e: escape symbol. Returns the number of bytes written.
// The input is processed as n = inlen / sizeof(uint_t) symbols; for USIZE > 8 the remaining
// bytes are appended unencoded.
// NOTE(review): the scan compares each symbol with its successor up to the last one and reads
// *ip at in+n afterwards, so one symbol past the n counted symbols is read - confirm that
// callers provide readable slack.
unsigned TEMPLATE2(_srlec, USIZE)(const unsigned char *__restrict cin, unsigned inlen, unsigned char *__restrict out, uint_t e) {
unsigned char *op = out;
uint_t *in = (uint_t *)cin, *pp = in-1, *ip=in,c;
unsigned n = inlen/sizeof(uint_t);
unsigned char *p;
// Fast path: test four symbols per iteration while at least five remain.
if(n > 4)
for(; ip < in+(n-1-4);) {
#if 0
if(* ip == ip[1])
if(*++ip == ip[1])
if(*++ip == ip[1])
if(*++ip == ip[1]) {
ip++; __builtin_prefetch(ip +256, 0);
continue;
}
#else
if(*ip != ip[1]) goto a; ++ip;
if(*ip != ip[1]) goto a; ++ip;
if(*ip != ip[1]) goto a; ++ip;
if(*ip != ip[1]) goto a; ++ip; __builtin_prefetch(ip +256, 0);
continue;
a:;
#endif
// ip is on the last symbol of the current run.
c = *ip;
SRLEC(pp,ip, op, e);
pp = ip++;
}
// Tail: symbol-by-symbol scan of the remaining input.
for(;ip < in+n; ip++)
if(*ip != ip[1]) {
c = *ip;
SRLEC(pp,ip, op, e);
pp = ip;
}
// Flush whatever lies between pp and the end position.
c = *ip;
SRLEC(pp, ip, op, e);
#if USIZE > 8
// Copy the input bytes that do not form a whole symbol.
p = (unsigned char *)ip;
while(p < cin+inlen)
*op++ = *p++;
#endif
return op - out;
}
#endif
#undef SRLEC
// Size-checked front end of the raw USIZE-bit encoder: the encoded stream is kept only when it
// is smaller than the input, otherwise the input is stored verbatim. Returns the bytes written.
unsigned TEMPLATE2(srlec, USIZE)(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint_t e) {
  const size_t packed = TEMPLATE2(_srlec, USIZE)(in, inlen, out, e);

  if(packed >= inlen) {               // no gain: overwrite with a plain copy
    memcpy(out, in, inlen);
    return inlen;
  }
  return packed;
}
#endif

259
ext/trled.c Normal file
View File

@ -0,0 +1,259 @@
/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
#ifndef USIZE
#include <string.h>
#ifdef __SSE__
#include <emmintrin.h>
#endif
#include "trle.h"
#include "trle_.h"
//------------------------------------- RLE with Escape char ------------------------------------------------------------------
//#define MEMSAFE
// SRLE8 is the number of output bytes the fast loop of the 8-bit decoder handles per iteration.
// Defining USIZE and re-including this file expands the templated section at the bottom of the
// file, which defines rmemset for the current USIZE and the srled8 wrapper.
#define SRLE8 32 // 16//
#define USIZE 8
#include "trled.c"
#if SRLE8
// Escape-based RLE decoder for bytes.
// in: encoded stream; out/outlen: destination and exact number of bytes to produce; e: escape
// byte. Returns the number of input bytes consumed.
// Stream grammar as read below: a byte other than `e` is a literal; `e` is followed by a
// variable-byte count n, where n == 0 stands for a literal `e` and n > 0 is followed by the
// symbol to repeat n + TMIN times.
// NOTE(review): the fast loop stores whole 16-byte vectors and the non-MEMSAFE rmemset variants
// store in multi-byte steps, so bytes beyond the logical end of a run (and possibly beyond
// out+outlen) can be written; there is also no bound on `in`. Presumably callers provide padded
// buffers and trusted input - confirm.
unsigned _srled8(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen, unsigned char e) {
const uint8_t *ip = in;
uint8_t *op = out, c;
uint32_t i;
#ifdef __SSE__
__m128i ev = _mm_set1_epi8(e);
#endif
// Fast path while at least SRLE8 output bytes remain.
if(outlen >= SRLE8)
while(op < out+(outlen-SRLE8)) {
#ifdef __SSE__ // TODO: test _mm_cmpestrm/_mm_cmpestri on sse4
// Copy 16 bytes speculatively, then look for the escape among them.
uint32_t mask;
__m128i u,v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, ev)); if(mask) goto a; op += 16; ip += 16;
#if SRLE8 >= 32
u = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, u); mask = _mm_movemask_epi8(_mm_cmpeq_epi8(u, ev)); if(mask) goto a; op += 16; ip += 16;
#endif
__builtin_prefetch(ip+512, 0);
continue;
// Escape found: keep the literals in front of it and step over the escape byte.
a: i = ctz32(mask);
op += i; ip += i+1;
{
#else
if(likely((c = *(uint8_t *)ip) != e)) {
ip++;
*op++ = c;
} else {
#endif
// Shared by both variants: decode what follows an escape byte.
vbget32(ip, i);
if(likely(i)) {
uint8_t c = *ip++;
i += TMIN;
rmemset(op, c, i);
} else
*op++ = e;
}
}
// Exact byte-wise fill used for the tail so that nothing beyond the run is written.
#define rmemset8(_op_, _c_, _i_) while(_i_--) *_op_++ = _c_
// Tail: byte-by-byte decoding of the remaining output.
while(op < out+outlen)
if(likely((c = *ip) != e)) {
ip++;
*op++ = c;
} else {
int i;
ip++;
vbget32(ip, i);
if(likely(i)) {
c = *ip++;
i += TMIN;
rmemset8(op, c, i);
} else
*op++ = e;
}
return ip - in;
}
#endif
// Raw decoder for streams with an embedded escape byte: in[0] is the escape, the RLE payload
// starts at in[1]. Returns whatever _srled8 reports for the payload (the escape byte itself is
// not included in that count).
unsigned _srled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen) {
  const unsigned char *payload = in + 1;

  return _srled8(payload, out, outlen, in[0]);
}
// Decoder for streams with an embedded escape byte, covering the special layouts as well:
//   inlen == outlen : data was stored verbatim and is copied
//   inlen == 1      : a single byte repeated outlen times
//   otherwise       : in[0] is the escape byte and the RLE payload starts at in[1]
// Returns inlen for the two special layouts and the _srled8 result for the RLE layout.
unsigned srled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen) {
  if(inlen == outlen) {
    memcpy(out, in, outlen);
    return inlen;
  }
  if(inlen == 1) {
    memset(out, in[0], outlen);
    return inlen;
  }
  return _srled8(in + 1, out, outlen, in[0]);
}
//------------------------------------- TurboRLE ------------------------------------------
// TurboRLE decoder.
// in: encoded stream; out/outlen: destination and exact number of bytes to produce.
// The first byte selects the layout: 0 means an escape-RLE stream (escape byte, then payload),
// anything else is followed by a 32-byte bitmap of marker bytes and the TurboRLE payload.
// Returns the number of bytes consumed, counted from the byte after the layout flag in the
// bitmap layout, and the _srled8 result plus 2 in the escape layout.
// NOTE(review): rmemset in the fast loop may store beyond the end of the run and `in` is not
// bounded; presumably callers provide padded buffers and trusted input - confirm.
unsigned _trled(const unsigned char *__restrict in, unsigned char *__restrict out, unsigned outlen) {
uint8_t b[256] = {0},*op = out;
const uint8_t *ip;
int m = -1, i, c;
if(outlen < 1)
return 0;
// Layout flag 0: in (already advanced past the flag) points at the escape byte.
if(!*in++)
return _srled8(in+1, out, outlen, *in)+2;
// Expand the bitmap: b[v] becomes (marker index + 1) for marker bytes and stays 0 for literals;
// m ends up as the index of the last marker. ip is left pointing just past the bitmap.
for(ip = in; ip < in+32; ip++)
for(i = 0; i < 8; ++i)
if(((*ip) >> i) & 1)
b[(ip-in)<<3 | i] = ++m+1;
// Fast path: copy up to 8 literals per iteration while at least 32 output bytes remain.
if(outlen >= 32)
while(op < out+(outlen-32)) {
if(b[*ip]) goto a; *op++ = *ip++;
if(b[*ip]) goto a; *op++ = *ip++;
if(b[*ip]) goto a; *op++ = *ip++;
if(b[*ip]) goto a; *op++ = *ip++;
if(b[*ip]) goto a; *op++ = *ip++;
if(b[*ip]) goto a; *op++ = *ip++;
if(b[*ip]) goto a; *op++ = *ip++;
if(b[*ip]) goto a; *op++ = *ip++;
__builtin_prefetch(ip+256, 0);
continue;
a:
// Marker byte: its index (c-1) is the run length minus 3, or - for the last marker - announces
// a variable-byte extension; the repeated symbol follows.
c = b[*ip++];
vbzget(ip, i, m, c-1);
c = *ip++;
i += 3;
rmemset(op,c,i);
}
// Tail: same decoding with the exact byte-wise fill.
while(op < out+outlen) {
if(likely(!(c = b[*ip])))
*op++ = *ip++;
else {
ip++;
vbzget(ip, i, m, c-1);
c = *ip++;
i += 3;
rmemset8(op,c,i);
}
}
return ip - in;
}
// TurboRLE decoder covering the special layouts as well:
//   inlen == outlen : data was stored verbatim and is copied
//   inlen == 1      : a single byte repeated outlen times
//   otherwise       : a TurboRLE stream, handed to _trled
// Returns inlen for the two special layouts and the _trled result otherwise.
unsigned trled(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen) {
  if(inlen == outlen) {
    memcpy(out, in, outlen);
    return inlen;
  }
  if(inlen == 1) {
    memset(out, in[0], outlen);
    return inlen;
  }
  return _trled(in, out, outlen);
}
// Instantiate the templated section of this file (the #else branch below) for 16-, 32- and
// 64-bit symbols by re-including the file with USIZE set. rmemset is undefined before each pass
// because the templated section defines it again; SRLE8 is undefined so that the `#if !SRLE8`
// guard there also generates the raw _srledN decoders.
// (runcpy is not defined anywhere in the code shown here; the #undefs of it are harmless.)
#undef USIZE
#undef rmemset
#undef SRLE8
#define USIZE 16
#include "trled.c"
#undef rmemset
#undef USIZE
#undef runcpy
#define USIZE 32
#include "trled.c"
#undef rmemset
#undef USIZE
#undef runcpy
#define USIZE 64
#include "trled.c"
#undef rmemset
#undef USIZE
#else
// rmemset(_op_, _c_, _i_): write _i_ copies of symbol _c_ at _op_ and advance _op_ by _i_.
// Three variants are selected at compile time; USIZE is expanded where the macro is used.
#ifdef MEMSAFE
// Exact variant: writes precisely _i_ symbols.
#define rmemset(_op_, _c_, _i_) while(_i_--) *_op_++ = _c_
#elif defined(__SSE__) && USIZE < 64
// SSE variant: stores two 16-byte vectors per iteration until the target is reached, so up to
// 31 bytes beyond the end of the run can be written.
#define rmemset(_op_, _c_, _i_) do { \
__m128i *_up = (__m128i *)_op_, cv = TEMPLATE2(_mm_set1_epi, USIZE)(_c_);\
_op_ += _i_;\
do { _mm_storeu_si128( _up, cv); _mm_storeu_si128(_up+1, cv); _up+=2; } while(_up < (__m128i *)_op_);\
} while(0)
#else
// Scalar variant: stores two symbols per iteration, so one symbol beyond the end of the run can
// be written. The _csetN helpers replicate the symbol across a 64-bit word (_cc).
// NOTE(review): _cc is computed but the stores below write _c_, not _cc - confirm whether the
// replicated word was meant to be used.
#define _cset64(_cc,_c_) _cc = _c_
#define _cset32(_cc,_c_) _cc = _c_; _cc = _cc<<32|_cc
#define _cset16(_cc,_c_) _cc = _c_; _cc = _cc<<48|_cc<<32|_cc<<16|_cc
#define _cset8( _cc,_c_) _cc = (uint32_t)_c_<<24 | (uint32_t)_c_<<16 | (uint32_t)_c_<<8 | (uint32_t)_c_; _cc = _cc<<32|_cc
#define rmemset(_op_, _c_, _i_) do { uint64_t _cc; uint8_t *_up = (uint8_t *)_op_; _op_ +=_i_;\
TEMPLATE2(_cset, USIZE)(_cc,_c_);\
do {\
TEMPLATE2(ctou, USIZE)(_up) = _c_; _up += USIZE/8;\
TEMPLATE2(ctou, USIZE)(_up) = _c_; _up += USIZE/8;\
} while(_up < (uint8_t *)_op_);\
} while(0)
#endif
// Symbol type of the current instantiation (presumably uintN_t via the TEMPLATE3 helper from conf.h).
#define uint_t TEMPLATE3(uint, USIZE, _t)
#if !SRLE8
// Escape-based RLE decoder for USIZE-bit symbols.
// in: encoded stream; cout/outlen: destination and exact number of bytes to produce; e: escape
// symbol. Returns the number of input bytes consumed.
// outlen / sizeof(uint_t) symbols are decoded; for USIZE > 8 the remaining output bytes are
// copied unchanged from the stream.
// Stream grammar as read below: a symbol other than `e` is a literal; `e` is followed by a
// variable-byte count n, where n == 0 stands for a literal `e` and n > 0 is followed by the
// symbol to repeat n + 3 times.
// NOTE(review): symbols are read through uint_t pointer casts on `ip`, rmemset may store beyond
// the end of a run and `in` is not bounded; presumably callers provide padded buffers and
// trusted input - confirm.
unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned char *__restrict cout, unsigned outlen, uint_t e) {
uint_t *out = (uint_t *)cout, *op = out, c;
const unsigned char *ip = in;
while(op < out+outlen/sizeof(uint_t)) { __builtin_prefetch(ip +384, 0);
if(likely((c = *(uint_t *)ip) != e)) {
ip += sizeof(uint_t);
*op++ = c;
} else {
int i;
ip += sizeof(uint_t);
vbget32(ip, i);
if(likely(i)) {
c = *(uint_t *)ip;
ip += sizeof(uint_t);
i += 3;
rmemset(op, c, i);
} else
*op++ = e;
}
}
#if USIZE > 8
// Output bytes that do not form a whole symbol were stored unencoded.
{ unsigned char *p = (unsigned char *)op;
while(p < cout+outlen) *p++ = *ip++;
}
#endif
return ip - in;
}
#endif
// Front end of the raw USIZE-bit decoder covering the special layouts:
//   inlen == outlen : data was stored verbatim and is copied
//   inlen == 1      : a single byte repeated outlen times
//   otherwise       : an escape-RLE stream, handed to the raw decoder with escape `e`
// Returns inlen for the two special layouts and the raw decoder's result otherwise.
unsigned TEMPLATE2(srled, USIZE)(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, unsigned outlen, uint_t e) {
  if(inlen == outlen) {
    memcpy(out, in, outlen);
    return inlen;
  }
  if(inlen == 1) {
    memset(out, in[0], outlen);
    return inlen;
  }
  return TEMPLATE2(_srled, USIZE)(in, out, outlen, e);
}
#endif