/*
 * Files
 * TurboPFor-Integer-Compression/ext/trlec.c
 * 2018-03-02 17:11:31 +01:00
 *
 * 350 lines
 * 9.8 KiB
 * C
 */

/**
Copyright (C) powturbo 2015-2018
GPL v2 License
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
TurboRLE - "Most efficient and fastest Run Length Encoding"
**/
#ifndef USIZE
#include <string.h>
#ifdef __SSE__
#include <emmintrin.h>
#endif
#include "trle_.h"
#include "trle.h"
//------------------------------------- Histogram ---------------------------------------------------------
/**
 * Byte histogram.
 *
 * Adds the number of occurrences of every byte value found in in[0..inlen)
 * to cc[0..255]. cc is accumulated into, not cleared, so the caller must
 * zero it beforehand when plain counts are wanted.
 *
 * Eight separate counter tables are filled in the 16-bytes-per-iteration
 * main loop and summed at the end, so that consecutive increments do not
 * target the same table.
 *
 * Only bytes inside in[0..inlen) are read: words are loaded with memcpy
 * (no alignment or aliasing assumptions) and there is no look-ahead load
 * past the last complete 16-byte block.
 *
 * @param in     input bytes
 * @param inlen  number of input bytes (may be 0)
 * @param cc     256 counters, updated in place
 * @return       (highest index i with cc[i] != 0) + 1, but never less than 1
 */
static inline unsigned hist(const unsigned char *__restrict in, unsigned inlen, unsigned *cc) {
  unsigned c0[256+8]={0},c1[256+8]={0},c2[256+8]={0},c3[256+8]={0},
           c4[256+8]={0},c5[256+8]={0},c6[256+8]={0},c7[256+8]={0};
  const unsigned char *ip  = in;
  const unsigned char *blk = in + (inlen & ~(16u-1));   /* end of the last complete 16-byte block */
  const unsigned char *end = in + inlen;
  unsigned a;
  int i;

  for(; ip != blk; ip += 16) {
    unsigned c, d;                                      /* 32-bit words, as the 4-byte stride implies */

    memcpy(&c, ip,    sizeof c);
    memcpy(&d, ip+4,  sizeof d);
    c0[(unsigned char) c     ]++;
    c1[(unsigned char) d     ]++;
    c2[(unsigned char)(c>>8) ]++; c >>= 16;
    c3[(unsigned char)(d>>8) ]++; d >>= 16;
    c4[(unsigned char) c     ]++;
    c5[(unsigned char) d     ]++;
    c6[(unsigned char)(c>>8) ]++;
    c7[(unsigned char)(d>>8) ]++;

    memcpy(&c, ip+8,  sizeof c);
    memcpy(&d, ip+12, sizeof d);
    c0[(unsigned char) c     ]++;
    c1[(unsigned char) d     ]++;
    c2[(unsigned char)(c>>8) ]++; c >>= 16;
    c3[(unsigned char)(d>>8) ]++; d >>= 16;
    c4[(unsigned char) c     ]++;
    c5[(unsigned char) d     ]++;
    c6[(unsigned char)(c>>8) ]++;
    c7[(unsigned char)(d>>8) ]++;
  }

  /* remaining 0..15 bytes */
  while(ip < end)
    c0[*ip++]++;

  for(i = 0; i < 256; i++)
    cc[i] += c0[i]+c1[i]+c2[i]+c3[i]+c4[i]+c5[i]+c6[i]+c7[i];

  /* strip trailing zero counters, keeping at least one entry */
  a = 256;
  while(a > 1 && !cc[a-1])
    a--;
  return a;
}
//------------------------------------- RLE with Escape char ------------------------------------------------------------------
/* SRLE8 is the look-ahead margin (in bytes) that the fast loops of _srlec8()
 * and trlec() keep from the end of the input; a value >= 32 also enables the
 * third and fourth 8-byte compare step in those loops.
 *
 * With USIZE defined, the #include below takes the `#else` branch of the
 * `#ifndef USIZE` at the top of this file, i.e. the width-generic section
 * (presumably "trlec.c" is this very file). Because SRLE8 is non-zero at this
 * point, the `#if !SRLE8` guard in that section skips the generic _srlec
 * encoder, so for USIZE 8 only the srlec8() wrapper is generated. */
#define SRLE8 32
#define USIZE 8
#include "trlec.c"
#if SRLE8
/*
 * SRLEC8(pp, ip, op, e): write one run of the byte held in the variable `c`.
 *
 * `c` is not a macro parameter; it must be in scope at the expansion site.
 * The run length is i = ip - pp. What is written at op (op is advanced):
 *   i > 3           : e, vbput32(i - 3), c
 *   i <= 3, c == e  : i times the pair { e, vbput32(0) }
 *   otherwise       : c repeated i times
 * So after an escape byte e, a coded value of 0 stands for a literal e and a
 * value >= 1 for a run of i = value + 3 copies of the byte that follows.
 *
 * vbput32 is defined outside this file (presumably a variable-length integer
 * writer that advances op -- confirm in its header).
 */
#define SRLEC8(pp, ip, op, e) do {\
unsigned i = ip - pp;\
if(i > 3) { *op++ = e; i -= 3; vbput32(op, i); *op++ = c; }\
else if(c == e) {\
while(i--) { *op++ = e; vbput32(op, 0); }\
} else while(i--) *op++ = c;\
} while(0)
/**
 * Run-length encode in[0..inlen) byte-wise with escape byte e.
 *
 * The input is split into maximal runs of equal bytes and each run is written
 * with SRLEC8 (runs longer than 3 become e + coded length + byte, shorter runs
 * are copied, a literal e is written as e + coded 0).
 *
 * The fast loop compares the input with itself shifted by one byte, a machine
 * word at a time, and only runs while at least SRLE8+1 bytes remain; the rest
 * is handled byte by byte.
 *
 * Only in[0..inlen) is read: the last input byte always closes the current
 * run, so no byte behind the input is compared or encoded.
 *
 * No bounds check is done on out; the caller must provide enough room (a
 * literal escape byte takes more than one output byte).
 *
 * @param in     input bytes
 * @param inlen  number of input bytes
 * @param out    output buffer
 * @param e      escape byte
 * @return       number of bytes written to out
 */
unsigned _srlec8(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint8_t e) {
  const uint8_t *ip = in, *pp = in - 1;      /* pp: last byte of the run written before the current one */
  const uint8_t *const ie = in + inlen;
  uint8_t *op = out,c;

  if(inlen > SRLE8)
    while(ip < in+(inlen-1-SRLE8)) {
#if 0 //def __SSE__ // SSE slower than scalar
      __m128i cv = _mm_set1_epi8(*ip);
      unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(ip+1)), cv)); if(mask != 0xffffu) goto a; ip += 16;
      mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128((const __m128i*)(ip+1)), cv)); if(mask != 0xffffu) goto a; ip += 16;
      continue;
      a: c = *ip;
      ip += __builtin_ctz((unsigned short)(~mask));
      SRLEC8(pp, ip, op, e);
      pp = ip++;
#elif __WORDSIZE == 64
      {unsigned long long z;
        /* z != 0 as soon as two neighbouring bytes differ inside the 8-byte window */
        if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
        if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#if SRLE8 >= 32
        if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
        if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#endif
        __builtin_prefetch(ip +256, 0);
        continue;
        a: c = *ip;
        ip += ctz64(z)>>3;                   /* advance to the last byte of the run */
        SRLEC8(pp, ip, op, e);
        pp = ip++;
      }
#else
      { unsigned z;
        if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
        if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
#if SRLE8 >= 16
        if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
        if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
#endif
        __builtin_prefetch(ip +256, 0);
        continue;
        a: c = *ip;
        ip += ctz32(z)>>3;                   /* advance to the last byte of the run */
        SRLEC8(pp, ip, op, e);
        pp = ip++;
      }
#endif
    }

  /* Tail: a run ends where the next byte differs or where the input ends. */
  for(; ip < ie; ip++)
    if(ip+1 == ie || *ip != ip[1]) {
      c = *ip;
      SRLEC8(pp, ip, op, e);
      pp = ip;
    }
  return op - out;
}
#endif
/**
 * Escape-RLE front end that chooses the escape byte itself.
 *
 * Output layout:
 *   - inlen == 0                      : nothing is written, 0 is returned
 *   - every input byte is the same    : 1 byte (that byte), 1 is returned
 *   - otherwise                       : escape byte followed by the _srlec8()
 *                                       stream, if that is shorter than inlen
 *   - otherwise                       : a verbatim copy of the input
 *
 * The escape byte is the byte value with the smallest count in the input
 * (the highest value among ties).
 *
 * @param in     input bytes
 * @param inlen  number of input bytes
 * @param out    output buffer
 * @return       number of bytes written to out (inlen means "stored verbatim")
 */
unsigned srlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out) {
  unsigned freq[256] = {0};
  unsigned least = 0xffffffffu, esc = 0, sym, top;
  size_t enclen;

  if(!inlen)
    return 0;

  top = hist(in, inlen, freq);
  if(freq[top-1] == inlen) {                 /* one single byte value makes up the whole input */
    out[0] = in[0];
    return 1;
  }

  for(sym = 0; sym < 256; sym++) {
    if(freq[sym] > least)
      continue;
    least = freq[sym];
    esc   = sym;
  }

  out[0] = esc;
  enclen = _srlec8(in, inlen, out+1, esc) + 1;
  if(enclen >= inlen) {                      /* no gain: store the input as is */
    memcpy(out, in, inlen);
    return inlen;
  }
  return enclen;
}
//------------------------------------------------- TurboRLE ------------------------------------------
/* Sort record used by trlec(): c = number of occurrences, i = byte value. */
struct u { unsigned c,i; };
/* PUTC(op, x): store x at op and advance op by one. */
#define PUTC(op, x) *op++ = x
/*
 * TRLEC(pp, ip, op, _goto_): write one run of the byte held in `c`.
 *
 * Uses `c`, `m` and `rmap` from the scope of the expansion site. The run
 * length is _i = ip - pp.
 *   - _i >= TMIN: the length _i - TMIN is written with vbzput(op, ..., m, rmap).
 *     If that code plus the run byte is shorter than the run itself
 *     ((op-q) + 1 < _i), c is appended and `_goto_` is executed; the callers in
 *     this file pass a goto to a label placed right behind the expansion.
 *     Otherwise op is reset to its value before vbzput (q).
 *   - In every case where `_goto_` was not executed, c is written _i times.
 *
 * TMIN and vbzput are defined outside this file.
 */
#define TRLEC(pp, ip, op, _goto_) do {\
unsigned _i = ip - pp;\
if(_i >= TMIN) {\
unsigned char *q = op; \
vbzput(op, _i-TMIN, m, rmap); \
if((op-q) + 1 < _i) { *op++ = c; _goto_; } op=q;\
} while(_i--) PUTC(op,c);\
} while(0)
/*
 * TRLEC0: like TRLEC but without the size comparison: every run of at least
 * TMIN bytes is written as vbzput code + c, shorter runs are copied.
 * `_goto_` is accepted but not used.
 */
#define TRLEC0(pp, ip, op, _goto_) do { unsigned _i = ip - pp;\
if(_i >= TMIN) { vbzput(op, _i-TMIN, m, rmap); *op++ = c; } else while(_i--) PUTC(op,c);\
} while(0)
/**
 * TurboRLE encoder: run-length encoding that uses byte values which do not
 * occur in the input as run markers.
 *
 * Output layout:
 *   - inlen == 0                   : nothing is written, 0 is returned
 *   - every input byte is the same : 1 byte (that byte), 1 is returned
 *   - no unused byte value exists  : 0, escape byte (the least frequent byte
 *                                    value), then the _srlec8() stream
 *   - otherwise                    : 1, a 32-byte bitmap (bit i set <=> byte
 *                                    value i does not occur in the input),
 *                                    then the runs written with TRLEC
 *   - if the result would not be shorter than inlen, the input is stored
 *     verbatim instead and inlen is returned
 *
 * Only in[0..inlen) is read: the last input byte always closes the current
 * run, so no byte behind the input is compared or encoded.
 *
 * @param in     input bytes
 * @param inlen  number of input bytes
 * @param out    output buffer (no bounds check is done here)
 * @return       number of bytes written to out
 */
unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out) {
  int m,i;
  unsigned b[256] = {0}, rmap[256],a;
  struct u u[256],*v;                        /* (count, byte value) records, sorted below */
  unsigned char *op;
  const unsigned char *ip,*pp,*ie;
  uint8_t c;

  if(inlen < 1) return 0;

  a = hist(in,inlen,b);
  if(b[a-1] == inlen) {                      /* one single byte value makes up the whole input */
    *out = *in;
    return 1;
  }

  /* Move the counts into u[] and clear b[], which is reused as "unused" flags. */
  for(i = 0; i < 256; i++) u[i].c = b[i], u[i].i = i,b[i]=0;

  /* Insertion sort, ascending by count. */
  for(v = u + 1; v < u + 256; ++v)
    if(v->c < v[-1].c) {
      struct u *w, tmp = *v;
      for(w = v; w > u && tmp.c < w[-1].c; --w) *w = w[-1];
      *w = tmp;
    }

  /* Flag every byte value with count 0; m ends up as (number of such values) - 1. */
  for(m = -1,i = 0; i < 256 && !u[i].c; i++)
    b[u[i].i]++, ++m;

  op = out;
  if(m < 0) {                                /* no unused bytes found: escape-based RLE */
    size_t l;
    *op++ = 0;
    *op++ = u[0].i;
    l = _srlec8(in, inlen, op, u[0].i)+2;
    if(l < inlen) return l;
    memcpy(out, in, inlen);
    return inlen;
  }

  /* Header: marker 1 + bitmap of the unused byte values; rmap[0..m] lists them in ascending order. */
  *op++ = 1;
  memset(op, 0, 32);
  for(m = -1,i = 0; i < 256; i++)
    if(b[i]) {
      op[i>>3] |= 1<<(i&7);
      rmap[++m] = i;
    }
  op += 32;

  ip = in; pp = in-1; ie = in+inlen;         /* pp: last byte of the run written before the current one */
  if(inlen > SRLE8)
    while(ip < in+(inlen-1-SRLE8)) {
      unsigned long long z;
      /* z != 0 as soon as two neighbouring bytes differ inside the 8-byte window */
      if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
      if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#if SRLE8 >= 32
      if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
      if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
#endif
      __builtin_prefetch(ip +256, 0);
      continue;
      a: c = *ip;
      ip += ctz64(z)>>3;                     /* advance to the last byte of the run */
      TRLEC(pp, ip, op, goto laba);
      laba:pp = ip++;
    }

  /* Tail: a run ends where the next byte differs or where the input ends. */
  for(; ip < ie; ip++) {
    if(ip+1 == ie || *ip != ip[1]) {
      c = *ip;
      TRLEC(pp, ip, op, goto labb);
      labb:pp = ip;
    }
  }

  if(op - out < inlen)
    return op - out;
  memcpy(out, in, inlen);                    /* no gain: store the input as is */
  return inlen;
}
/* Generate the 16-, 32- and 64-bit variants: each #include below re-reads
 * "trlec.c" with USIZE defined, which selects the `#else` branch of the
 * `#ifndef USIZE` at the top of the file (the width-generic section).
 * SRLE8 is undefined first, so the `#if !SRLE8` guard in that section is true
 * and the generic _srlec encoder is generated for these widths. */
#undef USIZE
#undef SRLE8
#define USIZE 16
#include "trlec.c"
#undef USIZE
#define USIZE 32
#include "trlec.c"
#undef USIZE
#define USIZE 64
#include "trlec.c"
#undef USIZE
#else
/* Element type of the current instantiation, built from USIZE with TEMPLATE3
 * (defined outside this file; presumably token pasting to uint<USIZE>_t). */
#define uint_t TEMPLATE3(uint, USIZE, _t)
/*
 * SRLEC(pp, ip, op, e): write one run of the element held in the variable `c`.
 *
 * Same scheme as the byte version, with uint_t elements stored directly at the
 * byte pointer op (no alignment is ensured here). `c` must be in scope at the
 * expansion site. The run length is i = ip - pp, counted in elements:
 *   i > 3           : e, vbput32(i - 3), c
 *   i <= 3, c == e  : i times the pair { e, vbput32(0) }
 *   otherwise       : c repeated i times
 * op is advanced past everything written. vbput32 is defined outside this file.
 */
#define SRLEC(pp, ip, op, e) do {\
unsigned i = ip - pp;\
if(i > 3) { *(uint_t *)op = e; op+=sizeof(uint_t); i -= 3; vbput32(op, i); *(uint_t *)op = c; op+=sizeof(uint_t); }\
else if(c == e) {\
while(i--) { *(uint_t *)op = e; op+=sizeof(uint_t); vbput32(op, 0); }\
} else while(i--) { *(uint_t *)op = c; op+=sizeof(uint_t); }\
} while(0)
#if !SRLE8
/**
 * Run-length encode the input as a sequence of USIZE-bit elements with escape
 * element e.
 *
 * The n = inlen / sizeof(uint_t) complete elements are split into maximal runs
 * of equal elements, each written with SRLEC. For USIZE > 8 the remaining
 * inlen % sizeof(uint_t) bytes are appended unchanged.
 *
 * Only the n complete elements are compared and encoded: the last element
 * always closes the current run, so nothing behind it is read as an element.
 *
 * @param cin    input bytes (accessed as uint_t elements)
 * @param inlen  number of input bytes
 * @param out    output buffer (no bounds check is done here)
 * @param e      escape element
 * @return       number of bytes written to out
 */
unsigned TEMPLATE2(_srlec, USIZE)(const unsigned char *__restrict cin, unsigned inlen, unsigned char *__restrict out, uint_t e) {
  unsigned char *op = out;
  const uint_t *in = (const uint_t *)cin, *pp = in-1, *ip = in;   /* pp: last element of the run written before the current one */
  uint_t c;
  unsigned n = inlen/sizeof(uint_t);
  const unsigned char *p;

  /* Fast loop: four compares per round while more than 5 elements remain. */
  if(n > 4)
    for(; ip < in+(n-1-4);) {
#if 0
      if(* ip == ip[1])
        if(*++ip == ip[1])
          if(*++ip == ip[1])
            if(*++ip == ip[1]) {
              ip++; __builtin_prefetch(ip +256, 0);
              continue;
            }
#else
      if(*ip != ip[1]) goto a; ++ip;
      if(*ip != ip[1]) goto a; ++ip;
      if(*ip != ip[1]) goto a; ++ip;
      if(*ip != ip[1]) goto a; ++ip; __builtin_prefetch(ip +256, 0);
      continue;
      a:;
#endif
      c = *ip;
      SRLEC(pp,ip, op, e);
      pp = ip++;
    }

  /* Tail: a run ends where the next element differs or where the elements end. */
  for(; ip < in+n; ip++)
    if(ip+1 == in+n || *ip != ip[1]) {
      c = *ip;
      SRLEC(pp,ip, op, e);
      pp = ip;
    }

#if USIZE > 8
  /* Bytes that do not fill a whole element are copied as they are. */
  p = (const unsigned char *)ip;
  while(p < cin+inlen)
    *op++ = *p++;
#endif
  return op - out;
}
#endif
#undef SRLEC
/**
 * Escape-RLE for USIZE-bit elements with a caller-supplied escape element.
 *
 * Runs the matching _srlec encoder; when its output is not shorter than the
 * input, the input is stored verbatim instead.
 *
 * @param in     input bytes
 * @param inlen  number of input bytes
 * @param out    output buffer
 * @param e      escape element
 * @return       number of bytes written to out (inlen means "stored verbatim")
 */
unsigned TEMPLATE2(srlec, USIZE)(const unsigned char *__restrict in, unsigned inlen, unsigned char *__restrict out, uint_t e) {
  size_t enclen = TEMPLATE2(_srlec, USIZE)(in, inlen, out, e);

  if(enclen >= inlen) {                      /* no gain: store the input as is */
    memcpy(out, in, inlen);
    return inlen;
  }
  return enclen;
}
#endif