TurboRLE: decode
This commit is contained in:
24
trled.c
Executable file → Normal file
24
trled.c
Executable file → Normal file
@ -41,6 +41,12 @@
|
|||||||
#endif
|
#endif
|
||||||
#include "sse_neon.h"
|
#include "sse_neon.h"
|
||||||
|
|
||||||
|
#ifdef __ARM_NEON
|
||||||
|
#define PREFETCH(_ip_,_rw_)
|
||||||
|
#else
|
||||||
|
#define PREFETCH(_ip_,_rw_) __builtin_prefetch(_ip_,_rw_)
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "trle.h"
|
#include "trle.h"
|
||||||
#include "trle_.h"
|
#include "trle_.h"
|
||||||
//------------------------------------- RLE 8 with Escape char ------------------------------------------------------------------
|
//------------------------------------- RLE 8 with Escape char ------------------------------------------------------------------
|
||||||
@ -74,9 +80,9 @@ unsigned _srled8(const unsigned char *__restrict in, unsigned char *__restrict o
|
|||||||
v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, ev)); if(mask) goto a; ip += 16; op += 16;
|
v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, ev)); if(mask) goto a; ip += 16; op += 16;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
__builtin_prefetch(ip+512, 0);
|
PREFETCH(ip+512, 0);
|
||||||
continue;
|
continue;
|
||||||
a: r = ctz32(mask); ip += r+1; __builtin_prefetch(ip+512, 0);
|
a: r = ctz32(mask); ip += r+1; PREFETCH(ip+512, 0);
|
||||||
op += r;
|
op += r;
|
||||||
#else
|
#else
|
||||||
if(likely((c = *ip++) != e)) { *op++ = c; continue; }
|
if(likely((c = *ip++) != e)) { *op++ = c; continue; }
|
||||||
@ -198,7 +204,7 @@ unsigned _trled(const unsigned char *__restrict in, unsigned char *__restrict ou
|
|||||||
vlzget(ip, i, m, c-1);
|
vlzget(ip, i, m, c-1);
|
||||||
c = *ip++;
|
c = *ip++;
|
||||||
i += TMIN;
|
i += TMIN;
|
||||||
rmemset(op,c,i); __builtin_prefetch(ip+512, 0);
|
rmemset(op,c,i); PREFETCH(ip+512, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
while(op < out+outlen) {
|
while(op < out+outlen) {
|
||||||
@ -294,22 +300,22 @@ unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned c
|
|||||||
uint32_t mask;
|
uint32_t mask;
|
||||||
__m256i v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(TEMPLATE2(_mm256_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 32; op += 256/USIZE;
|
__m256i v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(TEMPLATE2(_mm256_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 32; op += 256/USIZE;
|
||||||
v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(TEMPLATE2(_mm256_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 32; op += 256/USIZE;
|
v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(TEMPLATE2(_mm256_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 32; op += 256/USIZE;
|
||||||
__builtin_prefetch(ip+512, 0);
|
PREFETCH(ip+512, 0);
|
||||||
continue;
|
continue;
|
||||||
a: r = ctz32(mask)/(USIZE/8);
|
a: r = ctz32(mask)/(USIZE/8);
|
||||||
op += r;
|
op += r;
|
||||||
ip += (r+1)*sizeof(uint_t); __builtin_prefetch(ip+512, 0);
|
ip += (r+1)*sizeof(uint_t); PREFETCH(ip+512, 0);
|
||||||
#elif (__SSE__ != 0 /*|| __ARM_NEON != 0*/) && USIZE != 64
|
#elif (__SSE__ != 0 /*|| __ARM_NEON != 0*/) && USIZE != 64
|
||||||
uint32_t mask;
|
uint32_t mask;
|
||||||
__m128i v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(TEMPLATE2(_mm_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 16; op += 128/USIZE;
|
__m128i v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(TEMPLATE2(_mm_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 16; op += 128/USIZE;
|
||||||
v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(TEMPLATE2(_mm_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 16; op += 128/USIZE;
|
v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(TEMPLATE2(_mm_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 16; op += 128/USIZE;
|
||||||
v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(TEMPLATE2(_mm_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 16; op += 128/USIZE;
|
v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(TEMPLATE2(_mm_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 16; op += 128/USIZE;
|
||||||
v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(TEMPLATE2(_mm_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 16; op += 128/USIZE;
|
v = _mm_loadu_si128((__m128i*)ip); _mm_storeu_si128((__m128i *)op, v); mask = _mm_movemask_epi8(TEMPLATE2(_mm_cmpeq_epi,USIZE)(v, ev)); if(mask) goto a; ip += 16; op += 128/USIZE;
|
||||||
__builtin_prefetch(ip+512, 0);
|
PREFETCH(ip+512, 0);
|
||||||
continue;
|
continue;
|
||||||
a: r = ctz32(mask)/(USIZE/8);
|
a: r = ctz32(mask)/(USIZE/8);
|
||||||
op += r;
|
op += r;
|
||||||
ip += (r+1)*sizeof(uint_t); __builtin_prefetch(ip+512, 0);
|
ip += (r+1)*sizeof(uint_t); PREFETCH(ip+512, 0);
|
||||||
#else
|
#else
|
||||||
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
||||||
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
||||||
@ -318,9 +324,9 @@ unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned c
|
|||||||
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
||||||
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
||||||
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
|
||||||
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c; __builtin_prefetch(ip +512, 0);
|
if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
|
||||||
continue;
|
continue;
|
||||||
a: ip += sizeof(uint_t); __builtin_prefetch(ip +512, 0);
|
a: ip += sizeof(uint_t); PREFETCH(ip +512, 0);
|
||||||
#endif
|
#endif
|
||||||
vlget32(ip, r);
|
vlget32(ip, r);
|
||||||
if(likely(r) >= 3) {
|
if(likely(r) >= 3) {
|
||||||
|
Reference in New Issue
Block a user