diff --git a/bitpack.c b/bitpack.c
index 74542bb..0aa9641 100644
--- a/bitpack.c
+++ b/bitpack.c
@@ -179,7 +179,8 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
   for(ip = in, in += n; ip < in;) { \
     TEMPLATE3(uint, _usize_, _t) o,x;\
     unsigned iplen = in - ip,b; \
-    if(iplen > _csize_) iplen = _csize_; PREFETCH(ip+512,0);\
+    if(iplen > _csize_) iplen = _csize_;\
+    PREFETCH(ip+512,0);\
     o = TEMPLATE2(bit,_usize_)(ip, iplen, &x); b = TEMPLATE2(bsr,_usize_)(o);\
     *op++ = b; op = TEMPLATE2(bitpacka, _usize_)[b](ip, iplen, op);\
     ip += iplen;\
diff --git a/bitunpack.c b/bitunpack.c
index 9eb2af7..cf3e184 100644
--- a/bitunpack.c
+++ b/bitunpack.c
@@ -156,7 +156,7 @@ size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restri
 #define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
 #define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
 #else
-#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
+#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86)) && !defined(__clang__)
 static inline __m128i _mm_cvtsi64_si128(__int64 a) { return _mm_loadl_epi64((__m128i*)&a); }
 #endif
 static ALIGNED(unsigned char, permv[256][8], 32) = {
diff --git a/bitutil.c b/bitutil.c
index 6486bae..5edca0a 100644
--- a/bitutil.c
+++ b/bitutil.c
@@ -122,7 +122,7 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
     vo1 = _mm_or_si128(vo1, v1);
     vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
     vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-  } start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
+  } start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
 #else
@@ -183,7 +183,7 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 //----- Undelta: In-place prefix sum (min. Delta = 0) -------------------
 #define DD(i) _ip[i] = (start += _ip[i] + _md);
-#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const _md = _md_;\
+#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const int _md = _md_;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { DD(0); DD(1); DD(2); DD(3); }\
   for(;_ip != _in_+_n_; _ip++) DD(0);\
 }
@@ -240,7 +240,7 @@ uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
 uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,*op = out,u,d,startd=0; BITZDE(uint64_t, in, n, mindelta, 64,o |= u;*op++ = u); return o;}
 #define ZDD(i) u = _ip[i]; d = u - start; _ip[i] = zigzagdec64(u)+(int64_t)startd+_md; startd = d; start = u
-#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const _md = _md_;\
+#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const int _md = _md_;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZDD(0); ZDD(1); ZDD(2); ZDD(3); }\
   for(;_ip != _in_+_n_; _ip++) ZDD(0);\
 }
@@ -443,7 +443,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
     vo1 = _mm_or_si128(vo1, v1);
     vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
     vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-  } start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
+  } start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
 #else
@@ -488,7 +488,7 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
     vo1 = _mm_or_si128(vo1, v1);
     vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
     vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-  } start = _mm_cvtsi128_si16(_mm_srli_si128(vs,12));
+  } start = mm_cvtsi128_si16(_mm_srli_si128(vs,12));
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
 #else
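
The three diffs above are cleanups plus one portability guard: bitpack.c moves PREFETCH off the `if` line so it no longer reads as conditional; bitunpack.c stops defining the 32-bit `_mm_cvtsi64_si128` shim under clang, presumably because clang already provides that intrinsic (or rejects its redefinition) on i386; bitutil.c adds the `int` that `const _md = _md_;` was missing — implicit int has been invalid since C99 — and switches call sites to the renamed `mm_cvtsi128_si16` helper introduced in bitutil.h below. For reference, a minimal standalone sketch of what the BITDD "undelta" macro computes for 32-bit elements (hypothetical `undelta32`, not part of the library):

    #include <stdint.h>

    /* In-place prefix sum with a constant minimum delta added back to
       every element, i.e. the scalar core of BITDD/DD(i): */
    static void undelta32(uint32_t *in, unsigned n, uint32_t start, int md) {
      for(unsigned i = 0; i < n; i++)
        in[i] = (start += in[i] + md);
    }
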
diff --git a/bitutil.h b/bitutil.h
index 7428060..e311b41 100644
--- a/bitutil.h
+++ b/bitutil.h
@@ -187,9 +187,9 @@ static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _m
 #define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
 #define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
 
-//---------------- Convert _mm_cvtsi128_siXX -------------------------------------------
-static ALWAYS_INLINE uint8_t  _mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
-static ALWAYS_INLINE uint16_t _mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
+//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
+static ALWAYS_INLINE uint8_t  mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
+static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
 #endif
 
 //--------- memset -----------------------------------------
diff --git a/fp.c b/fp.c
index e181d58..36f4852 100644
--- a/fp.c
+++ b/fp.c
@@ -155,7 +155,7 @@ size_t TEMPLATE2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
       sv = TEMPLATE2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p,                sv); sv = v0;
       sv = TEMPLATE2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1;
     }
-    start = (uint_t)TEMPLATE2(_mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
+    start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
     b = TEMPLATE2(mm_hor_epi, USIZE)(bv);
 #else
     for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
@@ -240,7 +240,7 @@ size_t TEMPLATE2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
       _mm_storeu_si128((__m128i *) op,                v0);
       _mm_storeu_si128((__m128i *)(op+16/(USIZE/8)), sv);
     }
-    start = (uint_t)TEMPLATE2(_mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
+    start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
 #else
     for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
 #endif
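
Dropping the leading underscore matters beyond style: `_mm_`-prefixed identifiers belong to the compiler's intrinsics namespace, so a project-local `_mm_cvtsi128_si16` risks colliding with a header-provided definition. fp.c's `TEMPLATE2(mm_cvtsi128_si,USIZE)` call sites are updated to match (presumably `mm_cvtsi128_si32/64` aliases exist elsewhere so the token pasting still resolves for wider USIZE). The helper itself is the usual last-lane carry extraction: shift the top element down to lane 0, then move it to a scalar. A minimal sketch for 16-bit lanes, where the last of 8 lanes sits at byte offset 14:

    #include <stdint.h>
    #include <emmintrin.h>

    /* Extract the highest 16-bit lane of v: bytes 14..15 -> lane 0,
       then a scalar move; exactly what mm_cvtsi128_si16(_mm_srli_si128(v,14))
       does above. */
    static inline uint16_t last_lane_u16(__m128i v) {
      return (uint16_t)_mm_cvtsi128_si32(_mm_srli_si128(v, 14));
    }
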
diff --git a/icapp.c b/icapp.c
index 0509f18..75bfe94 100644
--- a/icapp.c
+++ b/icapp.c
@@ -1141,7 +1141,7 @@ unsigned bench8(unsigned char *in, unsigned n, unsigned char *out, unsigned char
       default: goto end;
     }
     if(l) {
-      char s[65]; printf("%-35 ",  bestr(id, 8, s, codstr(codid), codlev));
+      char s[65]; printf("%-35s ", bestr(id, 8, s, codstr(codid), codlev));
       if(cpy) rc = memcheck(in,m*(USIZE),cpy);
       if(!rc) printf("\t%s\n", inname?inname:"");
diff --git a/time_.h b/time_.h
index 4f32b26..d6a3233 100644
--- a/time_.h
+++ b/time_.h
@@ -51,7 +51,7 @@ typedef struct timespec tm_t;
 #ifdef __corei7__
 #define RDTSC_INI(_c_) do { unsigned _cl, _ch; \
-  __asm volatile ("couid\n\t" \
+  __asm volatile ("cpuid\n\t" \
                   "rdtsc\n\t" \
                   "mov %%edx, %0\n" \
                   "mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \
@@ -142,11 +142,11 @@ static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
 #endif
 
 //---------------------------------------- bench ----------------------------------------------------------------------
-// for each a function call is repeated until exceding tm_tx seconds.
+// for each a function call is repeated until exceeding tm_tx seconds.
 // A run duration is always tm_tx seconds
 // The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision)
-// sleep after each 8 runs to avoid cpu trottling.
+// sleep after each 8 runs to avoid cpu throttling.
 #define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0)
 
 // benchmark loop
@@ -160,11 +160,11 @@ static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
   /*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\
   /*other runs: break the loop only after 'tm_rm' repeats */ \
   _tm_t = tmdiff(_tm_t0, tmtime());\
-  /*set min time, recalculte repeats tm_rm based on tm_tx, recalculte number of runs based on tm_TX*/\
+  /*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\
   if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("[%d,%d] ", tm_rm, _tm_Rn);*/ } tm_tm = _tm_t; _tm_c++; }\
   else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\
   if(tm_verbose) { printf("%8.2f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\
-  if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu trottling*/\
+  if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\
  }\
 }
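
In icapp.c, `"%-35 "` has a field width but no conversion specifier, which is undefined behavior in printf, so the benchmark name from `bestr(...)` was never printed; `"%-35s"` is what the argument needs. In time_.h, `couid` is not an instruction and would fail to assemble on any `__corei7__` build; `cpuid` is there precisely to serialize the pipeline before `rdtsc`. A standalone sketch of that idiom, assuming GCC/Clang inline asm on x86-64 (hypothetical `rdtsc_serialized`, simplified from the RDTSC_INI macro):

    #include <stdint.h>

    /* cpuid is a serializing instruction: it keeps rdtsc from being
       reordered before the code being timed. The clobbers mirror
       cpuid's outputs (eax, ebx, ecx, edx). */
    static inline uint64_t rdtsc_serialized(void) {
      unsigned lo, hi;
      __asm volatile("xorl %%eax, %%eax\n\t"   /* select cpuid leaf 0 */
                     "cpuid\n\t"               /* serialize */
                     "rdtsc\n\t"               /* edx:eax = time-stamp counter */
                     "movl %%edx, %0\n\t"
                     "movl %%eax, %1\n\t"
                     : "=r"(hi), "=r"(lo)
                     :: "%rax", "%rbx", "%rcx", "%rdx");
      return ((uint64_t)hi << 32) | lo;
    }
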
diff --git a/transpose.c b/transpose.c
index 2db63cc..5e50c80 100644
--- a/transpose.c
+++ b/transpose.c
@@ -596,7 +596,7 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
   for(ip = in,op = out; ip != in+v; ip += ESIZE*32, op += ESIZE*32/STRIDE) { unsigned char *p = op;
     PREFETCH(ip+ESIZE*192,0);
-    __m256i iv[ESIZE],ov[ESIZE];
+    __m256i iv[ESIZE],ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if ESIZE == 2
     ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv0);
     ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);
@@ -724,7 +724,7 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 #endif
   for(op = out,ip = in; op != out+v; ip += ESIZE*32/STRIDE, op += ESIZE*32) { unsigned char *p = ip;
     PREFETCH(ip+ESIZE*192,0);
-    __m256i iv[ESIZE], ov[ESIZE];
+    __m256i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if STRIDE > ESIZE
     NBL0(0,1); NBL( 2,3); NB(0,1,iv[0]); NB(2,3,iv[1]);
@@ -842,7 +842,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 #endif
   for(ip = in, op = out; ip != in+v; ip+=ESIZE*16,op += ESIZE*16/STRIDE) { unsigned char *p = op;
     PREFETCH(ip+(ESIZE*16)*ESIZE,0);
-    __m128i iv[ESIZE],ov[ESIZE];
+    __m128i iv[ESIZE],ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if defined(__SSSE3__) || defined(__ARM_NEON)
 #if ESIZE == 2
 #ifdef __ARM_NEON
@@ -1100,7 +1100,7 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
   for(op = out,ip = in; op != out+v; op+=ESIZE*16,ip += ESIZE*16/STRIDE) { unsigned char *p=ip;
     PREFETCH(ip+(ESIZE*16/STRIDE)*ESIZE,0);
-    __m128i iv[ESIZE], ov[ESIZE];
+    __m128i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if STRIDE > ESIZE
   //------------ Nibble transpose -------------------
     ov[0] = _mm_loadl_epi64((__m128i *) p         );
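
All four transpose.c hunks apply the same fix: the ESIZE == 2 code paths use more `ov[]` temporaries than the old `ov[ESIZE]` declaration provided, overflowing the on-stack scratch array; the ternary sizes it to 4 slots for that case and leaves the other element sizes unchanged. A hypothetical reduction of the bug class (the unpack ops are illustrative, not the library's actual shuffle sequence):

    #include <immintrin.h>

    #define ESIZE 2
    /* Sketch of the corrected sizing: the 2-byte path below needs
       four __m256i slots, so ov[ESIZE] (2 slots) was too small. */
    void tp2_sketch(const unsigned char *ip, unsigned char *op) {
      __m256i ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];     /* 4 slots when ESIZE == 2 */
      ov[0] = _mm256_loadu_si256((const __m256i *) ip);
      ov[1] = _mm256_loadu_si256((const __m256i *)(ip + 32));
      ov[2] = _mm256_unpacklo_epi64(ov[0], ov[1]);    /* writes like these */
      ov[3] = _mm256_unpackhi_epi64(ov[0], ov[1]);    /* overflowed ov[ESIZE] */
      _mm256_storeu_si256((__m256i *) op,       ov[2]);
      _mm256_storeu_si256((__m256i *)(op + 32), ov[3]);
    }
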
diff --git a/trlec.c b/trlec.c
index 0ffdc05..9fbd1e3 100644
--- a/trlec.c
+++ b/trlec.c
@@ -79,7 +79,7 @@ static unsigned cntcalc32(const unsigned char *__restrict in, unsigned inlen, cn
 #define PUTE(_op_, _e_) do { PUTC(_op_, _e_); vlput32(_op_, 0); } while(0)
 
 #define SZ64 if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
-#define SZ32 if((z = (ctou32(ip) ^ ctou32(ip+1)))) break; ip += 4;
+#define SZ32 if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
 
 #define SRLEPUT8(_pp_, _ip_, _e_, _op_) do {\
   unsigned _r = (_ip_ - _pp_)+1;\
@@ -252,7 +252,8 @@ unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char
       TRLEPUT(pp, ip, m, rmap, op);
       pp = ++ip;
     }
-  if(ip < ie) PUTC(op, *ip++); AS(ip == ie, "Fatal ip>ie=%d ", (int)(ip-ie));
+  if(ip < ie) PUTC(op, *ip++);
+  AS(ip == ie, "Fatal ip>ie=%d ", (int)(ip-ie));
   if(op - out < inlen)
     return op - out;                           // RETURN length = rle
diff --git a/trled.c b/trled.c
index f886066..d156834 100644
--- a/trled.c
+++ b/trled.c
@@ -355,14 +355,22 @@ unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned c
       op += r; ip += (r+1)*sizeof(uint_t);
       PREFETCH(ip+512, 0);
 #else
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
       continue;
       a: ip += sizeof(uint_t); PREFETCH(ip +512, 0);
 #endif
diff --git a/vp4c.c b/vp4c.c
index f9237f2..4a0b799 100644
--- a/vp4c.c
+++ b/vp4c.c
@@ -359,7 +359,8 @@ unsigned char *TEMPLATE2(_P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsig
 #if HYBRID > 0 && USIZE >= 16
     if(bx <= USIZE) {
 #endif
-      for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i]; out += PAD8(n); //if(eqx == xn && bx) { out[-1] |=0x80; TEMPLATE2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else
+      for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i];                 //if(eqx == xn && bx) { out[-1] |=0x80; TEMPLATE2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else
+      out += PAD8(n);
       out = TEMPLATE2(bitpack, USIZE)(inx, xn, out, bx);                        //if(eq == n && b) { out[-1]|= 0x80; TEMPLATE2(ctou, USIZE)(out)=a; out += (b+7)/8; } else
       out = TEMPLATE2(BITPACK, USIZE)(_in, n, out, b);
 #if HYBRID > 0 && USIZE >= 16
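
The one substantive fix above is trlec.c's SZ32: a `break` would leave whatever loop the macro is expanded into instead of jumping to the `a:` handler that SZ64 already targets, so the 32-bit zero-run scan diverged from the 64-bit one; `goto a` makes both macros converge on the same label. The trlec.c, trled.c, and vp4c.c reflows split `if(cond) stmt; more;` one-liners so the unconditional tail stops masquerading as part of the `if` — the layout gcc's -Wmisleading-indentation warns about. In miniature, with hypothetical names:

    #include <stdint.h>

    /* Only the goto is guarded; the copy always runs. Putting the
       tail on its own line makes that visible. */
    static unsigned srled_sketch(const uint8_t *ip, const uint8_t *ie,
                                 uint8_t *out, uint8_t e) {
      uint8_t *op = out, c;
      while(ip < ie) {
        if((c = *ip) == e) goto a;  /* escape byte: a run header follows */
        ip++; *op++ = c;            /* unconditional: copy one literal  */
        continue;
      a: ip++;                      /* the real decoder expands the run here */
      }
      return (unsigned)(op - out);
    }
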
diff --git a/vsimple.c b/vsimple.c
index 0eccc6d..ec9ad65 100644
--- a/vsimple.c
+++ b/vsimple.c
@@ -449,7 +449,8 @@ unsigned char *TEMPLATE2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, u
       unsigned b = ((*ip++) >> 5)+1;
       *op = *(unsigned long long *)ip;
       if(unlikely(b!=8))
-        *op &= (1ull<<(b*8))-1; op++; ip += b;
+        *op &= (1ull<<(b*8))-1;
+      op++; ip += b;
       break;
     }
 #endif
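
vsimple.c gets the same one-liner split: `op++; ip += b;` was already outside the `if`, but the old layout suggested otherwise. Worth noting why the `b != 8` guard exists at all: the mask `(1ull<<(b*8))-1` keeps the low `b` bytes of the 8-byte load, but for b == 8 the shift count would be 64, which is undefined behavior on a 64-bit operand in C — the guard is correctness, not just a fast path. A standalone sketch of the decode step (memcpy used for the unaligned load instead of the original pointer cast):

    #include <stdint.h>
    #include <string.h>

    /* Load 8 bytes, keep only the low b (1..8); b == 8 must skip the
       mask because 1ull << 64 is undefined. Little-endian byte order
       assumed, as in the original cast-based load. */
    static uint64_t load_low_bytes(const unsigned char *p, unsigned b) {
      uint64_t v;
      memcpy(&v, p, sizeof v);
      if(b != 8)
        v &= (1ull << (b * 8)) - 1;
      return v;
    }
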