Merge pull request #82 from pps83/master
Multiple compilation fixes for VS 2022
@@ -179,7 +179,8 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 for(ip = in, in += n; ip < in;) { \
 TEMPLATE3(uint, _usize_, _t) o,x;\
 unsigned iplen = in - ip,b; \
-if(iplen > _csize_) iplen = _csize_; PREFETCH(ip+512,0);\
+if(iplen > _csize_) iplen = _csize_;\
+PREFETCH(ip+512,0);\
 o = TEMPLATE2(bit,_usize_)(ip, iplen, &x); b = TEMPLATE2(bsr,_usize_)(o);\
 *op++ = b; op = TEMPLATE2(bitpacka, _usize_)[b](ip, iplen, op);\
 ip += iplen;\
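Note: PREFETCH here is the library's portability wrapper around the compiler's prefetch primitive; the hunk above merely splits the conditional and the prefetch onto separate lines. A minimal sketch of such a wrapper (illustrative, not the repo's exact definition):

#ifdef _MSC_VER
  #include <intrin.h>
  /* MSVC has no __builtin_prefetch; _mm_prefetch ignores the rw hint */
  #define PREFETCH(_p_, _rw_) _mm_prefetch((const char *)(_p_), _MM_HINT_T0)
#else
  #define PREFETCH(_p_, _rw_) __builtin_prefetch((_p_), (_rw_))
#endif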
@@ -156,7 +156,7 @@ size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restri
 #define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
 #define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
 #else
-#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
+#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86)) && !defined(__clang__)
 static inline __m128i _mm_cvtsi64_si128(__int64 a) { return _mm_loadl_epi64((__m128i*)&a); }
 #endif
 static ALIGNED(unsigned char, permv[256][8], 32) = {
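Note: the added !defined(__clang__) term is needed because clang (including clang-cl) ships its own _mm_cvtsi64_si128 in its intrinsic headers, so a second static inline definition with the same name is an error; 32-bit MSVC lacks the intrinsic, hence the fallback. A simplified sketch of the guarded-fallback pattern (the clang-header behavior is an assumption, not verified against every version):

#include <emmintrin.h>
/* define the fallback only where the compiler does not already supply it */
#if defined(_M_IX86) && !defined(__clang__)
static inline __m128i _mm_cvtsi64_si128(__int64 a) { return _mm_loadl_epi64((__m128i*)&a); }
#endif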
10 bitutil.c
@@ -122,7 +122,7 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 vo1 = _mm_or_si128(vo1, v1);
 vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
 vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
+} start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
 vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
 vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
 #else
@@ -183,7 +183,7 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 
 //----- Undelta: In-place prefix sum (min. Delta = 0) -------------------
 #define DD(i) _ip[i] = (start += _ip[i] + _md);
-#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const _md = _md_;\
+#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const int _md = _md_;\
 for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { DD(0); DD(1); DD(2); DD(3); }\
 for(;_ip != _in_+_n_; _ip++) DD(0);\
 }
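Note: the BITDD fix addresses an implicit-int declaration: const _md = _md_; declares _md with no type, which was legal in C89 but removed in C99, and VS 2022's C compiler rejects it. A two-line illustration (hypothetical stand-alone file):

int main(void) {
  /* const _md = 1; */  /* implicit int: rejected by VS 2022 */
  const int _md = 1;    /* explicit type: compiles under every standard */
  return (int)_md - 1;  /* returns 0 */
}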
@@ -240,7 +240,7 @@ uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
 uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,*op = out,u,d,startd=0; BITZDE(uint64_t, in, n, mindelta, 64,o |= u;*op++ = u); return o;}
 
 #define ZDD(i) u = _ip[i]; d = u - start; _ip[i] = zigzagdec64(u)+(int64_t)startd+_md; startd = d; start = u
-#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const _md = _md_;\
+#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const int _md = _md_;\
 for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZDD(0); ZDD(1); ZDD(2); ZDD(3); }\
 for(;_ip != _in_+_n_; _ip++) ZDD(0);\
 }
@@ -443,7 +443,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 vo1 = _mm_or_si128(vo1, v1);
 vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
 vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
+} start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
 vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
 vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
 #else
@@ -488,7 +488,7 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
 vo1 = _mm_or_si128(vo1, v1);
 vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
 vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,12));
+} start = mm_cvtsi128_si16(_mm_srli_si128(vs,12));
 vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
 vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
 #else
@@ -187,9 +187,9 @@ static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _m
 #define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
 #define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
 
-//---------------- Convert _mm_cvtsi128_siXX -------------------------------------------
-static ALWAYS_INLINE uint8_t  _mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
-static ALWAYS_INLINE uint16_t _mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
+//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
+static ALWAYS_INLINE uint8_t  mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
+static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
 #endif
 
 //--------- memset -----------------------------------------
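Note: these helpers previously used the reserved _mm_ intrinsic namespace, which collides with names VS 2022's headers define; dropping the leading underscore sidesteps the conflict. A small self-contained usage check (hypothetical demo, not from the repo):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* library-local names no longer shadow compiler intrinsics */
static inline uint8_t  mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
static inline uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }

int main(void) {
  __m128i v = _mm_set_epi32(0, 0, 0, 0x1234);
  printf("%x %x\n", mm_cvtsi128_si8(v), mm_cvtsi128_si16(v)); /* prints: 34 1234 */
  return 0;
}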
4 fp.c
@@ -155,7 +155,7 @@ size_t TEMPLATE2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
 sv = TEMPLATE2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p, sv); sv = v0;
 sv = TEMPLATE2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1;
 }
-start = (uint_t)TEMPLATE2(_mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
+start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
 b = TEMPLATE2(mm_hor_epi, USIZE)(bv);
 #else
 for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
@@ -240,7 +240,7 @@ size_t TEMPLATE2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
 _mm_storeu_si128((__m128i *) op, v0);
 _mm_storeu_si128((__m128i *)(op+16/(USIZE/8)), sv);
 }
-start = (uint_t)TEMPLATE2(_mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
+start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
 #else
 for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
 #endif
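Note: the call sites rely on TEMPLATE2 to paste a name and the element width, e.g. TEMPLATE2(mm_cvtsi128_si,USIZE) expands to mm_cvtsi128_si32 when USIZE is 32, so the rename above automatically fixes every width. A minimal sketch of such a two-level paste macro (the repo's actual definition may differ):

#define TEMPLATE2_(_x_, _y_) _x_##_y_
#define TEMPLATE2(_x_, _y_)  TEMPLATE2_(_x_, _y_)  /* extra level so USIZE expands first */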
2 icapp.c
@@ -1141,7 +1141,7 @@ unsigned bench8(unsigned char *in, unsigned n, unsigned char *out, unsigned char
 default: goto end;
 }
 if(l) {
-char s[65]; printf("%-35 ", bestr(id, 8, s, codstr(codid), codlev));
+char s[65]; printf("%-35s ", bestr(id, 8, s, codstr(codid), codlev));
 if(cpy) rc = memcheck(in,m*(USIZE),cpy);
 if(!rc)
 printf("\t%s\n", inname?inname:"");
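Note: this is a genuine output bug, not just a build break: "%-35 " contains no conversion specifier, so the name built by bestr() was not printed correctly; "%-35s" left-justifies it in a 35-character field. For example:

#include <stdio.h>
int main(void) {
  printf("%-35s|\n", "copy");  /* "copy" left-justified in a 35-char field, then '|' */
  return 0;
}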
10 time_.h
@@ -51,7 +51,7 @@ typedef struct timespec tm_t;
 
 #ifdef __corei7__
 #define RDTSC_INI(_c_) do { unsigned _cl, _ch; \
-__asm volatile ("couid\n\t" \
+__asm volatile ("cpuid\n\t" \
 "rdtsc\n\t" \
 "mov %%edx, %0\n" \
 "mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \
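Note: besides the couid -> cpuid typo fix, this macro implements the classic serialized timestamp read: cpuid acts as an ordering barrier so rdtsc cannot execute before earlier instructions. A self-contained sketch of the same sequence (x86-64 GCC/Clang inline asm, clobbers per the usual Intel benchmarking recipe):

#include <stdint.h>

static inline uint64_t tsc_read(void) {
  unsigned lo, hi;
  __asm__ volatile ("cpuid\n\t"         /* serialize */
                    "rdtsc\n\t"         /* counter -> edx:eax */
                    "mov %%edx, %0\n\t"
                    "mov %%eax, %1\n\t"
                    : "=r"(hi), "=r"(lo) :: "%rax", "%rbx", "%rcx", "%rdx");
  return ((uint64_t)hi << 32) | lo;
}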
@@ -142,11 +142,11 @@ static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
 #endif
 
 //---------------------------------------- bench ----------------------------------------------------------------------
-// for each a function call is repeated until exceding tm_tx seconds.
+// for each a function call is repeated until exceeding tm_tx seconds.
 // A run duration is always tm_tx seconds
 // The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision)
 
-// sleep after each 8 runs to avoid cpu trottling.
+// sleep after each 8 runs to avoid cpu throttling.
 #define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0)
 
 // benchmark loop
@@ -160,11 +160,11 @@ static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
 /*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\
 /*other runs: break the loop only after 'tm_rm' repeats */ \
 _tm_t = tmdiff(_tm_t0, tmtime());\
-/*set min time, recalculte repeats tm_rm based on tm_tx, recalculte number of runs based on tm_TX*/\
+/*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\
 if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("[%d,%d] ", tm_rm, _tm_Rn);*/ } tm_tm = _tm_t; _tm_c++; }\
 else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\
 if(tm_verbose) { printf("%8.2f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\
-if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu trottling*/\
+if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\
 }\
 }
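Note: the comments describe the measurement strategy: repeat the call until tm_tx seconds elapse, run that several times (sleeping periodically to avoid throttling), and keep the fastest run. A stripped-down sketch of that pattern (illustrative; the real macro also recalculates repeats and forces sleeps on divergence):

#include <float.h>
#include <time.h>

static double bench_min(void (*fn)(void), int runs, long reps) {
  double best = DBL_MAX;
  for(int r = 0; r < runs; r++) {
    clock_t t0 = clock();
    for(long i = 0; i < reps; i++) fn();
    double t = (double)(clock() - t0) / CLOCKS_PER_SEC;
    if(t < best) best = t;                /* keep the fastest run */
  }
  return best / reps;                     /* seconds per call */
}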
@@ -596,7 +596,7 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 
 for(ip = in,op = out; ip != in+v; ip += ESIZE*32, op += ESIZE*32/STRIDE) {
 unsigned char *p = op; PREFETCH(ip+ESIZE*192,0);
-__m256i iv[ESIZE],ov[ESIZE];
+__m256i iv[ESIZE],ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if ESIZE == 2
 ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv0);
 ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);
@@ -724,7 +724,7 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 #endif
 
 for(op = out,ip = in; op != out+v; ip += ESIZE*32/STRIDE, op += ESIZE*32) { unsigned char *p = ip; PREFETCH(ip+ESIZE*192,0);
-__m256i iv[ESIZE], ov[ESIZE];
+__m256i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 
 #if STRIDE > ESIZE
 NBL0(0,1); NBL( 2,3); NB(0,1,iv[0]); NB(2,3,iv[1]);
@@ -842,7 +842,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 #endif
 
 for(ip = in, op = out; ip != in+v; ip+=ESIZE*16,op += ESIZE*16/STRIDE) { unsigned char *p = op; PREFETCH(ip+(ESIZE*16)*ESIZE,0);
-__m128i iv[ESIZE],ov[ESIZE];
+__m128i iv[ESIZE],ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if defined(__SSSE3__) || defined(__ARM_NEON)
 #if ESIZE == 2
 #ifdef __ARM_NEON
@@ -1100,7 +1100,7 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 
 for(op = out,ip = in; op != out+v; op+=ESIZE*16,ip += ESIZE*16/STRIDE) {
 unsigned char *p=ip; PREFETCH(ip+(ESIZE*16/STRIDE)*ESIZE,0);
-__m128i iv[ESIZE], ov[ESIZE];
+__m128i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 
 #if STRIDE > ESIZE //------------ Nibble transpose -------------------
 ov[0] = _mm_loadl_epi64((__m128i *) p );
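Note: all four ov[...] hunks apply the same fix: in the ESIZE == 2 path the code evidently fills four vectors, so an array of only ESIZE elements overruns, which VS 2022 diagnoses. A hypothetical stand-alone illustration (AVX2; the shuffle/unpack steps are stand-ins for the real transpose):

#include <immintrin.h>
enum { ESIZE = 2 };

void demo(const unsigned char *ip, __m256i sv0, __m256i sv1) {
  __m256i ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];  /* 4 slots when ESIZE == 2 */
  ov[0] = _mm256_shuffle_epi8(_mm256_loadu_si256((const __m256i *) ip    ), sv0);
  ov[1] = _mm256_shuffle_epi8(_mm256_loadu_si256((const __m256i *)(ip+32)), sv1);
  ov[2] = _mm256_unpacklo_epi64(ov[0], ov[1]); /* writes past ov[1]: needs the larger array */
  ov[3] = _mm256_unpackhi_epi64(ov[0], ov[1]);
  (void)ov;
}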
5 trlec.c
@@ -79,7 +79,7 @@ static unsigned cntcalc32(const unsigned char *__restrict in, unsigned inlen, cn
 #define PUTE(_op_, _e_) do { PUTC(_op_, _e_); vlput32(_op_, 0); } while(0)
 
 #define SZ64 if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
-#define SZ32 if((z = (ctou32(ip) ^ ctou32(ip+1)))) break; ip += 4;
+#define SZ32 if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
 
 #define SRLEPUT8(_pp_, _ip_, _e_, _op_) do {\
 unsigned _r = (_ip_ - _pp_)+1;\
@@ -252,7 +252,8 @@ unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char
 TRLEPUT(pp, ip, m, rmap, op);
 pp = ++ip;
 }
-if(ip < ie) PUTC(op, *ip++); AS(ip == ie, "Fatal ip>ie=%d ", (int)(ip-ie));
+if(ip < ie) PUTC(op, *ip++);
+AS(ip == ie, "Fatal ip>ie=%d ", (int)(ip-ie));
 
 if(op - out < inlen)
 return op - out; // RETURN length = rle
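Note: splitting AS(...) onto its own line is clarity rather than a behavior change: PUTC is presumably a do { ... } while(0)-style macro (PUTE above wraps it the same way), so in the one-line form the assertion already sat outside the if and ran unconditionally. The usual macro shape (hypothetical definition):

#define PUTC(_op_, _c_) do { *(_op_)++ = (_c_); } while(0)  /* behaves as one statement in if/else */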
24 trled.c
@@ -355,14 +355,22 @@ unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned c
 op += r;
 ip += (r+1)*sizeof(uint_t); PREFETCH(ip+512, 0);
 #else
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
 continue;
 a: ip += sizeof(uint_t); PREFETCH(ip +512, 0);
 #endif
3 vp4c.c
@@ -359,7 +359,8 @@ unsigned char *TEMPLATE2(_P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsig
 #if HYBRID > 0 && USIZE >= 16
 if(bx <= USIZE) {
 #endif
-for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i]; out += PAD8(n); //if(eqx == xn && bx) { out[-1] |=0x80; TEMPLATE2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else
+for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i]; //if(eqx == xn && bx) { out[-1] |=0x80; TEMPLATE2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else
+out += PAD8(n);
 out = TEMPLATE2(bitpack, USIZE)(inx, xn, out, bx); //if(eq == n && b) { out[-1]|= 0x80; TEMPLATE2(ctou, USIZE)(out)=a; out += (b+7)/8; } else
 out = TEMPLATE2(BITPACK, USIZE)(_in, n, out, b);
 #if HYBRID > 0 && USIZE >= 16