Merge pull request #82 from pps83/master
Multiple compilation fixes for VS 2022
@@ -179,7 +179,8 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
 for(ip = in, in += n; ip < in;) { \
 TEMPLATE3(uint, _usize_, _t) o,x;\
 unsigned iplen = in - ip,b; \
-if(iplen > _csize_) iplen = _csize_; PREFETCH(ip+512,0);\
+if(iplen > _csize_) iplen = _csize_;\
+PREFETCH(ip+512,0);\
 o = TEMPLATE2(bit,_usize_)(ip, iplen, &x); b = TEMPLATE2(bsr,_usize_)(o);\
 *op++ = b; op = TEMPLATE2(bitpacka, _usize_)[b](ip, iplen, op);\
 ip += iplen;\
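Note: PREFETCH here is the library's portability wrapper around the compiler's prefetch primitive; the hunk above merely splits the conditional and the prefetch onto separate lines. A minimal sketch of such a wrapper (illustrative, not the repo's exact definition):

#ifdef _MSC_VER
  #include <intrin.h>
  /* MSVC has no __builtin_prefetch; _mm_prefetch ignores the rw hint */
  #define PREFETCH(_p_, _rw_) _mm_prefetch((const char *)(_p_), _MM_HINT_T0)
#else
  #define PREFETCH(_p_, _rw_) __builtin_prefetch((_p_), (_rw_))
#endif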
@@ -156,7 +156,7 @@ size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restri
 #define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
 #define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
 #else
-#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
+#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86)) && !defined(__clang__)
 static inline __m128i _mm_cvtsi64_si128(__int64 a) { return _mm_loadl_epi64((__m128i*)&a); }
 #endif
 static ALIGNED(unsigned char, permv[256][8], 32) = {
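Note: the added !defined(__clang__) term is needed because clang (including clang-cl) ships its own _mm_cvtsi64_si128 in its intrinsic headers, so a second static inline definition with the same name is an error; 32-bit MSVC lacks the intrinsic, hence the fallback. A simplified sketch of the guarded-fallback pattern (the clang-header behavior is an assumption, not verified against every version):

#include <emmintrin.h>
/* define the fallback only where the compiler does not already supply it */
#if defined(_M_IX86) && !defined(__clang__)
static inline __m128i _mm_cvtsi64_si128(__int64 a) { return _mm_loadl_epi64((__m128i*)&a); }
#endif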
10 bitutil.c
@@ -122,7 +122,7 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 vo1 = _mm_or_si128(vo1, v1);
 vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
 vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
+} start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
 vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
 vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
 #else
@@ -183,7 +183,7 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 
 //----- Undelta: In-place prefix sum (min. Delta = 0) -------------------
 #define DD(i) _ip[i] = (start += _ip[i] + _md);
-#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const _md = _md_;\
+#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const int _md = _md_;\
 for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { DD(0); DD(1); DD(2); DD(3); }\
 for(;_ip != _in_+_n_; _ip++) DD(0);\
 }
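Note: the BITDD fix addresses an implicit-int declaration: const _md = _md_; declares _md with no type, which was legal in C89 but removed in C99, and VS 2022's C compiler rejects it. A two-line illustration (hypothetical stand-alone file):

int main(void) {
  /* const _md = 1; */  /* implicit int: rejected by VS 2022 */
  const int _md = 1;    /* explicit type: compiles under every standard */
  return (int)_md - 1;  /* returns 0 */
}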
@@ -240,7 +240,7 @@ uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
 uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,*op = out,u,d,startd=0; BITZDE(uint64_t, in, n, mindelta, 64,o |= u;*op++ = u); return o;}
 
 #define ZDD(i) u = _ip[i]; d = u - start; _ip[i] = zigzagdec64(u)+(int64_t)startd+_md; startd = d; start = u
-#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const _md = _md_;\
+#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const int _md = _md_;\
 for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZDD(0); ZDD(1); ZDD(2); ZDD(3); }\
 for(;_ip != _in_+_n_; _ip++) ZDD(0);\
 }
@@ -443,7 +443,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 vo1 = _mm_or_si128(vo1, v1);
 vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
 vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
+} start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
 vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
 vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
 #else
@@ -488,7 +488,7 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
 vo1 = _mm_or_si128(vo1, v1);
 vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
 vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-} start = _mm_cvtsi128_si16(_mm_srli_si128(vs,12));
+} start = mm_cvtsi128_si16(_mm_srli_si128(vs,12));
 vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
 vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
 #else
@@ -187,9 +187,9 @@ static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _m
 #define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
 #define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
 
-//---------------- Convert _mm_cvtsi128_siXX -------------------------------------------
-static ALWAYS_INLINE uint8_t  _mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
-static ALWAYS_INLINE uint16_t _mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
+//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
+static ALWAYS_INLINE uint8_t  mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
+static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
 #endif
 
 //--------- memset -----------------------------------------
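Note: these helpers previously used the reserved _mm_ intrinsic namespace, which collides with names VS 2022's headers define; dropping the leading underscore sidesteps the conflict. A small self-contained usage check (hypothetical demo, not from the repo):

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

/* library-local names no longer shadow compiler intrinsics */
static inline uint8_t  mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
static inline uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }

int main(void) {
  __m128i v = _mm_set_epi32(0, 0, 0, 0x1234);
  printf("%x %x\n", mm_cvtsi128_si8(v), mm_cvtsi128_si16(v)); /* prints: 34 1234 */
  return 0;
}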
4 fp.c
@@ -155,7 +155,7 @@ size_t TEMPLATE2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
 sv = TEMPLATE2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p, sv); sv = v0;
 sv = TEMPLATE2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1;
 }
-start = (uint_t)TEMPLATE2(_mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
+start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
 b = TEMPLATE2(mm_hor_epi, USIZE)(bv);
 #else
 for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
@@ -240,7 +240,7 @@ size_t TEMPLATE2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
 _mm_storeu_si128((__m128i *) op, v0);
 _mm_storeu_si128((__m128i *)(op+16/(USIZE/8)), sv);
 }
-start = (uint_t)TEMPLATE2(_mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
+start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
 #else
 for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
 #endif
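Note: the call sites rely on TEMPLATE2 to paste a name and the element width, e.g. TEMPLATE2(mm_cvtsi128_si,USIZE) expands to mm_cvtsi128_si32 when USIZE is 32, so the rename above automatically fixes every width. A minimal sketch of such a two-level paste macro (the repo's actual definition may differ):

#define TEMPLATE2_(_x_, _y_) _x_##_y_
#define TEMPLATE2(_x_, _y_)  TEMPLATE2_(_x_, _y_)  /* extra level so USIZE expands first */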
2 icapp.c
@@ -1141,7 +1141,7 @@ unsigned bench8(unsigned char *in, unsigned n, unsigned char *out, unsigned char
 default: goto end;
 }
 if(l) {
-char s[65]; printf("%-35 ", bestr(id, 8, s, codstr(codid), codlev));
+char s[65]; printf("%-35s ", bestr(id, 8, s, codstr(codid), codlev));
 if(cpy) rc = memcheck(in,m*(USIZE),cpy);
 if(!rc)
 printf("\t%s\n", inname?inname:"");
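Note: this is a genuine output bug, not just a build break: "%-35 " contains no conversion specifier, so the name built by bestr() was not printed correctly; "%-35s" left-justifies it in a 35-character field. For example:

#include <stdio.h>
int main(void) {
  printf("%-35s|\n", "copy");  /* "copy" left-justified in a 35-char field, then '|' */
  return 0;
}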
10 time_.h
@@ -51,7 +51,7 @@ typedef struct timespec tm_t;
 
 #ifdef __corei7__
 #define RDTSC_INI(_c_) do { unsigned _cl, _ch; \
-__asm volatile ("couid\n\t" \
+__asm volatile ("cpuid\n\t" \
 "rdtsc\n\t" \
 "mov %%edx, %0\n" \
 "mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \
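Note: besides the couid -> cpuid typo fix, this macro implements the classic serialized timestamp read: cpuid acts as an ordering barrier so rdtsc cannot execute before earlier instructions. A self-contained sketch of the same sequence (x86-64 GCC/Clang inline asm, clobbers per the usual Intel benchmarking recipe):

#include <stdint.h>

static inline uint64_t tsc_read(void) {
  unsigned lo, hi;
  __asm__ volatile ("cpuid\n\t"         /* serialize */
                    "rdtsc\n\t"         /* counter -> edx:eax */
                    "mov %%edx, %0\n\t"
                    "mov %%eax, %1\n\t"
                    : "=r"(hi), "=r"(lo) :: "%rax", "%rbx", "%rcx", "%rdx");
  return ((uint64_t)hi << 32) | lo;
}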
@@ -142,11 +142,11 @@ static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
 #endif
 
 //---------------------------------------- bench ----------------------------------------------------------------------
-// for each a function call is repeated until exceding tm_tx seconds.
+// for each a function call is repeated until exceeding tm_tx seconds.
 // A run duration is always tm_tx seconds
 // The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision)
 
-// sleep after each 8 runs to avoid cpu trottling.
+// sleep after each 8 runs to avoid cpu throttling.
 #define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0)
 
 // benchmark loop
@@ -160,11 +160,11 @@ static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
 /*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\
 /*other runs: break the loop only after 'tm_rm' repeats */ \
 _tm_t = tmdiff(_tm_t0, tmtime());\
-/*set min time, recalculte repeats tm_rm based on tm_tx, recalculte number of runs based on tm_TX*/\
+/*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\
 if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("[%d,%d] ", tm_rm, _tm_Rn);*/ } tm_tm = _tm_t; _tm_c++; }\
 else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\
 if(tm_verbose) { printf("%8.2f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\
-if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu trottling*/\
+if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\
 }\
 }
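Note: the comments describe the measurement strategy: repeat the call until tm_tx seconds elapse, run that several times (sleeping periodically to avoid throttling), and keep the fastest run. A stripped-down sketch of that pattern (illustrative; the real macro also recalculates repeats and forces sleeps on divergence):

#include <float.h>
#include <time.h>

static double bench_min(void (*fn)(void), int runs, long reps) {
  double best = DBL_MAX;
  for(int r = 0; r < runs; r++) {
    clock_t t0 = clock();
    for(long i = 0; i < reps; i++) fn();
    double t = (double)(clock() - t0) / CLOCKS_PER_SEC;
    if(t < best) best = t;                /* keep the fastest run */
  }
  return best / reps;                     /* seconds per call */
}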
@@ -596,7 +596,7 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 
 for(ip = in,op = out; ip != in+v; ip += ESIZE*32, op += ESIZE*32/STRIDE) {
 unsigned char *p = op; PREFETCH(ip+ESIZE*192,0);
-__m256i iv[ESIZE],ov[ESIZE];
+__m256i iv[ESIZE],ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if ESIZE == 2
 ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip ), sv0);
 ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);
@@ -724,7 +724,7 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 #endif
 
 for(op = out,ip = in; op != out+v; ip += ESIZE*32/STRIDE, op += ESIZE*32) { unsigned char *p = ip; PREFETCH(ip+ESIZE*192,0);
-__m256i iv[ESIZE], ov[ESIZE];
+__m256i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 
 #if STRIDE > ESIZE
 NBL0(0,1); NBL( 2,3); NB(0,1,iv[0]); NB(2,3,iv[1]);
@@ -842,7 +842,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 #endif
 
 for(ip = in, op = out; ip != in+v; ip+=ESIZE*16,op += ESIZE*16/STRIDE) { unsigned char *p = op; PREFETCH(ip+(ESIZE*16)*ESIZE,0);
-__m128i iv[ESIZE],ov[ESIZE];
+__m128i iv[ESIZE],ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if defined(__SSSE3__) || defined(__ARM_NEON)
 #if ESIZE == 2
 #ifdef __ARM_NEON
@@ -1100,7 +1100,7 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 
 for(op = out,ip = in; op != out+v; op+=ESIZE*16,ip += ESIZE*16/STRIDE) {
 unsigned char *p=ip; PREFETCH(ip+(ESIZE*16/STRIDE)*ESIZE,0);
-__m128i iv[ESIZE], ov[ESIZE];
+__m128i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 
 #if STRIDE > ESIZE //------------ Nibble transpose -------------------
 ov[0] = _mm_loadl_epi64((__m128i *) p );
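Note: all four ov[...] hunks apply the same fix: in the ESIZE == 2 path the code evidently fills four vectors, so an array of only ESIZE elements overruns, which VS 2022 diagnoses. A hypothetical stand-alone illustration (AVX2; the shuffle/unpack steps are stand-ins for the real transpose):

#include <immintrin.h>
enum { ESIZE = 2 };

void demo(const unsigned char *ip, __m256i sv0, __m256i sv1) {
  __m256i ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];  /* 4 slots when ESIZE == 2 */
  ov[0] = _mm256_shuffle_epi8(_mm256_loadu_si256((const __m256i *) ip    ), sv0);
  ov[1] = _mm256_shuffle_epi8(_mm256_loadu_si256((const __m256i *)(ip+32)), sv1);
  ov[2] = _mm256_unpacklo_epi64(ov[0], ov[1]); /* writes past ov[1]: needs the larger array */
  ov[3] = _mm256_unpackhi_epi64(ov[0], ov[1]);
  (void)ov;
}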
5 trlec.c
@@ -79,7 +79,7 @@ static unsigned cntcalc32(const unsigned char *__restrict in, unsigned inlen, cn
 #define PUTE(_op_, _e_) do { PUTC(_op_, _e_); vlput32(_op_, 0); } while(0)
 
 #define SZ64 if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
-#define SZ32 if((z = (ctou32(ip) ^ ctou32(ip+1)))) break; ip += 4;
+#define SZ32 if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
 
 #define SRLEPUT8(_pp_, _ip_, _e_, _op_) do {\
 unsigned _r = (_ip_ - _pp_)+1;\
@@ -252,7 +252,8 @@ unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char
 TRLEPUT(pp, ip, m, rmap, op);
 pp = ++ip;
 }
-if(ip < ie) PUTC(op, *ip++); AS(ip == ie, "Fatal ip>ie=%d ", (int)(ip-ie));
+if(ip < ie) PUTC(op, *ip++);
+AS(ip == ie, "Fatal ip>ie=%d ", (int)(ip-ie));
 
 if(op - out < inlen)
 return op - out; // RETURN length = rle
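Note: splitting AS(...) onto its own line is clarity rather than a behavior change: PUTC is presumably a do { ... } while(0)-style macro (PUTE above wraps it the same way), so in the one-line form the assertion already sat outside the if and ran unconditionally. The usual macro shape (hypothetical definition):

#define PUTC(_op_, _c_) do { *(_op_)++ = (_c_); } while(0)  /* behaves as one statement in if/else */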
24 trled.c
@@ -355,14 +355,22 @@ unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned c
 op += r;
 ip += (r+1)*sizeof(uint_t); PREFETCH(ip+512, 0);
 #else
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c;
+if(((c = ctout(ip)) == e)) goto a;
+ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
 continue;
 a: ip += sizeof(uint_t); PREFETCH(ip +512, 0);
 #endif
3 vp4c.c
@@ -359,7 +359,8 @@ unsigned char *TEMPLATE2(_P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsig
 #if HYBRID > 0 && USIZE >= 16
 if(bx <= USIZE) {
 #endif
-for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i]; out += PAD8(n); //if(eqx == xn && bx) { out[-1] |=0x80; TEMPLATE2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else
+for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i]; //if(eqx == xn && bx) { out[-1] |=0x80; TEMPLATE2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else
+out += PAD8(n);
 out = TEMPLATE2(bitpack, USIZE)(inx, xn, out, bx); //if(eq == n && b) { out[-1]|= 0x80; TEMPLATE2(ctou, USIZE)(out)=a; out += (b+7)/8; } else
 out = TEMPLATE2(BITPACK, USIZE)(_in, n, out, b);
 #if HYBRID > 0 && USIZE >= 16