diff --git a/bitpack.c b/bitpack.c
index 74542bb..0aa9641 100644
--- a/bitpack.c
+++ b/bitpack.c
@@ -179,7 +179,8 @@ typedef unsigned char *(*BITPACK_D64)(uint64_t *__restrict out, unsigned n, cons
   for(ip = in, in += n; ip < in;) { \
     TEMPLATE3(uint, _usize_, _t) o,x;\
     unsigned iplen = in - ip,b; \
-    if(iplen > _csize_) iplen = _csize_; PREFETCH(ip+512,0);\
+    if(iplen > _csize_) iplen = _csize_;\
+    PREFETCH(ip+512,0);\
     o = TEMPLATE2(bit,_usize_)(ip, iplen, &x); b = TEMPLATE2(bsr,_usize_)(o);\
     *op++ = b; op = TEMPLATE2(bitpacka, _usize_)[b](ip, iplen, op);\
     ip += iplen;\
diff --git a/bitunpack.c b/bitunpack.c
index 9eb2af7..cf3e184 100644
--- a/bitunpack.c
+++ b/bitunpack.c
@@ -156,7 +156,7 @@ size_t bitnfunpack64( unsigned char *__restrict in, size_t n, uint64_t *__restri
 #define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
 #define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
 #else
-#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
+#if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86)) && !defined(__clang__)
 static inline __m128i _mm_cvtsi64_si128(__int64 a) { return _mm_loadl_epi64((__m128i*)&a); }
 #endif
 static ALIGNED(unsigned char, permv[256][8], 32) = {
diff --git a/bitutil.c b/bitutil.c
index 6486bae..5edca0a 100644
--- a/bitutil.c
+++ b/bitutil.c
@@ -122,7 +122,7 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
     vo1 = _mm_or_si128(vo1, v1);
     vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
     vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-  } start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
+  } start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
 #else
@@ -183,7 +183,7 @@ uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 //----- Undelta: In-place prefix sum (min. Delta = 0) -------------------
 #define DD(i) _ip[i] = (start += _ip[i] + _md);
-#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const _md = _md_;\
+#define BITDD(_t_, _in_, _n_, _md_) { _t_ *_ip; const int _md = _md_;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { DD(0); DD(1); DD(2); DD(3); }\
   for(;_ip != _in_+_n_; _ip++) DD(0);\
 }
@@ -240,7 +240,7 @@ uint32_t bitzzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
 uint64_t bitzzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,*op = out,u,d,startd=0; BITZDE(uint64_t, in, n, mindelta, 64,o |= u;*op++ = u); return o;}
 #define ZDD(i) u = _ip[i]; d = u - start; _ip[i] = zigzagdec64(u)+(int64_t)startd+_md; startd = d; start = u
-#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const _md = _md_;\
+#define BITZDD(_t_, _in_, _n_, _md_) { _t_ *_ip, startd=0,d,u; const int _md = _md_;\
   for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip += 4) { ZDD(0); ZDD(1); ZDD(2); ZDD(3); }\
   for(;_ip != _in_+_n_; _ip++) ZDD(0);\
 }
@@ -443,7 +443,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
     vo1 = _mm_or_si128(vo1, v1);
     vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
     vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-  } start = _mm_cvtsi128_si16(_mm_srli_si128(vs,14));
+  } start = mm_cvtsi128_si16(_mm_srli_si128(vs,14));
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi16(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi16(vx0);
 #else
@@ -488,7 +488,7 @@ uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
     vo1 = _mm_or_si128(vo1, v1);
     vx0 = _mm_or_si128(vx0, _mm_xor_si128(v0, vb0));
     vx1 = _mm_or_si128(vx1, _mm_xor_si128(v1, vb0));
-  } start = _mm_cvtsi128_si16(_mm_srli_si128(vs,12));
+  } start = mm_cvtsi128_si16(_mm_srli_si128(vs,12));
   vo0 = _mm_or_si128(vo0, vo1); o = mm_hor_epi32(vo0);
   vx0 = _mm_or_si128(vx0, vx1); x = mm_hor_epi32(vx0);
 #else
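
The three diffs above are cleanups plus one portability guard: bitpack.c moves PREFETCH off the `if` line so it no longer reads as conditional; bitunpack.c stops defining the 32-bit `_mm_cvtsi64_si128` shim under clang, presumably because clang already provides that intrinsic (or rejects its redefinition) on i386; bitutil.c adds the `int` that `const _md = _md_;` was missing — implicit int has been invalid since C99 — and switches call sites to the renamed `mm_cvtsi128_si16` helper introduced in bitutil.h below. For reference, a minimal standalone sketch of what the BITDD "undelta" macro computes for 32-bit elements (hypothetical `undelta32`, not part of the library):

    #include <stdint.h>

    /* In-place prefix sum with a constant minimum delta added back to
       every element, i.e. the scalar core of BITDD/DD(i): */
    static void undelta32(uint32_t *in, unsigned n, uint32_t start, int md) {
      for(unsigned i = 0; i < n; i++)
        in[i] = (start += in[i] + md);
    }
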
diff --git a/bitutil.h b/bitutil.h
index 7428060..e311b41 100644
--- a/bitutil.h
+++ b/bitutil.h
@@ -187,9 +187,9 @@ static ALWAYS_INLINE uint64_t mm_hor_epi64( __m128i v) { v = _mm_or_si128( v, _m
 #define ADDI16x8(_v_, _sv_, _vi_) _sv_ = _mm_add_epi16(_mm_add_epi16(_sv_, _vi_),_v_)
 #define ADDI32x4(_v_, _sv_, _vi_) _sv_ = _mm_add_epi32(_mm_add_epi32(_sv_, _vi_),_v_)
 
-//---------------- Convert _mm_cvtsi128_siXX -------------------------------------------
-static ALWAYS_INLINE uint8_t  _mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
-static ALWAYS_INLINE uint16_t _mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
+//---------------- Convert mm_cvtsi128_siXX -------------------------------------------
+static ALWAYS_INLINE uint8_t  mm_cvtsi128_si8 (__m128i v) { return (uint8_t )_mm_cvtsi128_si32(v); }
+static ALWAYS_INLINE uint16_t mm_cvtsi128_si16(__m128i v) { return (uint16_t)_mm_cvtsi128_si32(v); }
 #endif
 
 //--------- memset -----------------------------------------
diff --git a/fp.c b/fp.c
index e181d58..36f4852 100644
--- a/fp.c
+++ b/fp.c
@@ -155,7 +155,7 @@ size_t TEMPLATE2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t
       sv = TEMPLATE2(mm_xore_epi, USIZE)(v0,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *) p,                sv); sv = v0;
       sv = TEMPLATE2(mm_xore_epi, USIZE)(v1,sv); bv = _mm_or_si128(bv, sv); _mm_storeu_si128((__m128i *)(p+16/(USIZE/8)), sv); sv = v1;
     }
-    start = (uint_t)TEMPLATE2(_mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
+    start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
     b = TEMPLATE2(mm_hor_epi, USIZE)(bv);
 #else
     for(p = _p; p != &_p[VSIZE]; p+=4,ip+=4) { FE(0,USIZE); FE(1,USIZE); FE(2,USIZE); FE(3,USIZE); }
@@ -240,7 +240,7 @@ size_t TEMPLATE2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t
       _mm_storeu_si128((__m128i *) op,                v0);
       _mm_storeu_si128((__m128i *)(op+16/(USIZE/8)), sv);
     }
-    start = (uint_t)TEMPLATE2(_mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
+    start = (uint_t)TEMPLATE2(mm_cvtsi128_si,USIZE)(_mm_srli_si128(sv,16-USIZE/8));
 #else
     for(p = _p; p != &_p[VSIZE]; p+=4,op+=4) { FD(0,USIZE); FD(1,USIZE); FD(2,USIZE); FD(3,USIZE); }
 #endif
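
Dropping the leading underscore matters beyond style: `_mm_`-prefixed identifiers belong to the compiler's intrinsics namespace, so a project-local `_mm_cvtsi128_si16` risks colliding with a header-provided definition. fp.c's `TEMPLATE2(mm_cvtsi128_si,USIZE)` call sites are updated to match (presumably `mm_cvtsi128_si32/64` aliases exist elsewhere so the token pasting still resolves for wider USIZE). The helper itself is the usual last-lane carry extraction: shift the top element down to lane 0, then move it to a scalar. A minimal sketch for 16-bit lanes, where the last of 8 lanes sits at byte offset 14:

    #include <stdint.h>
    #include <emmintrin.h>

    /* Extract the highest 16-bit lane of v: bytes 14..15 -> lane 0,
       then a scalar move; exactly what mm_cvtsi128_si16(_mm_srli_si128(v,14))
       does above. */
    static inline uint16_t last_lane_u16(__m128i v) {
      return (uint16_t)_mm_cvtsi128_si32(_mm_srli_si128(v, 14));
    }
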
diff --git a/icapp.c b/icapp.c
index 0509f18..75bfe94 100644
--- a/icapp.c
+++ b/icapp.c
@@ -1141,7 +1141,7 @@ unsigned bench8(unsigned char *in, unsigned n, unsigned char *out, unsigned char
       default: goto end;
     }
     if(l) {
-      char s[65]; printf("%-35 ",  bestr(id, 8, s, codstr(codid), codlev));
+      char s[65]; printf("%-35s ", bestr(id, 8, s, codstr(codid), codlev));
       if(cpy) rc = memcheck(in,m*(USIZE),cpy);
       if(!rc) printf("\t%s\n", inname?inname:"");
diff --git a/time_.h b/time_.h
index 4f32b26..d6a3233 100644
--- a/time_.h
+++ b/time_.h
@@ -51,7 +51,7 @@ typedef struct timespec tm_t;
 #ifdef __corei7__
 #define RDTSC_INI(_c_) do { unsigned _cl, _ch; \
-  __asm volatile ("couid\n\t" \
+  __asm volatile ("cpuid\n\t" \
                   "rdtsc\n\t" \
                   "mov %%edx, %0\n" \
                   "mov %%eax, %1\n": "=r" (_ch), "=r" (_cl):: \
@@ -142,11 +142,11 @@ static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
 #endif
 
 //---------------------------------------- bench ----------------------------------------------------------------------
-// for each a function call is repeated until exceding tm_tx seconds.
+// for each a function call is repeated until exceeding tm_tx seconds.
 // A run duration is always tm_tx seconds
 // The number of runs can be set with the program options -I and -J (specify -I15 -J15 for more precision)
-// sleep after each 8 runs to avoid cpu trottling.
+// sleep after each 8 runs to avoid cpu throttling.
 #define TMSLEEP do { tm_T = tmtime(); if(tmiszero(tm_0)) tm_0 = tm_T; else if(tmdiff(tm_0, tm_T) > tm_TX) { if(tm_verbose) { printf("S \b\b");fflush(stdout); } sleep(tm_slp); tm_0=tmtime();} } while(0)
 
 // benchmark loop
@@ -160,11 +160,11 @@ static int tmiszero(tm_t t) { return !(t.tv_sec|t.tv_nsec); }
   /*1st run: break the loop after tm_tx=1 sec, calculate a new repeats 'tm_rm' to avoid calling time() after each function call*/\
   /*other runs: break the loop only after 'tm_rm' repeats */ \
   _tm_t = tmdiff(_tm_t0, tmtime());\
-  /*set min time, recalculte repeats tm_rm based on tm_tx, recalculte number of runs based on tm_TX*/\
+  /*set min time, recalculate repeats tm_rm based on tm_tx, recalculate number of runs based on tm_TX*/\
   if(_tm_t < tm_tm) { if(tm_tm == DBL_MAX) { tm_rm = _tm_r; _tm_Rn = tm_TX/_tm_t; _tm_Rn = _tm_Rn<_tm_Rx?_tm_Rn:_tm_Rx; /*printf("[%d,%d] ", tm_rm, _tm_Rn);*/ } tm_tm = _tm_t; _tm_c++; }\
   else if(_tm_t > tm_tm*1.15) TMSLEEP;/*force sleep at 15% divergence*/\
   if(tm_verbose) { printf("%8.2f %2d_%.2d\b\b\b\b\b\b\b\b\b\b\b\b\b\b",TMBS(_len_, tm_tm/tm_rm),_tm_R+1,_tm_c),fflush(stdout); }\
-  if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu trottling*/\
+  if((_tm_R & 7)==7) sleep(tm_slp); /*pause 20 secs after each 8 runs to avoid cpu throttling*/\
  }\
 }
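
In icapp.c, `"%-35 "` has a field width but no conversion specifier, which is undefined behavior in printf, so the benchmark name from `bestr(...)` was never printed; `"%-35s"` is what the argument needs. In time_.h, `couid` is not an instruction and would fail to assemble on any `__corei7__` build; `cpuid` is there precisely to serialize the pipeline before `rdtsc`. A standalone sketch of that idiom, assuming GCC/Clang inline asm on x86-64 (hypothetical `rdtsc_serialized`, simplified from the RDTSC_INI macro):

    #include <stdint.h>

    /* cpuid is a serializing instruction: it keeps rdtsc from being
       reordered before the code being timed. The clobbers mirror
       cpuid's outputs (eax, ebx, ecx, edx). */
    static inline uint64_t rdtsc_serialized(void) {
      unsigned lo, hi;
      __asm volatile("xorl %%eax, %%eax\n\t"   /* select cpuid leaf 0 */
                     "cpuid\n\t"               /* serialize */
                     "rdtsc\n\t"               /* edx:eax = time-stamp counter */
                     "movl %%edx, %0\n\t"
                     "movl %%eax, %1\n\t"
                     : "=r"(hi), "=r"(lo)
                     :: "%rax", "%rbx", "%rcx", "%rdx");
      return ((uint64_t)hi << 32) | lo;
    }
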
diff --git a/transpose.c b/transpose.c
index 2db63cc..5e50c80 100644
--- a/transpose.c
+++ b/transpose.c
@@ -596,7 +596,7 @@ void TEMPLATE2(TPENC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
   for(ip = in,op = out; ip != in+v; ip += ESIZE*32, op += ESIZE*32/STRIDE) { unsigned char *p = op;
     PREFETCH(ip+ESIZE*192,0);
-    __m256i iv[ESIZE],ov[ESIZE];
+    __m256i iv[ESIZE],ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if ESIZE == 2
     ov[0] = _mm256_shuffle_epi8(LD256((__m256i *) ip    ), sv0);
     ov[1] = _mm256_shuffle_epi8(LD256((__m256i *)(ip+32)), sv1);
@@ -724,7 +724,7 @@ void TEMPLATE2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 #endif
   for(op = out,ip = in; op != out+v; ip += ESIZE*32/STRIDE, op += ESIZE*32) { unsigned char *p = ip;
     PREFETCH(ip+ESIZE*192,0);
-    __m256i iv[ESIZE], ov[ESIZE];
+    __m256i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if STRIDE > ESIZE
     NBL0(0,1); NBL( 2,3); NB(0,1,iv[0]); NB(2,3,iv[1]);
@@ -842,7 +842,7 @@ void TEMPLATE2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
 #endif
   for(ip = in, op = out; ip != in+v; ip+=ESIZE*16,op += ESIZE*16/STRIDE) { unsigned char *p = op;
     PREFETCH(ip+(ESIZE*16)*ESIZE,0);
-    __m128i iv[ESIZE],ov[ESIZE];
+    __m128i iv[ESIZE],ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if defined(__SSSE3__) || defined(__ARM_NEON)
 #if ESIZE == 2
 #ifdef __ARM_NEON
@@ -1100,7 +1100,7 @@ void TEMPLATE2(TPDEC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *o
   for(op = out,ip = in; op != out+v; op+=ESIZE*16,ip += ESIZE*16/STRIDE) { unsigned char *p=ip;
     PREFETCH(ip+(ESIZE*16/STRIDE)*ESIZE,0);
-    __m128i iv[ESIZE], ov[ESIZE];
+    __m128i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
 #if STRIDE > ESIZE
   //------------ Nibble transpose -------------------
     ov[0] = _mm_loadl_epi64((__m128i *) p         );
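
All four transpose.c hunks apply the same fix: the ESIZE == 2 code paths use more `ov[]` temporaries than the old `ov[ESIZE]` declaration provided, overflowing the on-stack scratch array; the ternary sizes it to 4 slots for that case and leaves the other element sizes unchanged. A hypothetical reduction of the bug class (the unpack ops are illustrative, not the library's actual shuffle sequence):

    #include <immintrin.h>

    #define ESIZE 2
    /* Sketch of the corrected sizing: the 2-byte path below needs
       four __m256i slots, so ov[ESIZE] (2 slots) was too small. */
    void tp2_sketch(const unsigned char *ip, unsigned char *op) {
      __m256i ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];     /* 4 slots when ESIZE == 2 */
      ov[0] = _mm256_loadu_si256((const __m256i *) ip);
      ov[1] = _mm256_loadu_si256((const __m256i *)(ip + 32));
      ov[2] = _mm256_unpacklo_epi64(ov[0], ov[1]);    /* writes like these */
      ov[3] = _mm256_unpackhi_epi64(ov[0], ov[1]);    /* overflowed ov[ESIZE] */
      _mm256_storeu_si256((__m256i *) op,       ov[2]);
      _mm256_storeu_si256((__m256i *)(op + 32), ov[3]);
    }
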
diff --git a/trlec.c b/trlec.c
index 0ffdc05..9fbd1e3 100644
--- a/trlec.c
+++ b/trlec.c
@@ -79,7 +79,7 @@ static unsigned cntcalc32(const unsigned char *__restrict in, unsigned inlen, cn
 #define PUTE(_op_, _e_) do { PUTC(_op_, _e_); vlput32(_op_, 0); } while(0)
 
 #define SZ64 if((z = (ctou64(ip) ^ ctou64(ip+1)))) goto a; ip += 8;
-#define SZ32 if((z = (ctou32(ip) ^ ctou32(ip+1)))) break; ip += 4;
+#define SZ32 if((z = (ctou32(ip) ^ ctou32(ip+1)))) goto a; ip += 4;
 
 #define SRLEPUT8(_pp_, _ip_, _e_, _op_) do {\
   unsigned _r = (_ip_ - _pp_)+1;\
@@ -252,7 +252,8 @@ unsigned trlec(const unsigned char *__restrict in, unsigned inlen, unsigned char
       TRLEPUT(pp, ip, m, rmap, op);
       pp = ++ip;
     }
-  if(ip < ie) PUTC(op, *ip++); AS(ip == ie, "Fatal ip>ie=%d ", (int)(ip-ie));
+  if(ip < ie) PUTC(op, *ip++);
+  AS(ip == ie, "Fatal ip>ie=%d ", (int)(ip-ie));
   if(op - out < inlen)
     return op - out;                           // RETURN length = rle
diff --git a/trled.c b/trled.c
index f886066..d156834 100644
--- a/trled.c
+++ b/trled.c
@@ -355,14 +355,22 @@ unsigned TEMPLATE2(_srled, USIZE)(const unsigned char *__restrict in, unsigned c
       op += r; ip += (r+1)*sizeof(uint_t);
       PREFETCH(ip+512, 0);
 #else
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c;
-      if(((c = ctout(ip)) == e)) goto a; ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c;
+      if(((c = ctout(ip)) == e)) goto a;
+      ip += sizeof(uint_t); *op++ = c; PREFETCH(ip +512, 0);
       continue;
       a: ip += sizeof(uint_t); PREFETCH(ip +512, 0);
 #endif
diff --git a/vp4c.c b/vp4c.c
index f9237f2..4a0b799 100644
--- a/vp4c.c
+++ b/vp4c.c
@@ -359,7 +359,8 @@ unsigned char *TEMPLATE2(_P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsig
 #if HYBRID > 0 && USIZE >= 16
     if(bx <= USIZE) {
 #endif
-      for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i]; out += PAD8(n); //if(eqx == xn && bx) { out[-1] |=0x80; TEMPLATE2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else
+      for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i];                 //if(eqx == xn && bx) { out[-1] |=0x80; TEMPLATE2(ctou, USIZE)(out)=ax; out += (bx+7)/8; } else
+      out += PAD8(n);
       out = TEMPLATE2(bitpack, USIZE)(inx, xn, out, bx);                        //if(eq == n && b) { out[-1]|= 0x80; TEMPLATE2(ctou, USIZE)(out)=a; out += (b+7)/8; } else
       out = TEMPLATE2(BITPACK, USIZE)(_in, n, out, b);
 #if HYBRID > 0 && USIZE >= 16
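
The one substantive fix above is trlec.c's SZ32: a `break` would leave whatever loop the macro is expanded into instead of jumping to the `a:` handler that SZ64 already targets, so the 32-bit zero-run scan diverged from the 64-bit one; `goto a` makes both macros converge on the same label. The trlec.c, trled.c, and vp4c.c reflows split `if(cond) stmt; more;` one-liners so the unconditional tail stops masquerading as part of the `if` — the layout gcc's -Wmisleading-indentation warns about. In miniature, with hypothetical names:

    #include <stdint.h>

    /* Only the goto is guarded; the copy always runs. Putting the
       tail on its own line makes that visible. */
    static unsigned srled_sketch(const uint8_t *ip, const uint8_t *ie,
                                 uint8_t *out, uint8_t e) {
      uint8_t *op = out, c;
      while(ip < ie) {
        if((c = *ip) == e) goto a;  /* escape byte: a run header follows */
        ip++; *op++ = c;            /* unconditional: copy one literal  */
        continue;
      a: ip++;                      /* the real decoder expands the run here */
      }
      return (unsigned)(op - out);
    }
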
diff --git a/vsimple.c b/vsimple.c
index 0eccc6d..ec9ad65 100644
--- a/vsimple.c
+++ b/vsimple.c
@@ -449,7 +449,8 @@ unsigned char *TEMPLATE2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, u
       unsigned b = ((*ip++) >> 5)+1;
       *op = *(unsigned long long *)ip;
       if(unlikely(b!=8))
-        *op &= (1ull<<(b*8))-1; op++; ip += b;
+        *op &= (1ull<<(b*8))-1;
+      op++; ip += b;
       break;
     }
 #endif
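
vsimple.c gets the same one-liner split: `op++; ip += b;` was already outside the `if`, but the old layout suggested otherwise. Worth noting why the `b != 8` guard exists at all: the mask `(1ull<<(b*8))-1` keeps the low `b` bytes of the 8-byte load, but for b == 8 the shift count would be 64, which is undefined behavior on a 64-bit operand in C — the guard is correctness, not just a fast path. A standalone sketch of the decode step (memcpy used for the unaligned load instead of the original pointer cast):

    #include <stdint.h>
    #include <string.h>

    /* Load 8 bytes, keep only the low b (1..8); b == 8 must skip the
       mask because 1ull << 64 is undefined. Little-endian byte order
       assumed, as in the original cast-based load. */
    static uint64_t load_low_bytes(const unsigned char *p, unsigned b) {
      uint64_t v;
      memcpy(&v, p, sizeof v);
      if(b != 8)
        v &= (1ull << (b * 8)) - 1;
      return v;
    }
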