diff --git a/conf.h b/conf.h
index ed540b4..be6face 100644
--- a/conf.h
+++ b/conf.h
@@ -39,11 +39,12 @@ #define popcnt64(_x_) __builtin_popcountll(_x_)
     #if defined(__i386__) || defined(__x86_64__)
-//__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
-// bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
+//x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
+// x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
 static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
 static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
-static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
+static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; }
+static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
 static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
 static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
@@ -63,7 +64,7 @@ static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s);
 #define ctz64(_x_) __builtin_ctzll(_x_)
 #define ctz32(_x_) __builtin_ctz(_x_) // 0:32 ctz32(1<<5)=5
   #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
@@ -131,8 +132,11 @@ static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x
 #define strcasecmp _stricmp
 #define strncasecmp _strnicmp
 #define strtoull _strtoui64
+static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
 #endif
 
+#define __bsr8(_x_) __bsr32(_x_)
+#define __bsr16(_x_) __bsr32(_x_)
 #define bsr8(_x_) bsr32(_x_)
 #define bsr16(_x_) bsr32(_x_)
 #define ctz8(_x_) ctz32(_x_)
@@ -214,10 +218,30 @@ struct _PACKED doubleu { double d; };
 #endif
 
 //---------------------misc ---------------------------------------------------
-#define BZHI64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
-#define BZHI32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
-#define BZHI16(_u_, _b_) BZHI32(_u_, _b_)
-#define BZHI8(_u_, _b_) BZHI32(_u_, _b_)
+#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // fast, _b_ < 64 only
+#define BZHI32F(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))  // fast, _b_ < 32 only
+#define BZHI64( _u_, _b_) ((_u_) & ((_b_) == 64?0xffffffffffffffffull:(1ull<<(_b_))-1)) // checked; the test folds away when _b_ is a compile-time constant
+#define BZHI32( _u_, _b_) ((_u_) & ((_b_) == 32?       0xffffffffu   :(1u  <<(_b_))-1))
+#define BZHI16( _u_, _b_) BZHI32(_u_, _b_)
+#define BZHI8(  _u_, _b_) BZHI32(_u_, _b_)
+
+  #ifdef __AVX2__
+    #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#include <immintrin.h>
+    #else
+#include <x86intrin.h>
+    #endif
+#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
+
+    #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
+#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // no 64-bit bzhi on 32-bit x86
+    #else
+#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
+    #endif
+  #else
+#define bzhi64(_u_, _b_) BZHI64(_u_, _b_)
+#define bzhi32(_u_, _b_) BZHI32(_u_, _b_)
+  #endif
 
 #define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1))
 #define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1)))
@@ -230,6 +254,9 @@ struct _PACKED doubleu { double d; };
 
 #define CACHE_LINE_SIZE 64
 #define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
+
+#define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_)))
+
 //--- NDEBUG -------
 #include <assert.h>
   #ifdef _MSC_VER
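
Reviewer note: the two value tables added at the top of the patch document two different bsr conventions that differ by one: __bsr32(x) is the index of the highest set bit (undefined for x == 0), while bsr32(x) is the number of significant bits, i.e. __bsr32(x)+1, with bsr32(0) == 0. The standalone harness below is not part of the patch; it uses the portable GCC/clang __builtin_clz builtin instead of the inline asm to reproduce both tables.

#include <stdio.h>

/* Portable stand-ins for the patch's asm versions (assumes gcc/clang builtins). */
static inline int __bsr32_ref(unsigned x) { return 31 - __builtin_clz(x); }         /* x != 0: floor(log2(x)) */
static inline int   bsr32_ref(unsigned x) { return x ? 32 - __builtin_clz(x) : 0; } /* 0 -> 0, else bit width */

int main(void) {
    printf("x,__bsr32: ");
    for (unsigned x = 1; x <= 32; x++) printf("%u:%d,", x, __bsr32_ref(x)); /* 1:0,2:1,3:1,4:2,... */
    printf("\nx,bsr32:   ");
    for (unsigned x = 0; x <= 32; x++) printf("%u:%d,", x,   bsr32_ref(x)); /* 0:0,1:1,2:2,3:2,... */
    printf("\n");
    return 0;
}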
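The F ("fast") variants keep the old unchecked behavior: (1ull<<(_b_))-1 is undefined in C when _b_ equals the type width, so BZHI64F requires _b_ < 64. The new checked BZHI64/BZHI32 special-case the full-width mask, and when _b_ is a compile-time constant the comparison folds away to nothing. A minimal sketch of the difference; the macros are copied from the patch as fixed above, the main() harness is ours.

#include <stdio.h>
#include <stdint.h>

#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))                                    /* _b_ < 64 only */
#define BZHI64( _u_, _b_) ((_u_) & ((_b_) == 64?0xffffffffffffffffull:(1ull<<(_b_))-1)) /* _b_ <= 64 */

int main(void) {
    uint64_t u = 0x123456789abcdef0ull;
    printf("%llx\n", (unsigned long long)BZHI64F(u, 12)); /* ef0: low 12 bits kept */
    printf("%llx\n", (unsigned long long)BZHI64 (u, 12)); /* ef0: identical for _b_ < 64 */
    printf("%llx\n", (unsigned long long)BZHI64 (u, 64)); /* 123456789abcdef0: full value;
                                                             BZHI64F(u, 64) would shift by 64,
                                                             which is undefined behavior */
    return 0;
}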
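Reviewer note on the __AVX2__ branch: the patch uses __AVX2__ as a proxy for BMI2 availability and maps the lowercase bzhi32/bzhi64 to the _bzhi_u32/_bzhi_u64 intrinsics, which compile to a single BZHI instruction that is well defined even when the bit count equals the word size, so no ternary is needed at runtime. Sketch assuming a BMI2-capable toolchain (e.g. gcc -mbmi2).

#include <stdio.h>
#include <x86intrin.h> /* gcc/clang; MSVC uses <immintrin.h>, as in the patch */

int main(void) {
    unsigned u = 0xdeadbeefu;
    printf("%x\n", _bzhi_u32(u, 12)); /* eef: bits at index 12 and above zeroed */
    printf("%x\n", _bzhi_u32(u, 32)); /* deadbeef: count == width is safe in hardware */
    return 0;
}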