diff --git a/conf.h b/conf.h
index ed540b4..be6face 100644
--- a/conf.h
+++ b/conf.h
@@ -39,11 +39,12 @@ #define popcnt64(_x_) __builtin_popcountll(_x_)
     #if defined(__i386__) || defined(__x86_64__)
-//__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
-// bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
+//x,__bsr32: 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5
+// x,bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6,
 static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; }
 static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; }
-static inline int bsr64(uint64_t x) { return x?64 - __builtin_clzll(x):0; }
+static inline int bsr64(uint64_t x ) { return x?64 - __builtin_clzll(x):0; }
+static inline int __bsr64(uint64_t x ) { return 63 - __builtin_clzll(x); }
 static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
 static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; }
@@ -63,7 +64,7 @@ static inline unsigned ror64(unsigned x, int s) { return x >> s | x << (64 - s);
 #define ctz64(_x_) __builtin_ctzll(_x_)
 #define ctz32(_x_) __builtin_ctz(_x_) // 0:32 ctz32(1<<5)=5
   #if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
@@ -131,8 +132,11 @@ static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x
 #define strcasecmp _stricmp
 #define strncasecmp _strnicmp
 #define strtoull _strtoui64
+static inline double round(double num) { return (num > 0.0) ? floor(num + 0.5) : ceil(num - 0.5); }
 #endif
 
+#define __bsr8(_x_) __bsr32(_x_)
+#define __bsr16(_x_) __bsr32(_x_)
 #define bsr8(_x_) bsr32(_x_)
 #define bsr16(_x_) bsr32(_x_)
 #define ctz8(_x_) ctz32(_x_)
@@ -214,10 +218,30 @@ struct _PACKED doubleu { double d; };
 #endif
 
 //---------------------misc ---------------------------------------------------
-#define BZHI64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
-#define BZHI32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
-#define BZHI16(_u_, _b_) BZHI32(_u_, _b_)
-#define BZHI8(_u_, _b_) BZHI32(_u_, _b_)
+#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // fast, _b_ < 64 only
+#define BZHI32F(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))  // fast, _b_ < 32 only
+#define BZHI64( _u_, _b_) ((_u_) & ((_b_) == 64?0xffffffffffffffffull:(1ull<<(_b_))-1)) // checked; the test folds away when _b_ is a compile-time constant
+#define BZHI32( _u_, _b_) ((_u_) & ((_b_) == 32?       0xffffffffu   :(1u  <<(_b_))-1))
+#define BZHI16( _u_, _b_) BZHI32(_u_, _b_)
+#define BZHI8(  _u_, _b_) BZHI32(_u_, _b_)
+
+  #ifdef __AVX2__
+    #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#include <immintrin.h>
+    #else
+#include <x86intrin.h>
+    #endif
+#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_)
+
+    #if !(defined(_M_X64) || defined(__amd64__)) && (defined(__i386__) || defined(_M_IX86))
+#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) // no 64-bit bzhi on 32-bit x86
+    #else
+#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_)
+    #endif
+  #else
+#define bzhi64(_u_, _b_) BZHI64(_u_, _b_)
+#define bzhi32(_u_, _b_) BZHI32(_u_, _b_)
+  #endif
 
 #define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1))
 #define ALIGN_DOWN(__ptr, __a) ((void *)((uintptr_t)(__ptr) & ~(uintptr_t)((__a) - 1)))
@@ -230,6 +254,9 @@ struct _PACKED doubleu { double d; };
 
 #define CACHE_LINE_SIZE 64
 #define PREFETCH_DISTANCE (CACHE_LINE_SIZE*4)
+
+#define CLAMP(_x_, _low_, _high_) (((_x_) > (_high_)) ? (_high_) : (((_x_) < (_low_)) ? (_low_) : (_x_)))
+
 //--- NDEBUG -------
 #include <assert.h>
   #ifdef _MSC_VER
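
Reviewer note: the two value tables added at the top of the patch document two different bsr conventions that differ by one: __bsr32(x) is the index of the highest set bit (undefined for x == 0), while bsr32(x) is the number of significant bits, i.e. __bsr32(x)+1, with bsr32(0) == 0. The standalone harness below is not part of the patch; it uses the portable GCC/clang __builtin_clz builtin instead of the inline asm to reproduce both tables.

#include <stdio.h>

/* Portable stand-ins for the patch's asm versions (assumes gcc/clang builtins). */
static inline int __bsr32_ref(unsigned x) { return 31 - __builtin_clz(x); }         /* x != 0: floor(log2(x)) */
static inline int   bsr32_ref(unsigned x) { return x ? 32 - __builtin_clz(x) : 0; } /* 0 -> 0, else bit width */

int main(void) {
    printf("x,__bsr32: ");
    for (unsigned x = 1; x <= 32; x++) printf("%u:%d,", x, __bsr32_ref(x)); /* 1:0,2:1,3:1,4:2,... */
    printf("\nx,bsr32:   ");
    for (unsigned x = 0; x <= 32; x++) printf("%u:%d,", x,   bsr32_ref(x)); /* 0:0,1:1,2:2,3:2,... */
    printf("\n");
    return 0;
}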
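The F ("fast") variants keep the old unchecked behavior: (1ull<<(_b_))-1 is undefined in C when _b_ equals the type width, so BZHI64F requires _b_ < 64. The new checked BZHI64/BZHI32 special-case the full-width mask, and when _b_ is a compile-time constant the comparison folds away to nothing. A minimal sketch of the difference; the macros are copied from the patch as fixed above, the main() harness is ours.

#include <stdio.h>
#include <stdint.h>

#define BZHI64F(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))                                    /* _b_ < 64 only */
#define BZHI64( _u_, _b_) ((_u_) & ((_b_) == 64?0xffffffffffffffffull:(1ull<<(_b_))-1)) /* _b_ <= 64 */

int main(void) {
    uint64_t u = 0x123456789abcdef0ull;
    printf("%llx\n", (unsigned long long)BZHI64F(u, 12)); /* ef0: low 12 bits kept */
    printf("%llx\n", (unsigned long long)BZHI64 (u, 12)); /* ef0: identical for _b_ < 64 */
    printf("%llx\n", (unsigned long long)BZHI64 (u, 64)); /* 123456789abcdef0: full value;
                                                             BZHI64F(u, 64) would shift by 64,
                                                             which is undefined behavior */
    return 0;
}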
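Reviewer note on the __AVX2__ branch: the patch uses __AVX2__ as a proxy for BMI2 availability and maps the lowercase bzhi32/bzhi64 to the _bzhi_u32/_bzhi_u64 intrinsics, which compile to a single BZHI instruction that is well defined even when the bit count equals the word size, so no ternary is needed at runtime. Sketch assuming a BMI2-capable toolchain (e.g. gcc -mbmi2).

#include <stdio.h>
#include <x86intrin.h> /* gcc/clang; MSVC uses <immintrin.h>, as in the patch */

int main(void) {
    unsigned u = 0xdeadbeefu;
    printf("%x\n", _bzhi_u32(u, 12)); /* eef: bits at index 12 and above zeroed */
    printf("%x\n", _bzhi_u32(u, 32)); /* deadbeef: count == width is safe in hardware */
    return 0;
}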