MSVC / NMAKE build rework for the SIMD code paths.

Summary of what this patch does:
  * Adds thin wrapper translation units (<name>_sse.c, <name>_avx2.c) that
    define SSE2_ON or AVX2_ON and then include the base .c file, so the
    generic NMAKE inference rules can build every ISA variant.
  * makefile.vs: drops the hand-written per-object rules and the DEFS
    variable; all defines go into CFLAGS and the /arch switch into ARCH.
    Also removes a stray "(" from the BLOSC test, which could never be true.
  * conf.h (MSVC section): ALIGNED now takes (type, declarator, alignment);
    popcnt64 falls back to two 32-bit popcounts when _WIN64 is not defined.
    The fallback parenthesizes its argument and casts through uint64_t.
  * fp.c / transpose.c: select the intrinsics header per compiler.
  * vp4d.c: initializes bx at its declaration.

diff --git a/bitpack_avx2.c b/bitpack_avx2.c
new file mode 100644
index 0000000..16c6330
--- /dev/null
+++ b/bitpack_avx2.c
@@ -0,0 +1,2 @@
+#define AVX2_ON
+#include "bitpack.c"
diff --git a/bitpack_sse.c b/bitpack_sse.c
new file mode 100644
index 0000000..0d521db
--- /dev/null
+++ b/bitpack_sse.c
@@ -0,0 +1,2 @@
+#define SSE2_ON
+#include "bitpack.c"
diff --git a/bitunpack.c b/bitunpack.c
index 08cde96..c90e995 100644
--- a/bitunpack.c
+++ b/bitunpack.c
@@ -764,7 +764,7 @@ size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__re
 #define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
 #define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
   #else
-static unsigned char permv[256][8] __attribute__((aligned(32))) = {
+static ALIGNED(unsigned char, permv[256][8], 32) = {
 0,0,0,0,0,0,0,0,
 0,1,1,1,1,1,1,1,
 1,0,1,1,1,1,1,1,
diff --git a/bitunpack_avx2.c b/bitunpack_avx2.c
new file mode 100644
index 0000000..26d1115
--- /dev/null
+++ b/bitunpack_avx2.c
@@ -0,0 +1,2 @@
+#define AVX2_ON
+#include "bitunpack.c"
diff --git a/bitunpack_sse.c b/bitunpack_sse.c
new file mode 100644
index 0000000..fe0f4de
--- /dev/null
+++ b/bitunpack_sse.c
@@ -0,0 +1,2 @@
+#define SSE2_ON
+#include "bitunpack.c"
diff --git a/conf.h b/conf.h
index aec756a..25e5bfb 100644
--- a/conf.h
+++ b/conf.h
@@ -85,7 +85,7 @@ static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap3
 #define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA)
   #endif
 
-#define ALIGNED(x) __declspec(align(x))
+#define ALIGNED(t,v,n) __declspec(align(n)) t v
 #define ALWAYS_INLINE __forceinline
 #define NOINLINE __declspec(noinline)
 #define THREADLOCAL __declspec(thread)
@@ -109,8 +109,12 @@ static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x
 #define bswap32(x) _byteswap_ulong(x)
 #define bswap64(x) _byteswap_uint64(x)
 
-#define popcnt32(x) __popcnt(x)
+#define popcnt32(x)   __popcnt(x)
+#ifdef _WIN64
 #define popcnt64(x) __popcnt64(x)
+#else
+#define popcnt64(x) (popcnt32((unsigned)(x)) + popcnt32((unsigned)((uint64_t)(x) >> 32)))
+#endif
 
 #define sleep(x) Sleep(x/1000)
 #define fseeko _fseeki64
diff --git a/fp.c b/fp.c
index 1c3b5b3..a348d0b 100644
--- a/fp.c
+++ b/fp.c
@@ -40,7 +40,11 @@
 #define bitflush( _bw_,_br_,_op_) ctou64(_op_) = _bw_, _op_ += (_br_+7)>>3, _bw_=_br_=0
 
   #ifdef __AVX2__
+    #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#include <intrin.h>
+    #else
 #include <x86intrin.h>
+    #endif
   #else
 #define _bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
 #define _bzhi_u32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
diff --git a/makefile.vs b/makefile.vs
index 3c04e74..d2f407a 100644
--- a/makefile.vs
+++ b/makefile.vs
@@ -8,8 +8,9 @@ CC = cl /nologo
 LD = link /nologo
 AR = lib /nologo
 
-CFLAGS = /MD /O2 -I.
+CFLAGS = /MD /O2 -I. /W0
 LDFLAGS =
+ARCH =
 
 LIB_LIB = libic.lib
 LIB_DLL = ic.dll
@@ -17,29 +18,29 @@ LIB_IMP = ic.lib
 
 OBJS = bitpack.obj bitunpack.obj vp4c.obj vp4d.obj transpose.obj bitutil.obj fp.obj vsimple.obj vint.obj
 
-!if "$(AVX2)" == "1"
-OBJS = $(OBJS) bitpack_avx2.obj bitunpack_avx2.obj transpose_avx2.obj vp4c_avx2.obj vp4d_avx2.obj
-DEFS = $(DEFS) /D__AVX2__
-!endif
-
 !if "$(NSIMD)" == "1"
-DEFS = $(DEFS) /DNSIMD
+CFLAGS = $(CFLAGS) /DNSIMD
 !else
 OBJS = $(OBJS) transpose_sse.obj bitpack_sse.obj bitunpack_sse.obj vp4c_sse.obj vp4d_sse.obj
-DEFS = $(DEFS) /D__SSE2__ /D__SSSE3__
-CFLAGS = $(CFLAGS) /DUSE_SSE
+CFLAGS = $(CFLAGS) /D__SSE__ /D__SSE2__ /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__ /DUSE_SSE
+ARCH = /arch:SSE2
+!if "$(AVX2)" == "1"
+OBJS = $(OBJS) bitpack_avx2.obj bitunpack_avx2.obj transpose_avx2.obj vp4c_avx2.obj vp4d_avx2.obj
+CFLAGS = $(CFLAGS) /D__AVX2__ /DUSE_AVX2
+ARCH = /arch:AVX2
+!endif
 !endif
 
 !if "$(CODEC1)" == "1"
-DEFS = $(DEFS) /DCODEC1
+CFLAGS = $(CFLAGS) /DCODEC1
 !endif
 
 !IF "$(CODEC2)" == "1"
-DEFS = $(DEFS) /DCODEC2
+CFLAGS = $(CFLAGS) /DCODEC2
 !endif
 
-!IF "($(BLOSC)" == "1"
-DEFS = $(DEFS) /DBLOSC
+!IF "$(BLOSC)" == "1"
+CFLAGS = $(CFLAGS) /DBLOSC
 !endif
 
 DLL_OBJS = $(OBJS:.obj=.dllobj)
@@ -49,60 +50,14 @@ all: $(LIB_LIB) icbench.exe #$(LIB_DLL) $(LIB_IMP)
 
 
 #------------
-vp4c.obj: vp4c.c
-	$(CC) /O2 $(CFLAGS) -c vp4c.c /Fovp4c.obj
-
-vp4c_sse.obj: vp4c.c
-	$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSSE3__ /arch:SSSE3 /D__SSE2__ /arch:SSE2 /c vp4c.c /Fovp4c_sse.obj
-
-vp4c_avx2.obj: vp4c.c
-	$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c vp4c.c /Fovp4c_avx2.obj
-#------------
-vp4d.obj: vp4d.c
-	$(CC) /O2 $(CFLAGS) -c vp4d.c /Fovp4d.obj
-
-vp4d_sse.obj: vp4d.c
-	$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSSE3__ /arch:SSSE3 /D__SSE2__ /arch:SSE2 /c vp4d.c /Fovp4d_sse.obj
-
-vp4d_avx2.obj: vp4d.c
-	$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c vp4d.c /Fovp4d_avx2.obj
-#------------
-bitpack.obj: bitpack.c
-	$(CC) $(CFLAGS) -c bitpack.c /Fobitpack.obj
-
-bitpack_sse.obj: bitpack.c
-	$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSE2__ /arch:SSE2 /c bitpack.c /Fobitpack_sse.obj
-
-bitpack_avx2.obj: bitpack.c
-	$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c bitpack.c /Fobitpack_avx2.obj
-
-#------------
-bitunpack.obj: bitunpack.c
-	$(CC) /O2 $(CFLAGS) -c bitunpack.c /Fobitunpack.obj
-
-bitunpack_sse.obj: bitunpack.c
-	$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSSE3__ /arch:SSSE3 /D__SSE2__ /arch:SSE2 /c bitunpack.c /Fobitunpack_sse.obj
-
-bitunpack_avx2.obj: bitunpack.c
-	$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c bitunpack.c /Fobitunpack_avx2.obj
-
-transpose.obj: transpose.c
-	$(CC) /O2 $(CFLAGS) -c transpose.c /Fotranspose.obj
-
-transpose_sse.obj: transpose.c
-	$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSE2__ /arch:SSE2 /c transpose.c /Fotranspose_sse.obj
-
-transpose_avx2.obj: transpose.c
-	$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c transpose.c /Fotranspose_avx2.obj
-
 .c.obj:
-	$(CC) -c /Fo$@ /O2 $(CFLAGS) /arch:SSSE3 /D__SSSE3__ $(DEFS) $**
+	$(CC) -c /Fo$@ /O2 $(CFLAGS) $(ARCH) $**
 
 .cc.obj:
-	$(CC) -c /Fo$@ /O2 $(CFLAGS) /arch:SSSE3 /D__SSSE3__ $(DEFS) $**
+	$(CC) -c /Fo$@ /O2 $(CFLAGS) $(ARCH) $**
 
 .c.dllobj:
-	$(CC) -c /Fo$@ /O2 $(CFLAGS) $(DEFS) /DLIB_DLL $**
+	$(CC) -c /Fo$@ /O2 $(CFLAGS) $(ARCH) /DLIB_DLL $**
 
 $(LIB_LIB): $(OBJS)
 	$(AR) $(ARFLAGS) -out:$@ $(OBJS)
diff --git a/transpose.c b/transpose.c
index 428494a..c311d05 100644
--- a/transpose.c
+++ b/transpose.c
@@ -120,7 +120,9 @@
 #include "transpose.c"
 
 //--------------------- CPU detection -------------------------------------------
-#if (_MSC_VER >=1300) || defined (__INTEL_COMPILER)
+#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
+#include <intrin.h>
+#elif defined(__INTEL_COMPILER)
 #include <x86intrin.h>
 #endif
 
diff --git a/transpose_avx2.c b/transpose_avx2.c
new file mode 100644
index 0000000..f7948d8
--- /dev/null
+++ b/transpose_avx2.c
@@ -0,0 +1,2 @@
+#define AVX2_ON
+#include "transpose.c"
diff --git a/transpose_sse.c b/transpose_sse.c
new file mode 100644
index 0000000..84c18de
--- /dev/null
+++ b/transpose_sse.c
@@ -0,0 +1,2 @@
+#define SSE2_ON
+#include "transpose.c"
diff --git a/vp4c_avx2.c b/vp4c_avx2.c
new file mode 100644
index 0000000..db5c4ac
--- /dev/null
+++ b/vp4c_avx2.c
@@ -0,0 +1,2 @@
+#define AVX2_ON
+#include "vp4c.c"
diff --git a/vp4c_sse.c b/vp4c_sse.c
new file mode 100644
index 0000000..5bdbdd5
--- /dev/null
+++ b/vp4c_sse.c
@@ -0,0 +1,2 @@
+#define SSE2_ON
+#include "vp4c.c"
diff --git a/vp4d.c b/vp4d.c
index 3de938f..f3cd75e 100644
--- a/vp4d.c
+++ b/vp4d.c
@@ -371,7 +371,7 @@ ALWAYS_INLINE unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict
 }
 
 unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) {
-  unsigned b, bx, i;
+  unsigned b, bx = 0, i;
   if(!n) return in;
   b = *in++;
   if((b & 0xc0) == 0xc0) { // all items are equal
@@ -431,7 +431,7 @@ size_t TEMPLATE2(P4NDEC, USIZE)(unsigned char *__restrict in, size_t n, uint_t *
   --n;
     #endif
   for(op = out; op != out+(n&~(CSIZE-1)); op += CSIZE) {
-    unsigned b = *ip++, bx, i; __builtin_prefetch(ip+512);//ip = TEMPLATE2(P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start));
+    unsigned b = *ip++, bx = 0, i; __builtin_prefetch(ip+512);//ip = TEMPLATE2(P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start));
     if((b & 0xc0) == 0xc0) {
       b &= 0x3f;
 
diff --git a/vp4d_avx2.c b/vp4d_avx2.c
new file mode 100644
index 0000000..b454a42
--- /dev/null
+++ b/vp4d_avx2.c
@@ -0,0 +1,2 @@
+#define AVX2_ON
+#include "vp4d.c"
diff --git a/vp4d_sse.c b/vp4d_sse.c
new file mode 100644
index 0000000..c8cc159
--- /dev/null
+++ b/vp4d_sse.c
@@ -0,0 +1,2 @@
+#define SSE2_ON
+#include "vp4d.c"