2
bitpack_avx2.c
Normal file
2
bitpack_avx2.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define AVX2_ON
|
||||||
|
#include "bitpack.c"
|
2
bitpack_sse.c
Normal file
2
bitpack_sse.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define SSE2_ON
|
||||||
|
#include "bitpack.c"
|
@ -764,7 +764,7 @@ size_t bitnfunpack128v32( unsigned char *__restrict in, size_t n, uint32_t *__re
|
|||||||
#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
|
#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_)
|
||||||
#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
|
#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_)
|
||||||
#else
|
#else
|
||||||
static unsigned char permv[256][8] __attribute__((aligned(32))) = {
|
static ALIGNED(unsigned char, permv[256][8], 32) = {
|
||||||
0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,
|
||||||
0,1,1,1,1,1,1,1,
|
0,1,1,1,1,1,1,1,
|
||||||
1,0,1,1,1,1,1,1,
|
1,0,1,1,1,1,1,1,
|
||||||
|
2
bitunpack_avx2.c
Normal file
2
bitunpack_avx2.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define AVX2_ON
|
||||||
|
#include "bitunpack.c"
|
2
bitunpack_sse.c
Normal file
2
bitunpack_sse.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define SSE2_ON
|
||||||
|
#include "bitunpack.c"
|
6
conf.h
6
conf.h
@ -85,7 +85,7 @@ static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap3
|
|||||||
#define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA)
|
#define __builtin_prefetch(x,a) _mm_prefetch(x, _MM_HINT_NTA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define ALIGNED(x) __declspec(align(x))
|
#define ALIGNED(t,v,n) __declspec(align(n)) t v
|
||||||
#define ALWAYS_INLINE __forceinline
|
#define ALWAYS_INLINE __forceinline
|
||||||
#define NOINLINE __declspec(noinline)
|
#define NOINLINE __declspec(noinline)
|
||||||
#define THREADLOCAL __declspec(thread)
|
#define THREADLOCAL __declspec(thread)
|
||||||
@ -110,7 +110,11 @@ static inline int clz64(uint64_t x) { unsigned long z; _BitScanReverse64(&z, x
|
|||||||
#define bswap64(x) _byteswap_uint64(x)
|
#define bswap64(x) _byteswap_uint64(x)
|
||||||
|
|
||||||
#define popcnt32(x) __popcnt(x)
|
#define popcnt32(x) __popcnt(x)
|
||||||
|
#ifdef _WIN64
|
||||||
#define popcnt64(x) __popcnt64(x)
|
#define popcnt64(x) __popcnt64(x)
|
||||||
|
#else
|
||||||
|
#define popcnt64(x) (popcnt32(x) + popcnt32(x>>32))
|
||||||
|
#endif
|
||||||
|
|
||||||
#define sleep(x) Sleep(x/1000)
|
#define sleep(x) Sleep(x/1000)
|
||||||
#define fseeko _fseeki64
|
#define fseeko _fseeki64
|
||||||
|
4
fp.c
4
fp.c
@ -40,7 +40,11 @@
|
|||||||
#define bitflush( _bw_,_br_,_op_) ctou64(_op_) = _bw_, _op_ += (_br_+7)>>3, _bw_=_br_=0
|
#define bitflush( _bw_,_br_,_op_) ctou64(_op_) = _bw_, _op_ += (_br_+7)>>3, _bw_=_br_=0
|
||||||
|
|
||||||
#ifdef __AVX2__
|
#ifdef __AVX2__
|
||||||
|
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||||
|
#include <intrin.h>
|
||||||
|
#else
|
||||||
#include <x86intrin.h>
|
#include <x86intrin.h>
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
#define _bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
#define _bzhi_u64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1))
|
||||||
#define _bzhi_u32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
|
#define _bzhi_u32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1))
|
||||||
|
77
makefile.vs
77
makefile.vs
@ -8,8 +8,9 @@
|
|||||||
CC = cl /nologo
|
CC = cl /nologo
|
||||||
LD = link /nologo
|
LD = link /nologo
|
||||||
AR = lib /nologo
|
AR = lib /nologo
|
||||||
CFLAGS = /MD /O2 -I.
|
CFLAGS = /MD /O2 -I. /W0
|
||||||
LDFLAGS =
|
LDFLAGS =
|
||||||
|
ARCH =
|
||||||
|
|
||||||
LIB_LIB = libic.lib
|
LIB_LIB = libic.lib
|
||||||
LIB_DLL = ic.dll
|
LIB_DLL = ic.dll
|
||||||
@ -17,29 +18,29 @@ LIB_IMP = ic.lib
|
|||||||
|
|
||||||
OBJS = bitpack.obj bitunpack.obj vp4c.obj vp4d.obj transpose.obj bitutil.obj fp.obj vsimple.obj vint.obj
|
OBJS = bitpack.obj bitunpack.obj vp4c.obj vp4d.obj transpose.obj bitutil.obj fp.obj vsimple.obj vint.obj
|
||||||
|
|
||||||
!if "$(AVX2)" == "1"
|
|
||||||
OBJS = $(OBJS) bitpack_avx2.obj bitunpack_avx2.obj transpose_avx2.obj vp4c_avx2.obj vp4d_avx2.obj
|
|
||||||
DEFS = $(DEFS) /D__AVX2__
|
|
||||||
!endif
|
|
||||||
|
|
||||||
!if "$(NSIMD)" == "1"
|
!if "$(NSIMD)" == "1"
|
||||||
DEFS = $(DEFS) /DNSIMD
|
CFLAGS = $(CFLAGS) /DNSIMD
|
||||||
!else
|
!else
|
||||||
OBJS = $(OBJS) transpose_sse.obj bitpack_sse.obj bitunpack_sse.obj vp4c_sse.obj vp4d_sse.obj
|
OBJS = $(OBJS) transpose_sse.obj bitpack_sse.obj bitunpack_sse.obj vp4c_sse.obj vp4d_sse.obj
|
||||||
DEFS = $(DEFS) /D__SSE2__ /D__SSSE3__
|
CFLAGS = $(CFLAGS) /D__SSE__ /D__SSE2__ /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__ /DUSE_SSE
|
||||||
CFLAGS = $(CFLAGS) /DUSE_SSE
|
ARCH = /arch:SSE2
|
||||||
|
!if "$(AVX2)" == "1"
|
||||||
|
OBJS = $(OBJS) bitpack_avx2.obj bitunpack_avx2.obj transpose_avx2.obj vp4c_avx2.obj vp4d_avx2.obj
|
||||||
|
CFLAGS = $(CFLAGS) /D__AVX2__ /DUSE_AVX2
|
||||||
|
ARCH = /arch:AVX2
|
||||||
|
!endif
|
||||||
!endif
|
!endif
|
||||||
|
|
||||||
!if "$(CODEC1)" == "1"
|
!if "$(CODEC1)" == "1"
|
||||||
DEFS = $(DEFS) /DCODEC1
|
CFLAGS = $(CFLAGS) /DCODEC1
|
||||||
!endif
|
!endif
|
||||||
|
|
||||||
!IF "$(CODEC2)" == "1"
|
!IF "$(CODEC2)" == "1"
|
||||||
DEFS = $(DEFS) /DCODEC2
|
CFLAGS = $(CFLAGS) /DCODEC2
|
||||||
!endif
|
!endif
|
||||||
|
|
||||||
!IF "($(BLOSC)" == "1"
|
!IF "($(BLOSC)" == "1"
|
||||||
DEFS = $(DEFS) /DBLOSC
|
CFLAGS = $(CFLAGS) /DBLOSC
|
||||||
!endif
|
!endif
|
||||||
|
|
||||||
DLL_OBJS = $(OBJS:.obj=.dllobj)
|
DLL_OBJS = $(OBJS:.obj=.dllobj)
|
||||||
@ -49,60 +50,14 @@ all: $(LIB_LIB) icbench.exe
|
|||||||
#$(LIB_DLL) $(LIB_IMP)
|
#$(LIB_DLL) $(LIB_IMP)
|
||||||
|
|
||||||
#------------
|
#------------
|
||||||
vp4c.obj: vp4c.c
|
|
||||||
$(CC) /O2 $(CFLAGS) -c vp4c.c /Fovp4c.obj
|
|
||||||
|
|
||||||
vp4c_sse.obj: vp4c.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSSE3__ /arch:SSSE3 /D__SSE2__ /arch:SSE2 /c vp4c.c /Fovp4c_sse.obj
|
|
||||||
|
|
||||||
vp4c_avx2.obj: vp4c.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c vp4c.c /Fovp4c_avx2.obj
|
|
||||||
#------------
|
|
||||||
vp4d.obj: vp4d.c
|
|
||||||
$(CC) /O2 $(CFLAGS) -c vp4d.c /Fovp4d.obj
|
|
||||||
|
|
||||||
vp4d_sse.obj: vp4d.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSSE3__ /arch:SSSE3 /D__SSE2__ /arch:SSE2 /c vp4d.c /Fovp4d_sse.obj
|
|
||||||
|
|
||||||
vp4d_avx2.obj: vp4d.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c vp4d.c /Fovp4d_avx2.obj
|
|
||||||
#------------
|
|
||||||
bitpack.obj: bitpack.c
|
|
||||||
$(CC) $(CFLAGS) -c bitpack.c /Fobitpack.obj
|
|
||||||
|
|
||||||
bitpack_sse.obj: bitpack.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSE2__ /arch:SSE2 /c bitpack.c /Fobitpack_sse.obj
|
|
||||||
|
|
||||||
bitpack_avx2.obj: bitpack.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c bitpack.c /Fobitpack_avx2.obj
|
|
||||||
|
|
||||||
#------------
|
|
||||||
bitunpack.obj: bitunpack.c
|
|
||||||
$(CC) /O2 $(CFLAGS) -c bitunpack.c /Fobitunpack.obj
|
|
||||||
|
|
||||||
bitunpack_sse.obj: bitunpack.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSSE3__ /arch:SSSE3 /D__SSE2__ /arch:SSE2 /c bitunpack.c /Fobitunpack_sse.obj
|
|
||||||
|
|
||||||
bitunpack_avx2.obj: bitunpack.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c bitunpack.c /Fobitunpack_avx2.obj
|
|
||||||
|
|
||||||
transpose.obj: transpose.c
|
|
||||||
$(CC) /O2 $(CFLAGS) -c transpose.c /Fotranspose.obj
|
|
||||||
|
|
||||||
transpose_sse.obj: transpose.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DSSE2_ON /D__SSE2__ /arch:SSE2 /c transpose.c /Fotranspose_sse.obj
|
|
||||||
|
|
||||||
transpose_avx2.obj: transpose.c
|
|
||||||
$(CC) /O2 $(CFLAGS) /DAVX2_ON /D__AVX2__ /arch:avx2 /c transpose.c /Fotranspose_avx2.obj
|
|
||||||
|
|
||||||
.c.obj:
|
.c.obj:
|
||||||
$(CC) -c /Fo$@ /O2 $(CFLAGS) /arch:SSSE3 /D__SSSE3__ $(DEFS) $**
|
$(CC) -c /Fo$@ /O2 $(CFLAGS) $(ARCH) $**
|
||||||
|
|
||||||
.cc.obj:
|
.cc.obj:
|
||||||
$(CC) -c /Fo$@ /O2 $(CFLAGS) /arch:SSSE3 /D__SSSE3__ $(DEFS) $**
|
$(CC) -c /Fo$@ /O2 $(CFLAGS) $(ARCH) $**
|
||||||
|
|
||||||
.c.dllobj:
|
.c.dllobj:
|
||||||
$(CC) -c /Fo$@ /O2 $(CFLAGS) $(DEFS) /DLIB_DLL $**
|
$(CC) -c /Fo$@ /O2 $(CFLAGS) $(ARCH) /DLIB_DLL $**
|
||||||
|
|
||||||
$(LIB_LIB): $(OBJS)
|
$(LIB_LIB): $(OBJS)
|
||||||
$(AR) $(ARFLAGS) -out:$@ $(OBJS)
|
$(AR) $(ARFLAGS) -out:$@ $(OBJS)
|
||||||
|
@ -120,7 +120,9 @@
|
|||||||
#include "transpose.c"
|
#include "transpose.c"
|
||||||
|
|
||||||
//--------------------- CPU detection -------------------------------------------
|
//--------------------- CPU detection -------------------------------------------
|
||||||
#if (_MSC_VER >=1300) || defined (__INTEL_COMPILER)
|
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
|
||||||
|
#include <intrin.h>
|
||||||
|
#elif defined(__INTEL_COMPILER)
|
||||||
#include <x86intrin.h>
|
#include <x86intrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
2
transpose_avx2.c
Normal file
2
transpose_avx2.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define AVX2_ON
|
||||||
|
#include "transpose.c"
|
2
transpose_sse.c
Normal file
2
transpose_sse.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define SSE2_ON
|
||||||
|
#include "transpose.c"
|
2
vp4c_avx2.c
Normal file
2
vp4c_avx2.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define AVX2_ON
|
||||||
|
#include "vp4c.c"
|
2
vp4c_sse.c
Normal file
2
vp4c_sse.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define SSE2_ON
|
||||||
|
#include "vp4c.c"
|
4
vp4d.c
4
vp4d.c
@ -371,7 +371,7 @@ ALWAYS_INLINE unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict
|
|||||||
}
|
}
|
||||||
|
|
||||||
unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) {
|
unsigned char *TEMPLATE2(P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out P4DELTA(uint_t start) ) {
|
||||||
unsigned b, bx, i;
|
unsigned b, bx = 0, i;
|
||||||
if(!n) return in;
|
if(!n) return in;
|
||||||
b = *in++;
|
b = *in++;
|
||||||
if((b & 0xc0) == 0xc0) { // all items are equal
|
if((b & 0xc0) == 0xc0) { // all items are equal
|
||||||
@ -431,7 +431,7 @@ size_t TEMPLATE2(P4NDEC, USIZE)(unsigned char *__restrict in, size_t n, uint_t *
|
|||||||
--n;
|
--n;
|
||||||
#endif
|
#endif
|
||||||
for(op = out; op != out+(n&~(CSIZE-1)); op += CSIZE) {
|
for(op = out; op != out+(n&~(CSIZE-1)); op += CSIZE) {
|
||||||
unsigned b = *ip++, bx, i; __builtin_prefetch(ip+512);//ip = TEMPLATE2(P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start));
|
unsigned b = *ip++, bx = 0, i; __builtin_prefetch(ip+512);//ip = TEMPLATE2(P4DEC, USIZE)(ip, CSIZE, op P4DELTA(start));
|
||||||
|
|
||||||
if((b & 0xc0) == 0xc0) {
|
if((b & 0xc0) == 0xc0) {
|
||||||
b &= 0x3f;
|
b &= 0x3f;
|
||||||
|
2
vp4d_avx2.c
Normal file
2
vp4d_avx2.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define AVX2_ON
|
||||||
|
#include "vp4d.c"
|
2
vp4d_sse.c
Normal file
2
vp4d_sse.c
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
#define SSE2_ON
|
||||||
|
#include "vp4d.c"
|
Reference in New Issue
Block a user