diff --git a/TurboPFor-Integer-Compression/lib/transpose_.c b/TurboPFor-Integer-Compression/lib/transpose_.c
new file mode 100644
index 0000000..cd1a1b0
--- /dev/null
+++ b/TurboPFor-Integer-Compression/lib/transpose_.c
@@ -0,0 +1,472 @@
+#include <stdint.h>                                   // uint8_t, uint64_t
+#include "include_/conf.h"
+#include "include_/transpose.h"
+
+static unsigned _cpuisa;
+//--------------------- CPU detection -------------------------------------------
+  #if defined(__i386__) || defined(__x86_64__)
+    #if _MSC_VER >= 1300
+#include <intrin.h>                                   // __cpuidex
+    #elif defined(__INTEL_COMPILER)
+#include <immintrin.h>                                // _xgetbv
+    #endif
+
+static inline void cpuid(int reg[4], int id) {
+  #if defined(_MSC_VER) //|| defined(__INTEL_COMPILER)
+  __cpuidex(reg, id, 0);
+  #elif defined(__i386__) || defined(__x86_64__)
+  __asm("cpuid" : "=a"(reg[0]),"=b"(reg[1]),"=c"(reg[2]),"=d"(reg[3]) : "a"(id),"c"(0) : );
+  #endif
+}
+
+static inline uint64_t xgetbv(int ctr) {
+  #if(defined _MSC_VER && (_MSC_FULL_VER >= 160040219) || defined __INTEL_COMPILER)
+  return _xgetbv(ctr);
+  #elif defined(__i386__) || defined(__x86_64__)
+  unsigned a, d;
+  __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
+  return (uint64_t)d << 32 | a;
+  #else
+  unsigned a = 0, d = 0;
+  return (uint64_t)d << 32 | a;
+  #endif
+}
+  #endif
+
+#define AVX512F     0x001
+#define AVX512DQ    0x002
+#define AVX512IFMA  0x004
+#define AVX512PF    0x008
+#define AVX512ER    0x010
+#define AVX512CD    0x020
+#define AVX512BW    0x040
+#define AVX512VL    0x080
+#define AVX512VNNI  0x100
+#define AVX512VBMI  0x200
+#define AVX512VBMI2 0x400
+
+#define IS_SSE      0x10
+#define IS_SSE2     0x20
+#define IS_SSE3     0x30
+#define IS_SSSE3    0x32
+#define IS_POWER9   0x34 // powerpc
+#define IS_NEON     0x38 // arm neon
+#define IS_SSE41    0x40
+#define IS_SSE41x   0x41 //+popcount
+#define IS_SSE42    0x42
+#define IS_AVX      0x50
+#define IS_AVX2     0x60
+#define IS_AVX512   0x800
+
+unsigned cpuisa(void) {
+  int c[4] = {0};
+  if(_cpuisa) return _cpuisa;
+  _cpuisa++;
+  #if defined(__i386__) || defined(__x86_64__)
+  cpuid(c, 0);
+  if(c[0]) {
+    cpuid(c, 1);
+    //family = ((c[0] >> 8) & 0xf) + ((c[0] >> 20) & 0xff)
+    //model  = ((c[0] >> 4) & 0xf) + ((c[0] >> 12) & 0xf0)
+    if( c[3] & (1 << 25)) { _cpuisa = IS_SSE;
+    if( c[3] & (1 << 26)) { _cpuisa = IS_SSE2;
+    if( c[2] & (1 <<  0)) { _cpuisa = IS_SSE3;
+                          //_cpuisa = IS_SSE3SLOW; // Atom SSSE3 slow
+    if( c[2] & (1 <<  9)) { _cpuisa = IS_SSSE3;
+    if( c[2] & (1 << 19)) { _cpuisa = IS_SSE41;
+    if( c[2] & (1 << 23)) { _cpuisa = IS_SSE41x; // +popcount
+    if( c[2] & (1 << 20)) { _cpuisa = IS_SSE42;  // SSE4.2
+    if((c[2] & (1 << 28)) &&
+       (c[2] & (1 << 27)) &&                     // OSXSAVE
+       (c[2] & (1 << 26)) &&                     // XSAVE
+       (xgetbv(0) & 6) == 6) { _cpuisa = IS_AVX; // AVX
+      if(c[2] & (1 <<  3)) _cpuisa |= 1;         // +FMA3
+      if(c[2] & (1 << 16)) _cpuisa |= 2;         // +FMA4
+      if(c[2] & (1 << 25)) _cpuisa |= 4;         // +AES
+      cpuid(c, 7);
+      if(c[1] & (1 << 5)) { _cpuisa = IS_AVX2;
+        if(c[1] & (1 << 16)) {
+          cpuid(c, 0xd);
+          if((c[0] & 0x60) == 0x60) { _cpuisa = IS_AVX512;
+            cpuid(c, 7);
+            if(c[1] & (1 << 16)) _cpuisa |= AVX512F;
+            if(c[1] & (1 << 17)) _cpuisa |= AVX512DQ;
+            if(c[1] & (1 << 21)) _cpuisa |= AVX512IFMA;
+            if(c[1] & (1 << 26)) _cpuisa |= AVX512PF;
+            if(c[1] & (1 << 27)) _cpuisa |= AVX512ER;
+            if(c[1] & (1 << 28)) _cpuisa |= AVX512CD;
+            if(c[1] & (1 << 30)) _cpuisa |= AVX512BW;
+            if(c[1] & (1u << 31)) _cpuisa |= AVX512VL;
+            if(c[2] & (1 <<  1)) _cpuisa |= AVX512VBMI;
+            if(c[2] & (1 << 11)) _cpuisa |= AVX512VNNI;
+            if(c[2] & (1 <<  6)) _cpuisa |= AVX512VBMI2;
+          }}}
+    }}}}}}}}}
+  #elif defined(__powerpc64__)
+  _cpuisa = IS_POWER9; // power9
+  #elif defined(__ARM_NEON)
+  _cpuisa = IS_NEON;   // ARM_NEON
+  #endif
+  return _cpuisa;
+}
+
+unsigned cpuini(unsigned cpuisa) { if(cpuisa) _cpuisa = cpuisa; return _cpuisa; }
+
+char *cpustr(unsigned cpuisa) {
+  if(!cpuisa) cpuisa = _cpuisa;
+  #if defined(__i386__) || defined(__x86_64__)
+  if(cpuisa >= IS_AVX512) {
+    if(cpuisa & AVX512VBMI2) return "avx512vbmi2";
+    if(cpuisa & AVX512VBMI)  return "avx512vbmi";
+    if(cpuisa & AVX512VNNI)  return "avx512vnni";
+    if(cpuisa & AVX512VL)    return "avx512vl";
+    if(cpuisa & AVX512BW)    return "avx512bw";
+    if(cpuisa & AVX512CD)    return "avx512cd";
+    if(cpuisa & AVX512ER)    return "avx512er";
+    if(cpuisa & AVX512PF)    return "avx512pf";
+    if(cpuisa & AVX512IFMA)  return "avx512ifma";
+    if(cpuisa & AVX512DQ)    return "avx512dq";
+    if(cpuisa & AVX512F)     return "avx512f";
+    return "avx512";
+  }
+  else if(cpuisa >= IS_AVX2) return "avx2";
+  else if(cpuisa >= IS_AVX)
+    switch(cpuisa & 0xf) {
+      case 1:  return "avx+fma3";
+      case 2:  return "avx+fma4";
+      case 4:  return "avx+aes";
+      case 5:  return "avx+fma3+aes";
+      default: return "avx";
+    }
+  else if(cpuisa >= IS_SSE42)  return "sse4.2";
+  else if(cpuisa >= IS_SSE41x) return "sse4.1+popcnt";
+  else if(cpuisa >= IS_SSE41)  return "sse4.1";
+  else if(cpuisa >= IS_SSSE3)  return "ssse3";
+  else if(cpuisa >= IS_SSE3)   return "sse3";
+  else if(cpuisa >= IS_SSE2)   return "sse2";
+  else if(cpuisa >= IS_SSE)    return "sse";
+  #elif defined(__powerpc64__)
+  if(cpuisa >= IS_POWER9) return "power9";
+  #elif defined(__ARM_NEON)
+  if(cpuisa >= IS_NEON)   return "arm_neon";
+  #endif
+  return "none";
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+typedef void (*TPFUNC)(unsigned char *in, unsigned n, unsigned char *out);
+
+                        //   0  1       2       3       4  5  6  7       8  9                            16
+static TPFUNC _tpe[]  = {    0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // byte
+static TPFUNC _tpd[]  = {    0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 };
+
+static TPFUNC _tp4e[] = {    0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // Nibble
+static TPFUNC _tp4d[] = {    0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 };
+
+//-- zigzag delta
+ #ifndef NTP_ZZAG
+static TPFUNC _tpze[]  = { 0, 0, tpzenc2, tpzenc3, tpzenc4, 0, 0, 0, tpzenc8, 0, 0, 0, 0, 0, 0, 0, tpzenc16 }; // byte
+static TPFUNC _tpzd[]  = { 0, 0, tpzdec2, tpzdec3, tpzdec4, 0, 0, 0, tpzdec8, 0, 0, 0, 0, 0, 0, 0, tpzdec16 };
+
+static TPFUNC _tp4ze[] = { 0, 0, tpzenc2, tpzenc3, tpzenc4, 0, 0, 0, tpzenc8, 0, 0, 0, 0, 0, 0, 0, tpzenc16 }; // Nibble
+static TPFUNC _tp4zd[] = { 0, 0, tpzdec2, tpzdec3, tpzdec4, 0, 0, 0, tpzdec8, 0, 0, 0, 0, 0, 0, 0, tpzdec16 };
+ #endif
+
+//-- xor
+ #ifndef NTP_XOR
+static TPFUNC _tpxe[]  = { 0, 0, tpxenc2, tpxenc3, tpxenc4, 0, 0, 0, tpxenc8, 0, 0, 0, 0, 0, 0, 0, tpxenc16 }; // byte
+static TPFUNC _tpxd[]  = { 0, 0, tpxdec2, tpxdec3, tpxdec4, 0, 0, 0, tpxdec8, 0, 0, 0, 0, 0, 0, 0, tpxdec16 };
+
+static TPFUNC _tp4xe[] = { 0, 0, tpxenc2, tpxenc3, tpxenc4, 0, 0, 0, tpxenc8, 0, 0, 0, 0, 0, 0, 0, tpxenc16 }; // Nibble
+static TPFUNC _tp4xd[] = { 0, 0, tpxdec2, tpxdec3, tpxdec4, 0, 0, 0, tpxdec8, 0, 0, 0, 0, 0, 0, 0, tpxdec16 };
+ #endif
+
+static int tpset;
+
+void tpini(int id) {
+  int i;
+  if(tpset) return;
+  tpset++;
+  i = id?id:cpuisa();
+  #if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
+  if(i >= IS_AVX2) {
+    _tpe[ 2] = tpenc256v2; _tpd[ 2] = tpdec256v2; _tp4e[ 2] = tp4enc256v2; _tp4d[ 2] = tp4dec256v2;
+    _tpe[ 4] = tpenc256v4; _tpd[ 4] = tpdec256v4; _tp4e[ 4] = tp4enc256v4; _tp4d[ 4] = tp4dec256v4;
+    _tpe[ 8] = tpenc256v8; _tpd[ 8] = tpdec256v8; _tp4e[ 8] = tp4enc256v8; _tp4d[ 8] = tp4dec256v8;
+
+ #ifndef NTP_ZZAG
+    _tpze[2] = tpzenc128v2; _tpzd[2] = tpzdec128v2; _tp4ze[2] = tp4zenc128v2; _tp4zd[2] = tp4zdec128v2; // 16 bits: only sse
+    _tpze[4] = tpzenc256v4; _tpzd[4] = tpzdec256v4; _tp4ze[4] = tp4zenc256v4; _tp4zd[4] = tp4zdec256v4;
+    _tpze[8] = tpzenc256v8; _tpzd[8] = tpzdec256v8; _tp4ze[8] = tp4zenc256v8; _tp4zd[8] = tp4zdec256v8;
+ #endif
+
+ #ifndef NTP_XOR
+    _tpxe[2] = tpxenc128v2; _tpxd[2] = tpxdec128v2; _tp4xe[2] = tp4xenc128v2; _tp4xd[2] = tp4xdec128v2; // 16 bits: only sse
+    _tpxe[4] = tpxenc256v4; _tpxd[4] = tpxdec256v4; _tp4xe[4] = tp4xenc256v4; _tp4xd[4] = tp4xdec256v4;
+    _tpxe[8] = tpxenc256v8; _tpxd[8] = tpxdec256v8; _tp4xe[8] = tp4xenc256v8; _tp4xd[8] = tp4xdec256v8;
+ #endif
+  } else
+  #endif
+  #if defined(__i386__) || defined(__x86_64__) || defined(__ARM_NEON) || defined(__powerpc64__) || defined(_M_X64) || defined(_M_IX86)
+  if(i >= IS_SSE2) {
+    _tpe[ 2] = tpenc128v2; _tpd[ 2] = tpdec128v2; _tp4e[ 2] = tp4enc128v2; _tp4d[ 2] = tp4dec128v2;
+    _tpe[ 4] = tpenc128v4; _tpd[ 4] = tpdec128v4; _tp4e[ 4] = tp4enc128v4; _tp4d[ 4] = tp4dec128v4;
+    _tpe[ 8] = tpenc128v8; _tpd[ 8] = tpdec128v8; _tp4e[ 8] = tp4enc128v8; _tp4d[ 8] = tp4dec128v8;
+    if(i == IS_NEON) _tpd[8] = tpdec8; // ARM NEON scalar is faster!, TODO:retest on Apple M1
+
+ #ifndef NTP_ZZAG
+    _tpze[2] = tpzenc128v2; _tpzd[2] = tpzdec128v2; _tp4ze[2] = tp4zenc128v2; _tp4zd[2] = tp4zdec128v2;
+    _tpze[4] = tpzenc128v4; _tpzd[4] = tpzdec128v4; _tp4ze[4] = tp4zenc128v4; _tp4zd[4] = tp4zdec128v4;
+    _tpze[8] = tpzenc128v8; _tpzd[8] = tpzdec128v8; _tp4ze[8] = tp4zenc128v8; _tp4zd[8] = tp4zdec128v8;
+    if(i == IS_NEON) _tpzd[8] = tpzdec8;
+ #endif
+
+ #ifndef NTP_XOR
+    _tpxe[2] = tpxenc128v2; _tpxd[2] = tpxdec128v2; _tp4xe[2] = tp4xenc128v2; _tp4xd[2] = tp4xdec128v2;
+    _tpxe[4] = tpxenc128v4; _tpxd[4] = tpxdec128v4; _tp4xe[4] = tp4xenc128v4; _tp4xd[4] = tp4xdec128v4;
+    _tpxe[8] = tpxenc128v8; _tpxd[8] = tpxdec128v8; _tp4xe[8] = tp4xenc128v8; _tp4xd[8] = tp4xdec128v8;
+    if(i == IS_NEON) _tpxd[8] = tpxdec8;
+ #endif
+  }
+  #endif
+  ;
+}
+
+void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tpe[esize])) f(in,n,out);
+  else {
+    unsigned i, stride = n/esize;
+    unsigned char *op,*ip;
+    for(ip = in,op = out; ip < in+stride*esize; op++)
+      for(i = 0; i < esize; i++)
+        op[i*stride] = *ip++;
+    for(op = out + esize*stride; ip < in+n;)
+      *op++ = *ip++;
+  }
+}
+
+void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tpd[esize])) f(in,n,out);
+  else {
+    unsigned i, stride = n/esize;
+    unsigned char *op,*ip;
+    for(op = out,ip = in; op < out+stride*esize; ip++)
+      for(i = 0; i < esize; i++)
+        *op++ = ip[i*stride];
+    for(ip = in+esize*stride; op < out+n;)
+      *op++ = *ip++;
+  }
+}
+
+void tp4enc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tp4e[esize])) f(in,n,out);
+  else tpenc(in,n,out,esize);
+}
+
+void tp4dec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tp4d[esize])) f(in,n,out);
+  else tpdec(in,n,out,esize);
+}
+
+//-- zigzag
+void tpzenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tpze[esize])) f(in,n,out);
+  else {
+    unsigned i, stride = n/esize;
+    unsigned char *op, *ip;
+    for(ip = in,op = out; ip < in+stride*esize; op++)
+      for(i = 0; i < esize; i++)
+        op[i*stride] = *ip++;        // TODO:zigzag
+    for(op = out + esize*stride; ip < in+n;)
+      *op++ = *ip++;                 // TODO:zigzag
+  }
+}
+
+void tpzdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tpzd[esize])) f(in,n,out);
+  else {
+    unsigned i, stride = n/esize;
+    unsigned char *op,*ip;
+    for(op = out,ip = in; op < out+stride*esize; ip++)
+      for(i = 0; i < esize; i++)
+        *op++ = ip[i*stride];
+    for(ip = in+esize*stride; op < out+n;)
+      *op++ = *ip++;
+  }
+}
+
+void tp4zenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tp4ze[esize])) f(in,n,out);
+  else tpzenc(in,n,out,esize);
+}
+
+void tp4zdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tp4zd[esize])) f(in,n,out);
+  else tpzdec(in,n,out,esize);
+}
+
+//-- xor
+void tpxenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tpxe[esize])) f(in,n,out);
+  else {
+    unsigned i, stride = n/esize;
+    unsigned char *op, *ip;
+    for(ip = in,op = out; ip < in+stride*esize; op++)
+      for(i = 0; i < esize; i++)
+        op[i*stride] = *ip++;        // TODO:xor
+    for(op = out + esize*stride; ip < in+n;)
+      *op++ = *ip++;                 // TODO:xor
+  }
+}
+
+void tpxdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tpxd[esize])) f(in,n,out);
+  else {
+    unsigned i, stride = n/esize;
+    unsigned char *op,*ip;
+    for(op = out,ip = in; op < out+stride*esize; ip++)
+      for(i = 0; i < esize; i++)
+        *op++ = ip[i*stride];
+    for(ip = in+esize*stride; op < out+n;)
+      *op++ = *ip++;
+  }
+}
+
+void tp4xenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tp4xe[esize])) f(in,n,out);
+  else tpxenc(in,n,out,esize);
+}
+
+void tp4xdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
+  TPFUNC f;
+  if(!tpset) tpini(0);
+  if(esize <= 16 && (f = _tp4xd[esize])) f(in,n,out);
+  else tpxdec(in,n,out,esize);
+}
+
+#define E    for(e = esize-1; e >= 0; e--)
+#define ODX2 (x + y * nx) * esize + e
+void tp2denc(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out, unsigned esize) {
+  switch(esize) {
+    case 2: tp2denc2(in,nx,ny,out); break;
+    case 4: tp2denc4(in,nx,ny,out); break;
+    case 8: tp2denc8(in,nx,ny,out); break;
+    default: {
+      unsigned x,y;
+      uint8_t *op = out, *ip = in;
+      int e;
+      for(  x = 0; x < nx; x++)
+        for(y = 0; y < ny; y++) E
+          op[ODX2] = *ip++;
+    }
+  }
+}
+
+void tp2ddec(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out, unsigned esize) {
+  switch(esize) {
+    case 2: tp2ddec2(in,nx,ny,out); break;
+    case 4: tp2ddec4(in,nx,ny,out); break;
+    case 8: tp2ddec8(in,nx,ny,out); break;
+    default: { unsigned x,y;
+      uint8_t *op = out, *ip = in;
+      int e;
+      for(  x = 0; x < nx; x++)
+        for(y = 0; y < ny; y++) E
+          *op++ = ip[ODX2];
+    }
+  }
+}
+#undef ODX2
+
+#define ODX3 (x + y * nx + z * ny * nx) * esize + e
+void tp3denc(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) {
+  switch(esize) {
+    case 2: tp3denc2(in,nx,ny,nz,out); break;
+    case 4: tp3denc4(in,nx,ny,nz,out); break;
+    case 8: tp3denc8(in,nx,ny,nz,out); break;
+    default: { unsigned x,y,z;
+      uint8_t *op = out, *ip = in;
+      int e;
+      for(    x = 0; x < nx; x++)
+        for(  y = 0; y < ny; y++)
+          for(z = 0; z < nz; z++) E
+            op[ODX3] = *ip++;
+    }
+  }
+}
+
+void tp3ddec(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) {
+  switch(esize) {
+    case 2: tp3ddec2(in,nx,ny,nz,out); break;
+    case 4: tp3ddec4(in,nx,ny,nz,out); break;
+    case 8: tp3ddec8(in,nx,ny,nz,out); break;
+    default: { unsigned x,y,z;
+      uint8_t *op = out, *ip = in;
+      int e;
+      for(x = 0; x < nx; ++x)
+        for(y = 0; y < ny; ++y)
+          for(z = 0; z < nz; ++z) E
+            *op++ = ip[ODX3];
+    }
+  }
+}
+#undef ODX3
+
+#define ODX4 (w + x * nw + y * nx * nw + z * nx * ny * nw) * esize + e
+void tp4denc(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) {
+  switch(esize) {
+    case 2: tp4denc2(in,nw,nx,ny,nz,out); break;
+    case 4: tp4denc4(in,nw,nx,ny,nz,out); break;
+    case 8: tp4denc8(in,nw,nx,ny,nz,out); break;
+    default: {
+      unsigned w, x, y, z;
+      uint8_t *op = out, *ip = in;
+      int e;
+      for(      w = 0; w < nw; w++)
+        for(    x = 0; x < nx; x++)
+          for(  y = 0; y < ny; y++)
+            for(z = 0; z < nz; z++) E
+              op[ODX4] = *ip++;
+    }
+  }
+}
+
+void tp4ddec(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) {
+  switch(esize) {
+    case 2: tp4ddec2(in,nw,nx,ny,nz,out); break;
+    case 4: tp4ddec4(in,nw,nx,ny,nz,out); break;
+    case 8: tp4ddec8(in,nw,nx,ny,nz,out); break;
+    default: {
+      unsigned w,x,y,z;
+      uint8_t *op = out,*ip = in;
+      int e;
+      for(      w = 0; w < nw; w++)
+        for(    x = 0; x < nx; ++x)
+          for(  y = 0; y < ny; ++y)
+            for(z = 0; z < nz; ++z) E
+              *op++ = ip[ODX4];
+    }
+  }
+}
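
Usage note, not part of the patch: the CPU detection and one-time dispatch above can be exercised on their own. A minimal probe sketch, assuming this file is linked in as-is; the prototypes below are copied from the definitions in transpose_.c rather than taken from a header, and the file name cpu_probe.c is illustrative only:

/* cpu_probe.c - hypothetical test harness, not part of this patch */
#include <stdio.h>

/* signatures copied from transpose_.c */
unsigned cpuisa(void);
unsigned cpuini(unsigned cpuisa);
char    *cpustr(unsigned cpuisa);

int main(void) {
  unsigned isa = cpuisa();                     /* detect once; result is cached in _cpuisa */
  printf("detected ISA level: %s (0x%x)\n", cpustr(isa), isa);

  /* cpuini() overrides the cached level, e.g. to exercise the SSE2/NEON kernels on an
     AVX2 machine; 0x20 is the IS_SSE2 value defined in transpose_.c. The override must
     happen before tpini()/the first tp*enc call, because the kernel tables are set only
     once (the tpset guard). */
  cpuini(0x20);
  printf("forced ISA level:   %s\n", cpustr(0)); /* 0 = report the cached value */
  return 0;
}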
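
tpenc/tpdec are the generic entry points: they pick a registered SIMD kernel for the element size and otherwise fall back to the scalar transpose loops. A round-trip sketch, assuming n is the buffer size in bytes (as in the scalar fallback, where stride = n/esize) and a multiple of esize; file and variable names are illustrative only:

/* tp_roundtrip.c - hypothetical round-trip check, not part of this patch */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* signatures copied from transpose_.c */
void tpini(int id);
void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize);
void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize);

int main(void) {
  enum { N = 4096 };                           /* number of 32-bit values */
  static uint32_t src[N], dst[N];
  static unsigned char tmp[sizeof(src)];       /* transposed buffer, same byte size */
  for (unsigned i = 0; i < N; i++) src[i] = i * 2654435761u;

  tpini(0);                                    /* 0 = pick kernels from cpuisa(); optional,
                                                  tpenc/tpdec call it lazily anyway */
  tpenc((unsigned char *)src, sizeof(src), tmp, 4);   /* scatter the 4 byte planes */
  tpdec(tmp, sizeof(src), (unsigned char *)dst, 4);   /* gather them back */
  printf("round trip: %s\n", memcmp(src, dst, sizeof(src)) == 0 ? "ok" : "FAILED");
  return 0;
}

The tpzenc/tpzdec (zigzag delta) and tpxenc/tpxdec (xor) entry points take the same arguments and add their pre-processing before the transpose, as do the nibble variants tp4enc/tp4dec.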