Files
TurboPFor-Integer-Compression/lib/transpose_.c
2023-03-23 11:09:24 +01:00

473 lines
17 KiB
C

#include <string.h>
#include "include_/conf.h"
#include "include_/transpose.h"
static unsigned _cpuisa;
//--------------------- CPU detection -------------------------------------------
#if defined(__i386__) || defined(__x86_64__)
#if _MSC_VER >=1300
#include <intrin.h>
#elif defined (__INTEL_COMPILER)
#include <x86intrin.h>
#endif
static inline void cpuid(int reg[4], int id) {
#if defined (_MSC_VER) //|| defined (__INTEL_COMPILER)
__cpuidex(reg, id, 0);
#elif defined(__i386__) || defined(__x86_64__)
__asm("cpuid" : "=a"(reg[0]),"=b"(reg[1]),"=c"(reg[2]),"=d"(reg[3]) : "a"(id),"c"(0) : );
#endif
}
static inline uint64_t xgetbv (int ctr) {
#if(defined _MSC_VER && (_MSC_FULL_VER >= 160040219) || defined __INTEL_COMPILER)
return _xgetbv(ctr);
#elif defined(__i386__) || defined(__x86_64__)
unsigned a, d;
__asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : );
return (uint64_t)d << 32 | a;
#else
unsigned a=0, d=0;
return (uint64_t)d << 32 | a;
#endif
}
#endif
#define AVX512F 0x001
#define AVX512DQ 0x002
#define AVX512IFMA 0x004
#define AVX512PF 0x008
#define AVX512ER 0x010
#define AVX512CD 0x020
#define AVX512BW 0x040
#define AVX512VL 0x080
#define AVX512VNNI 0x100
#define AVX512VBMI 0x200
#define AVX512VBMI2 0x400
#define IS_SSE 0x10
#define IS_SSE2 0x20
#define IS_SSE3 0x30
#define IS_SSSE3 0x32
#define IS_POWER9 0x34 // powerpc
#define IS_NEON 0x38 // arm neon
#define IS_SSE41 0x40
#define IS_SSE41x 0x41 //+popcount
#define IS_SSE42 0x42
#define IS_AVX 0x50
#define IS_AVX2 0x60
#define IS_AVX512 0x800
unsigned cpuisa(void) {
int c[4] = {0};
if(_cpuisa) return _cpuisa;
_cpuisa++;
#if defined(__i386__) || defined(__x86_64__)
cpuid(c, 0);
if(c[0]) {
cpuid(c, 1);
//family = ((c >> 8) & 0xf) + ((c >> 20) & 0xff)
//model = ((c >> 4) & 0xf) + ((c >> 12) & 0xf0)
if( c[3] & (1 << 25)) { _cpuisa = IS_SSE;
if( c[3] & (1 << 26)) { _cpuisa = IS_SSE2;
if( c[2] & (1 << 0)) { _cpuisa = IS_SSE3;
// _cpuisa = IS_SSE3SLOW; // Atom SSSE3 slow
if( c[2] & (1 << 9)) { _cpuisa = IS_SSSE3;
if( c[2] & (1 << 19)) { _cpuisa = IS_SSE41;
if( c[2] & (1 << 23)) { _cpuisa = IS_SSE41x; // +popcount
if( c[2] & (1 << 20)) { _cpuisa = IS_SSE42; // SSE4.2
if((c[2] & (1 << 28)) &&
(c[2] & (1 << 27)) && // OSXSAVE
(c[2] & (1 << 26)) && // XSAVE
(xgetbv(0) & 6)==6) { _cpuisa = IS_AVX; // AVX
if(c[2]& (1 << 3)) _cpuisa |= 1; // +FMA3
if(c[2]& (1 << 16)) _cpuisa |= 2; // +FMA4
if(c[2]& (1 << 25)) _cpuisa |= 4; // +AES
cpuid(c, 7);
if(c[1] & (1 << 5)) { _cpuisa = IS_AVX2;
if(c[1] & (1 << 16)) {
cpuid(c, 0xd);
if((c[0] & 0x60)==0x60) { _cpuisa = IS_AVX512;
cpuid(c, 7);
if(c[1] & (1<<16)) _cpuisa |= AVX512F;
if(c[1] & (1<<17)) _cpuisa |= AVX512DQ;
if(c[1] & (1<<21)) _cpuisa |= AVX512IFMA;
if(c[1] & (1<<26)) _cpuisa |= AVX512PF;
if(c[1] & (1<<27)) _cpuisa |= AVX512ER;
if(c[1] & (1<<28)) _cpuisa |= AVX512CD;
if(c[1] & (1<<30)) _cpuisa |= AVX512BW;
if(c[1] & (1u<<31)) _cpuisa |= AVX512VL;
if(c[2] & (1<< 1)) _cpuisa |= AVX512VBMI;
if(c[2] & (1<<11)) _cpuisa |= AVX512VNNI;
if(c[2] & (1<< 6)) _cpuisa |= AVX512VBMI2;
}}}
}}}}}}}}}
#elif defined(__powerpc64__)
_cpuisa = IS_POWER9; // power9
#elif defined(__ARM_NEON)
_cpuisa = IS_NEON; // ARM_NEON
#endif
return _cpuisa;
}
unsigned cpuini(unsigned cpuisa) { if(cpuisa) _cpuisa = cpuisa; return _cpuisa; }
char *cpustr(unsigned cpuisa) {
if(!cpuisa) cpuisa = _cpuisa;
#if defined(__i386__) || defined(__x86_64__)
if(cpuisa >= IS_AVX512) {
if(cpuisa & AVX512VBMI2) return "avx512vbmi2";
if(cpuisa & AVX512VBMI) return "avx512vbmi";
if(cpuisa & AVX512VNNI) return "avx512vnni";
if(cpuisa & AVX512VL) return "avx512vl";
if(cpuisa & AVX512BW) return "avx512bw";
if(cpuisa & AVX512CD) return "avx512cd";
if(cpuisa & AVX512ER) return "avx512er";
if(cpuisa & AVX512PF) return "avx512pf";
if(cpuisa & AVX512IFMA) return "avx512ifma";
if(cpuisa & AVX512DQ) return "avx512dq";
if(cpuisa & AVX512F) return "avx512f";
return "avx512";
}
else if(cpuisa >= IS_AVX2) return "avx2";
else if(cpuisa >= IS_AVX)
switch(cpuisa&0xf) {
case 1: return "avx+fma3";
case 2: return "avx+fma4";
case 4: return "avx+aes";
case 5: return "avx+fma3+aes";
default:return "avx";
}
else if(cpuisa >= IS_SSE42) return "sse4.2";
else if(cpuisa >= IS_SSE41x) return "sse4.1+popcnt";
else if(cpuisa >= IS_SSE41) return "sse4.1";
else if(cpuisa >= IS_SSSE3) return "ssse3";
else if(cpuisa >= IS_SSE3) return "sse3";
else if(cpuisa >= IS_SSE2) return "sse2";
else if(cpuisa >= IS_SSE) return "sse";
#elif defined(__powerpc64__)
if(cpuisa >= IS_POWER9) return "power9";
#elif defined(__ARM_NEON)
if(cpuisa >= IS_NEON) return "arm_neon";
#endif
return "none";
}
//----------------------------------------------------------------------------------------------------------------------
typedef void (*TPFUNC)( unsigned char *in, unsigned n, unsigned char *out);
// 0 1 2 3 4 5 6 7 8 9 16
static TPFUNC _tpe[] = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // byte
static TPFUNC _tpd[] = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 };
static TPFUNC _tp4e[] = { 0, 0, tpenc2, tpenc3, tpenc4, 0, 0, 0, tpenc8, 0, 0, 0, 0, 0, 0, 0, tpenc16 }; // Nibble
static TPFUNC _tp4d[] = { 0, 0, tpdec2, tpdec3, tpdec4, 0, 0, 0, tpdec8, 0, 0, 0, 0, 0, 0, 0, tpdec16 };
//-- zigzag delta
#ifndef NTP_ZZAG
static TPFUNC _tpze[] = { 0, 0, tpzenc2, tpzenc3, tpzenc4, 0, 0, 0, tpzenc8, 0, 0, 0, 0, 0, 0, 0, tpzenc16 }; // byte
static TPFUNC _tpzd[] = { 0, 0, tpzdec2, tpzdec3, tpzdec4, 0, 0, 0, tpzdec8, 0, 0, 0, 0, 0, 0, 0, tpzdec16 };
static TPFUNC _tp4ze[] = { 0, 0, tpzenc2, tpzenc3, tpzenc4, 0, 0, 0, tpzenc8, 0, 0, 0, 0, 0, 0, 0, tpzenc16 }; // Nibble
static TPFUNC _tp4zd[] = { 0, 0, tpzdec2, tpzdec3, tpzdec4, 0, 0, 0, tpzdec8, 0, 0, 0, 0, 0, 0, 0, tpzdec16 };
#endif
//-- xor
#ifndef NTP_XOR
static TPFUNC _tpxe[] = { 0, 0, tpxenc2, tpxenc3, tpxenc4, 0, 0, 0, tpxenc8, 0, 0, 0, 0, 0, 0, 0, tpxenc16 }; // byte
static TPFUNC _tpxd[] = { 0, 0, tpxdec2, tpxdec3, tpxdec4, 0, 0, 0, tpxdec8, 0, 0, 0, 0, 0, 0, 0, tpxdec16 };
static TPFUNC _tp4xe[] = { 0, 0, tpxenc2, tpxenc3, tpxenc4, 0, 0, 0, tpxenc8, 0, 0, 0, 0, 0, 0, 0, tpxenc16 }; // Nibble
static TPFUNC _tp4xd[] = { 0, 0, tpxdec2, tpxdec3, tpxdec4, 0, 0, 0, tpxdec8, 0, 0, 0, 0, 0, 0, 0, tpxdec16 };
#endif
static int tpset;
void tpini(int id) {
int i;
if(tpset) return;
tpset++;
i = id?id:cpuisa();
#if defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86)
if(i >= IS_AVX2) {
_tpe[ 2] = tpenc256v2; _tpd[ 2] = tpdec256v2; _tp4e[ 2] = tp4enc256v2; _tp4d[ 2] = tp4dec256v2;
_tpe[ 4] = tpenc256v4; _tpd[ 4] = tpdec256v4; _tp4e[ 4] = tp4enc256v4; _tp4d[ 4] = tp4dec256v4;
_tpe[ 8] = tpenc256v8; _tpd[ 8] = tpdec256v8; _tp4e[ 8] = tp4enc256v8; _tp4d[ 8] = tp4dec256v8;
#ifndef NTP_ZZAG
_tpze[2] = tpzenc128v2; _tpzd[2] = tpzdec128v2; _tp4ze[2] = tp4zenc128v2; _tp4zd[2] = tp4zdec128v2; // 16 bits: only sse
_tpze[4] = tpzenc256v4; _tpzd[4] = tpzdec256v4; _tp4ze[4] = tp4zenc256v4; _tp4zd[4] = tp4zdec256v4;
_tpze[8] = tpzenc256v8; _tpzd[8] = tpzdec256v8; _tp4ze[8] = tp4zenc256v8; _tp4zd[8] = tp4zdec256v8;
#endif
#ifndef NTP_XOR
_tpxe[2] = tpxenc128v2; _tpxd[2] = tpxdec128v2; _tp4xe[2] = tp4xenc128v2; _tp4xd[2] = tp4xdec128v2; // 16 bits: only sse
_tpxe[4] = tpxenc256v4; _tpxd[4] = tpxdec256v4; _tp4xe[4] = tp4xenc256v4; _tp4xd[4] = tp4xdec256v4;
_tpxe[8] = tpxenc256v8; _tpxd[8] = tpxdec256v8; _tp4xe[8] = tp4xenc256v8; _tp4xd[8] = tp4xdec256v8;
#endif
} else
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(__ARM_NEON) || defined(__powerpc64__) || defined(_M_X64) || defined(_M_IX86)
if(i >= IS_SSE2) {
_tpe[ 2] = tpenc128v2; _tpd[ 2] = tpdec128v2; _tp4e[ 2] = tp4enc128v2; _tp4d[ 2] = tp4dec128v2;
_tpe[ 4] = tpenc128v4; _tpd[ 4] = tpdec128v4; _tp4e[ 4] = tp4enc128v4; _tp4d[ 4] = tp4dec128v4;
_tpe[ 8] = tpenc128v8; _tpd[ 8] = tpdec128v8; _tp4e[ 8] = tp4enc128v8; _tp4d[ 8] = tp4dec128v8;
if(i == 35) _tpd[8] = tpdec8; // ARM NEON scalar is faster!, TODO:retest on Apple M1
#ifndef NTP_ZZAG
_tpze[2] = tpzenc128v2; _tpzd[2] = tpzdec128v2; _tp4ze[2] = tp4zenc128v2; _tp4zd[2] = tp4zdec128v2;
_tpze[4] = tpzenc128v4; _tpzd[4] = tpzdec128v4; _tp4ze[4] = tp4zenc128v4; _tp4zd[4] = tp4zdec128v4;
_tpze[8] = tpzenc128v8; _tpzd[8] = tpzdec128v8; _tp4ze[8] = tp4zenc128v8; _tp4zd[8] = tp4zdec128v8;
if(i == 35) _tpzd[8] = tpzdec8;
#endif
#ifndef NTP_XOR
_tpxe[2] = tpxenc128v2; _tpxd[2] = tpxdec128v2; _tp4xe[2] = tp4xenc128v2; _tp4xd[2] = tp4xdec128v2;
_tpxe[4] = tpxenc128v4; _tpxd[4] = tpxdec128v4; _tp4xe[4] = tp4xenc128v4; _tp4xd[4] = tp4xdec128v4;
_tpxe[8] = tpxenc128v8; _tpxd[8] = tpxdec128v8; _tp4xe[8] = tp4xenc128v8; _tp4xd[8] = tp4xdec128v8;
if(i == 35) _tpxd[8] = tpxdec8;
#endif
}
#endif
;
}
void tpenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tpe[esize])) f(in,n,out);
else {
unsigned i, stride=n/esize;
unsigned char *op,*ip;
for(ip = in,op = out; ip < in+stride*esize; op++)
for(i = 0; i < esize; i++)
op[i*stride] = *ip++;
for(op = out + esize*stride; ip < in+n;)
*op++ = *ip++;
}
}
void tpdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tpd[esize])) f(in,n,out);
else {
unsigned i, stride = n/esize;
unsigned char *op,*ip;
for(op = out,ip = in; op < out+stride*esize; ip++)
for(i = 0; i < esize; i++)
*op++ = ip[i*stride];
for(ip = in+esize*stride; op < out+n;)
*op++ = *ip++;
}
}
void tp4enc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tp4e[esize])) f(in,n,out);
else tpenc(in,n,out,esize);
}
void tp4dec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tp4d[esize])) f(in,n,out);
else tpdec(in,n,out,esize);
}
//-- zigzag
void tpzenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tpze[esize])) f(in,n,out);
else {
unsigned i, stride = n/esize;
unsigned char *op, *ip;
for(ip = in,op = out; ip < in+stride*esize; op++)
for(i = 0; i < esize; i++)
op[i*stride] = *ip++; // TODO:zigzag
for(op = out + esize*stride; ip < in+n;)
*op++ = *ip++; // TODO:zigzag
}
}
void tpzdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tpzd[esize])) f(in,n,out);
else {
unsigned i, stride = n/esize;
unsigned char *op,*ip;
for(op = out,ip = in; op < out+stride*esize; ip++)
for(i = 0; i < esize; i++)
*op++ = ip[i*stride];
for(ip = in+esize*stride; op < out+n;)
*op++ = *ip++;
}
}
void tp4zenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tp4ze[esize])) f(in,n,out);
else tpzenc(in,n,out,esize);
}
void tp4zdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tp4zd[esize])) f(in,n,out);
else tpzdec(in,n,out,esize);
}
//-- xor
void tpxenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tpxe[esize])) f(in,n,out);
else {
unsigned i, stride = n/esize;
unsigned char *op, *ip;
for(ip = in,op = out; ip < in+stride*esize; op++)
for(i = 0; i < esize; i++)
op[i*stride] = *ip++; // TODO:xor
for(op = out + esize*stride; ip < in+n;)
*op++ = *ip++; // TODO:xor
}
}
void tpxdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tpxd[esize])) f(in,n,out);
else {
unsigned i, stride = n/esize;
unsigned char *op,*ip;
for(op = out,ip = in; op < out+stride*esize; ip++)
for(i = 0; i < esize; i++)
*op++ = ip[i*stride];
for(ip = in+esize*stride; op < out+n;)
*op++ = *ip++;
}
}
void tp4xenc(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tp4xe[esize])) f(in,n,out);
else tpxenc(in,n,out,esize);
}
void tp4xdec(unsigned char *in, unsigned n, unsigned char *out, unsigned esize) {
TPFUNC f;
if(!tpset) tpini(0);
if(esize <= 16 && (f = _tp4xd[esize])) f(in,n,out);
else tpxdec(in,n,out,esize);
}
#define E for(e = esize-1; e >= 0; e--)
#define ODX2 (x + y * nx) * esize + e
void tp2denc(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out, unsigned esize) {
switch(esize) {
case 2: tp2denc2(in,nx,ny,out); break;
case 4: tp2denc4(in,nx,ny,out); break;
case 8: tp2denc8(in,nx,ny,out); break;
default: {
unsigned x,y;
uint8_t *op = out, *ip = in;
int e;
for( x = 0; x < nx; x++)
for(y = 0; y < ny; y++) E
op[ODX2] = *ip++;
}
}
}
void tp2ddec(unsigned char *in, unsigned nx, unsigned ny, unsigned char *out, unsigned esize) {
switch(esize) {
case 2: tp2ddec2(in,nx,ny,out); break;
case 4: tp2ddec4(in,nx,ny,out); break;
case 8: tp2ddec8(in,nx,ny,out); break;
default: { unsigned x,y;
uint8_t *op = out, *ip = in;
int e;
for( x = 0; x < nx; x++)
for(y = 0; y < ny; y++) E
*op++ = ip[ODX2];
}
}
}
#undef ODX2
#define ODX3 (x + y * nx + z * ny * nx) * esize + e
void tp3denc(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) {
switch(esize) {
case 2: tp3denc2(in,nx,ny,nz,out); break;
case 4: tp3denc4(in,nx,ny,nz,out); break;
case 8: tp3denc8(in,nx,ny,nz,out); break;
default: { unsigned x,y,z;
uint8_t *op = out, *ip = in;
int e;
for( x = 0; x < nx; x++)
for( y = 0; y < ny; y++)
for(z = 0; z < nz; z++) E
op[ODX3] = *ip++;
}
}
}
void tp3ddec(unsigned char *in, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) {
switch(esize) {
case 2: tp3ddec2(in,nx,ny,nz,out); break;
case 4: tp3ddec4(in,nx,ny,nz,out); break;
case 8: tp3ddec8(in,nx,ny,nz,out); break;
default: { unsigned x,y,z;
uint8_t *op = out, *ip = in;
int e;
for(x = 0; x < nx; ++x)
for(y = 0; y < ny; ++y)
for(z = 0; z < nz; ++z) E
*op++ = ip[ODX3];
}
}
}
#undef ODX3
#define ODX4 (w + x * nw + y * nx * nw + z * nx * ny * nw) * esize + e
void tp4denc(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) {
switch(esize) {
case 2: tp4denc2(in,nw,nx,ny,nz,out); break;
case 4: tp4denc4(in,nw,nx,ny,nz,out); break;
case 8: tp4denc8(in,nw,nx,ny,nz,out); break;
default: {
unsigned w, x, y, z;
uint8_t *op = out, *ip = in;
int e;
for( w = 0; w < nw; w++)
for( x = 0; x < nx; x++)
for( y = 0; y < ny; y++)
for(z = 0; z < nz; z++) E
op[ODX4] = *ip++;
}
}
}
void tp4ddec(unsigned char *in, unsigned nw, unsigned nx, unsigned ny, unsigned nz, unsigned char *out, unsigned esize) {
switch(esize) {
case 2: tp4ddec2(in,nw,nx,ny,nz,out); break;
case 4: tp4ddec4(in,nw,nx,ny,nz,out); break;
case 8: tp4ddec8(in,nw,nx,ny,nz,out); break;
default: {
unsigned w,x,y,z;
uint8_t *op = out,*ip = in;
int e;
for( w = 0; w < nw; w++)
for( x = 0; x < nx; ++x)
for( y = 0; y < ny; ++y)
for(z = 0; z < nz; ++z) E
*op++ = ip[ODX4];
}
}
}