Compare commits

...

10 Commits

Author SHA1 Message Date
e4f6853116 add loongarch64 support 2025-05-28 10:04:23 +08:00
06d6aad98b Merge pull request #117 from omarathon/patch-1 (ree) 2024-02-13 12:32:15 +01:00
b0b651ab77 nit 2024-02-12 00:33:23 +00:00
da4fa61f6d TurboRLE: Benchmark App 2023-08-14 12:00:22 +02:00
559021fbfe Inverted Index: Partioning/Sharding App 2023-08-14 12:00:22 +02:00
6f1140d424 Inverted Index: Query Processing 2023-08-14 12:00:22 +02:00
c0e876cfb3 Inverted Index: Indexing 2023-08-14 12:00:22 +02:00
009bad8255 TurboPFor: Benchmark app 2023-08-14 12:00:22 +02:00
72afb3ae30 TurboPFor: Elias fano 2023-08-14 12:00:22 +02:00
4c47a778b1 TurboPFor: Config/Platform 2023-08-14 12:00:21 +02:00
22 changed files with 439 additions and 80 deletions


@@ -187,7 +187,8 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
 defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\
 defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \
 defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \
-defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
+defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
+defined(__loongarch_lp64)
 #define ctou16(_cp_) (*(unsigned short *)(_cp_))
 #define ctou32(_cp_) (*(unsigned *)(_cp_))
 #define ctof32(_cp_) (*(float *)(_cp_))
@@ -207,7 +208,7 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
 #define ltou64(_px_, _cp_) *(_px_) = *(uint64_t *)(_cp_)
-#elif defined(__ARM_FEATURE_UNALIGNED)
+#elif defined(__ARM_FEATURE_UNALIGNED) || defined(__loongarch_lp64)
 struct _PACKED longu { uint64_t l; };
 struct _PACKED doubleu { double d; };
 #define ctou64(_cp_) ((struct longu *)(_cp_))->l
@@ -253,7 +254,7 @@ struct _PACKED doubleu { double d; };
 defined(__aarch64__) ||\
 defined(__mips64) ||\
 defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\
-defined(__s390x__)
+defined(__s390x__) || defined(__loongarch_lp64)
 #define __WORDSIZE 64
 #else
 #define __WORDSIZE 32
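These conf.h hunks put LoongArch64 (LP64) on the direct unaligned-access path: ctou16/ctou32/ctof32 stay plain pointer casts, ctou64 goes through a _PACKED struct, and __WORDSIZE becomes 64. As a standalone illustration of the three load strategies the header chooses between (names hypothetical, GCC/Clang syntax assumed, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    struct __attribute__((packed)) u64u { uint64_t v; };  /* mirrors struct longu above */

    /* fast path: relies on the CPU tolerating unaligned loads */
    static inline uint64_t load64_cast(const void *p)   { return *(const uint64_t *)p; }
    /* packed-struct path: the compiler emits a safe unaligned load */
    static inline uint64_t load64_packed(const void *p) { return ((const struct u64u *)p)->v; }
    /* portable fallback for strict-alignment targets */
    static inline uint64_t load64_memcpy(const void *p) { uint64_t v; memcpy(&v, p, 8); return v; }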


@@ -385,7 +385,7 @@ size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__rest
 size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitf, bitfpack); }
 size_t bitnxpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitx256v, bitxpack256v, bitx, bitxpack); }
-#elif defined(__SSE3__) || defined(__ARM_NEON) //----------------------------- SSE / AVX ---------------------------------------------------------------
+#elif defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //----------------------------- SSE / AVX ---------------------------------------------------------------
 #define OPPE(__op)
 #define IPPE(__op)


@@ -651,7 +651,7 @@ size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__re
 size_t bitnxunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitxunpack256v, bitxunpack); }
 size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitfunpack256v, bitfunpack); }
-#elif defined(__SSE2__) || defined(__ARM_NEON) //------------------------------ SSE2/SSSE3 ---------------------------------------------------------
+#elif defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //------------------------------ SSE2/SSSE3 ---------------------------------------------------------
 #define BITMAX16 16
 #define BITMAX32 32
@@ -760,7 +760,7 @@ unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n,
 #undef VOZ16
 #undef BITUNPACK0
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 #define _ 0x80
 ALIGNED(char, _shuffle_32[16][16],16) = {
 { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ },
@@ -1178,7 +1178,7 @@ unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n,
 return (unsigned char *)ip;
 }
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 #define BITMAX16 15
 #define BITMAX32 31
@@ -1315,7 +1315,7 @@ unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n
 return (unsigned char *)ip;
 }
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 #define BITMAX16 15
 #define BITMAX32 31
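These guards flip bitpack/bitunpack from the scalar fallback to the 128-bit vector kernels on LoongArch64; the _mm_* intrinsics inside those kernels resolve to LSX through the mapping layer added later in this compare. Reduced to its shape (macro and argument names hypothetical):

    #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
    #define BITUNPACK32(in, n, out, b) bitunpack128v32(in, n, out, b)  /* 128-bit vector kernel */
    #else
    #define BITUNPACK32(in, n, out, b) bitunpack32(in, n, out, b)      /* scalar fallback */
    #endif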


@@ -228,7 +228,7 @@ uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px) { uint64_t o,x,u0,*ip; BI
 uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
 uint16_t o, x, u0 = in[0], *ip = in;
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi16(u0);
@@ -254,7 +254,7 @@ uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
 uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px) {
 uint32_t o,x,u0 = in[0], *ip = in;
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi32(u0);
@@ -290,7 +290,7 @@ uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint6
 uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint16_t o, x, *ip = in, u0 = in[0] - start;
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vb0 = _mm_set1_epi16(u0),
 vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start);
@@ -321,7 +321,7 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 uint32_t o = 0, x=0, *ip = in, u0 = in[0] - start;
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vb0 = _mm_set1_epi32(u0),
 vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start);
@@ -361,7 +361,7 @@ void bitddec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in, n
 void bitddec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 0); }
 void bitddec32(uint32_t *in, unsigned n, unsigned start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vs = _mm_set1_epi32(start);
 unsigned *ip = in;
 for(; ip != in+(n&~(8-1)); ip += 8) {
@@ -417,7 +417,7 @@ uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint6
 uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 uint32_t o = 0, x=0, *ip = in, u0 = in[0]-start-1;
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vb0 = _mm_set1_epi32(u0),
 vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(1);
@@ -446,7 +446,7 @@ uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 }
 uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 uint16_t *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi16(start), cv = _mm_set1_epi16(8);
 for(; ip != in+(n&~(8-1)); ip += 8) {
 __m128i iv = _mm_loadu_si128((__m128i *)ip);
@@ -461,7 +461,7 @@ uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 }
 unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 unsigned *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(4);
 for(; ip != in+(n&~(4-1)); ip += 4) {
 __m128i iv = _mm_loadu_si128((__m128i *)ip);
@@ -480,7 +480,7 @@ void bitd1dec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in,
 void bitd1dec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 1); }
 void bitd1dec32(uint32_t *in, unsigned n, uint32_t start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vs = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
 unsigned *ip = in;
 for(; ip != in+(n&~(4-1)); ip += 4) {
@@ -515,7 +515,7 @@ uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uin
 uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x=0,*op = out,u; BITDE(uint16_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
 uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x=0,*op = out,u; BITDE(uint64_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
 uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) {
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 unsigned *ip = in,b,*op = out;
 __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(mindelta), dv;
 for(; ip != in+(n&~(4-1)); ip += 4,op += 4) {
@@ -534,6 +534,7 @@ uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
 }
 #else
 uint32_t b = 0,*op = out, x, *_ip;
+//uint32_t b = 0, *op = out, x = 0, o = 0, u = 0, *_ip;
 BITDE(uint32_t, in, n, mindelta, b |= x; *op++ = x);
 #endif
 return b;
@@ -569,7 +570,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint16_t o, x, *ip = in;
 uint32_t u0 = zigzagenc16((int)in[0] - (int)start);
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start);
 for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
@@ -599,7 +600,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
 uint32_t o, x, *ip=in,
 u0 = zigzagenc32((int)in[0] - (int)start);
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vb0 = _mm_set1_epi32(u0),
 vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start);
@@ -631,7 +632,7 @@ uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint
 uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x,u,*op = out; BITZENC(uint64_t, int64_t,64,in, n, o |= u; *op++ = u); return o; }
 uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) {
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 unsigned *ip = in,b,*op = out;
 __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start);
 for(; ip != in+(n&~(8-1)); ip += 8,op += 8) {
@@ -672,7 +673,7 @@ void bitzdec8( uint8_t *in, unsigned n, uint8_t start) { BITZDEC(uint8_t, 8,
 void bitzdec64(uint64_t *in, unsigned n, uint64_t start) { BITZDEC(uint64_t, 64,in, n); }
 void bitzdec16(uint16_t *in, unsigned n, uint16_t start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vs = _mm_set1_epi16(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
 uint16_t *ip = in;
 for(; ip != in+(n&~(8-1)); ip += 8) {
@@ -692,7 +693,7 @@ void bitzdec16(uint16_t *in, unsigned n, uint16_t start) {
 }
 void bitzdec32(unsigned *in, unsigned n, unsigned start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vs = _mm_set1_epi32(start);
 unsigned *ip = in;
 for(; ip != in+(n&~(8-1)); ip += 8) {
@@ -729,7 +730,7 @@ uint64_t bitx64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64
 uint16_t bitx16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint16_t o = 0, *ip = in;
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vo0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(),
 vs = _mm_set1_epi16(start);
@@ -752,7 +753,7 @@ uint16_t bitx16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint32_t bitx32(unsigned *in, unsigned n, uint32_t *px, uint32_t start) {
 uint32_t o = 0, *ip = in;
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vo0 = _mm_setzero_si128(),
 vo1 = _mm_setzero_si128(),
 vs = _mm_set1_epi32(start);
@@ -787,7 +788,7 @@ void bitxdec8( uint8_t *in, unsigned n, uint8_t start) { BITXDEC(uint8_t, in,
 void bitxdec64(uint64_t *in, unsigned n, uint64_t start) { BITXDEC(uint64_t, in, n); }
 void bitxdec16(uint16_t *in, unsigned n, uint16_t start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vs = _mm_set1_epi16(start);
 uint16_t *ip = in;
 for(; ip != in+(n&~(8-1)); ip += 8) {
@@ -806,7 +807,7 @@ void bitxdec16(uint16_t *in, unsigned n, uint16_t start) {
 }
 void bitxdec32(unsigned *in, unsigned n, unsigned start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 __m128i vs = _mm_set1_epi32(start);
 unsigned *ip = in;
 for(; ip != in+(n&~(8-1)); ip += 8) {
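Every bitutil.c guard above enables the SSE-style bodies of the range scans and delta/zigzag/xor transforms on LoongArch64. For reference, the scalar zigzag transform that the vectorized bitz*/bitzenc32/bitzdec32 paths implement, written out as a sketch of the zigzagenc32/zigzagdec32 semantics:

    #include <stdint.h>

    /* zigzag maps signed deltas to small unsigned codes: 0,-1,1,-2,2 -> 0,1,2,3,4 */
    static inline uint32_t zigzag_enc32(int32_t x)  { return ((uint32_t)x << 1) ^ (uint32_t)(x >> 31); }
    static inline int32_t  zigzag_dec32(uint32_t u) { return (int32_t)(u >> 1) ^ -(int32_t)(u & 1); }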


@@ -27,7 +27,11 @@
 #include <string.h>
 #include <stdlib.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
 #include <sys/malloc.h>
+#else
+#include <malloc/malloc.h>
+#endif
 #else
 #include <malloc.h>
 #endif
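The same include guard recurs in several files below; it keeps the code building on macOS, where <malloc.h> does not exist. Whether the build actually defines HAVE_MALLOC_MALLOC is an assumption here; annotated, the selection reads:

    #ifdef __APPLE__
    #  ifdef HAVE_MALLOC_MALLOC      /* assumed to come from the build system */
    #    include <sys/malloc.h>      /* kernel-style header on older layouts */
    #  else
    #    include <malloc/malloc.h>   /* usual macOS home of malloc_size() etc. */
    #  endif
    #else
    #  include <malloc.h>            /* glibc and friends */
    #endif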


@@ -174,7 +174,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start)
 #if defined(__AVX2__) && USIZE >= 32
 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
 __m256i sv = T2(_mm256_set1_epi, USIZE)(start);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 #define _mm_set1_epi64(a) _mm_set1_epi64x(a)
 __m128i sv = T2(_mm_set1_epi, USIZE)(start);
 #endif
@@ -191,7 +191,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start)
 }
 start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
 b = T2(mm256_hor_epi, USIZE)(bv);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 __m128i bv = _mm_setzero_si128();
 for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),ip+=32/(USIZE/8)) {
 __m128i v0 = _mm_loadu_si128((__m128i *) ip);
@@ -217,7 +217,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start)
 _mm256_storeu_si256((__m256i *) p, v0);
 _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1);
 }
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) {
 __m128i v0 = _mm_loadu_si128((__m128i *) p);
 __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
@@ -255,7 +255,7 @@ size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start)
 #if defined(__AVX2__) && USIZE >= 32
 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
 __m256i sv = T2(_mm256_set1_epi, USIZE)(start);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 #define _mm_set1_epi64(a) _mm_set1_epi64x(a)
 __m128i sv = T2(_mm_set1_epi, USIZE)(start);
 #endif
@@ -277,7 +277,7 @@ size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start)
 _mm256_storeu_si256((__m256i *)(op+32/(USIZE/8)), sv);
 }
 start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),op+=32/(USIZE/8)) {
 __m128i v0 = _mm_loadu_si128((__m128i *)p);
 __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
@@ -318,7 +318,7 @@ size_t T2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start
 #if defined(__AVX2__) && USIZE >= 32
 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
 __m256i sv = T2(_mm256_set1_epi, USIZE)(start);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 #define _mm_set1_epi64(a) _mm_set1_epi64x(a)
 __m128i sv = T2(_mm_set1_epi, USIZE)(start);
 #endif
@@ -339,7 +339,7 @@ size_t T2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start
 _mm256_storeu_si256((__m256i *) p, v0);
 _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1);
 }
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) {
 __m128i v0 = _mm_loadu_si128((__m128i *) p);
 __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
@@ -823,7 +823,7 @@ size_t T2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start)
 return 1+n*sizeof(out[0]);
 }
 BITGET32(bw,br,3,b); bitget32(bw,br,(b+1)<<3,r,ip); bitdnorm(bw,br,ip);//RLE //r+=NL; while(r--) *op++=(start+=pd);
-#if (defined(__SSE2__) /*|| defined(__ARM_NEON)*/) && USIZE == 32
+#if (defined(__SSE2__) /*|| defined(__ARM_NEON) || defined(__loongarch_lp64)*/) && USIZE == 32
 __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4*pd,3*pd,2*pd,1*pd);
 for(r += NL, _op = op; op != _op+(r&~7);) {
 sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; //_mm_shuffle_epi32(sv, _MM_SHUFFLE(3, 3, 3, 3))->mm_shuffle_nnnn_epi32(sv, 3)
@@ -926,7 +926,7 @@ size_t T2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start)
 return 1+n*sizeof(out[0]);
 }
 BITGET32(bw,br,3,b); bitget32(bw,br,(b+1)<<3,r,ip); bitdnorm(bw,br,ip);//RLE //r+=NL; while(r--) *op++=(start+=pd);
-#if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32
+#if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
 __m128i sv = _mm_set1_epi32(start);
 for(r += NL, _op = op; op != _op+(r&~7);) {
 _mm_storeu_si128((__m128i *)op, sv); op += 4;
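These fp.c hunks extend the same guard to the floating-point codecs: fpxenc/fpxdec (XOR predictor), fpfcmenc (FCM predictor) and the bvz* decoders. A round-trip sketch for the 32-bit XOR codec; the signatures follow the T2(fpxenc,USIZE) expansions visible in the hunks (fpxenc32/fpxdec32 for USIZE == 32), while the output-buffer sizing is an assumption:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    size_t fpxenc32(uint32_t *in, size_t n, unsigned char *out, uint32_t start);
    size_t fpxdec32(unsigned char *in, size_t n, uint32_t *out, uint32_t start);

    int fpx32_roundtrip(uint32_t *in, size_t n) {
      unsigned char *buf = malloc(n * 4 + 1024);   /* worst case plus slack (assumed) */
      uint32_t *dec = malloc(n * 4);
      size_t clen = fpxenc32(in, n, buf, 0);       /* compress */
      fpxdec32(buf, n, dec, 0);                    /* decompress */
      int ok = clen > 0 && memcmp(in, dec, n * 4) == 0;
      free(buf); free(dec);
      return ok;
    }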


@@ -31,7 +31,11 @@
 #include <limits.h>
 #include <time.h> /* time_t, struct tm, time, mktime */
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
 #include <sys/malloc.h>
+#else
+#include <malloc/malloc.h>
+#endif
 #else
 #include <malloc.h>
 #endif
@@ -1029,7 +1033,7 @@ unsigned char *bestr(unsigned id, unsigned b, unsigned char *s, char *prms, int
 "%3d:bitnfpack256v%d TurboPack256 FOR ",
 "%3d:bitns1pack128v32 TurboPack256 delt4_1",
 "%3d:bitnxpack128v%d TurboPack128 xor ",
-"%3d:bitnxpack256v%d TurboPack128 xor ",
+"%3d:bitnxpack256v%d TurboPack256 xor ",
 "%3d:vsenc%d TurboVSimple ",
 "%3d:vszenc%d TurboVSimple zigzag ",


@@ -31,11 +31,15 @@
 #include <limits.h>
 #include <getopt.h>
 #include <sys/stat.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
 #include <sys/malloc.h>
 #else
+#include <malloc/malloc.h>
+#endif
+#else
 #include <malloc.h>
 #endif
 #include "include_/conf.h"
 #include "include_/vint.h"


@@ -33,11 +33,15 @@
 #include <unistd.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
 #include <sys/malloc.h>
 #else
+#include <malloc/malloc.h>
+#endif
+#else
 #include <malloc.h>
 #endif
 #ifdef _WIN32
 #include <windows.h>


@@ -27,11 +27,15 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
 #include <sys/malloc.h>
 #else
+#include <malloc/malloc.h>
+#endif
+#else
 #include <malloc.h>
 #endif
 #include <getopt.h>
 #include "include_/conf.h"


@@ -40,6 +40,8 @@
 #include <emmintrin.h>
 #elif defined(__ARM_NEON)
 #include <arm_neon.h>
+#elif defined(__loongarch_lp64)
+#include <lsxintrin.h>
 #endif
 #if defined(_MSC_VER) && _MSC_VER < 1600
 #include "vs/stdint.h"
@@ -112,7 +114,7 @@ static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
 }
 #endif
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 #define mm_srai_epi64_63(_v_, _s_) _mm_srai_epi32(_mm_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 1, 1)), 31)
 static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16( v,15)); }
@@ -230,7 +232,7 @@ static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_
 _sv = _mm256_add_epi32(_sv, _cv); _cv = _mm256_set1_epi32(4*_mindelta_); do { _mm256_storeu_si256(_ov++, _sv), _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\
 } while(0)
-#elif defined(__SSE2__) || defined(__ARM_NEON) // -------------
+#elif defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) // -------------
 // SIMD set value (memset)
 #define BITZERO32(_out_, _n_, _v_) do {\
 __m128i _sv_ = _mm_set1_epi32(_v_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
@@ -322,7 +324,7 @@ static ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
 }
 #endif
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 static ALWAYS_INLINE __m128i mm_rbit_epi16(__m128i v) { return mm_rbit_epi8(mm_rev_epi16(v)); }
 static ALWAYS_INLINE __m128i mm_rbit_epi32(__m128i v) { return mm_rbit_epi8(mm_rev_epi32(v)); }
 static ALWAYS_INLINE __m128i mm_rbit_epi64(__m128i v) { return mm_rbit_epi8(mm_rev_epi64(v)); }
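With <lsxintrin.h> pulled in, the SSE-style code paths in this header compile on LoongArch64 as well. The include dispatch, reduced to its shape (comments are annotations, not in the patch):

    #if defined(__SSE2__)
    #include <emmintrin.h>   /* x86: _mm_* are the native intrinsics */
    #elif defined(__ARM_NEON)
    #include <arm_neon.h>    /* ARM: _mm_* supplied by NEON wrapper definitions */
    #elif defined(__loongarch_lp64)
    #include <lsxintrin.h>   /* LoongArch: _mm_* mapped onto __lsx_* (large block below) */
    #endif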

lib/include_/conf.h Executable file → Normal file

@@ -112,7 +112,7 @@ static ALWAYS_INLINE unsigned ror64(unsigned x, int s) { return x >> s | x << (6
 #define ALIGNED(t,v,n) __declspec(align(n)) t v
 #define ALWAYS_INLINE __forceinline
 #define NOINLINE __declspec(noinline)
-#define _PACKED __attribute__ ((packed))
+#define _PACKED //__attribute__ ((packed))
 #define THREADLOCAL __declspec(thread)
 #define likely(x) (x)
 #define unlikely(x) (x)
@@ -203,7 +203,8 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
 defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\
 defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \
 defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \
-defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
+defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
+defined(__loongarch_lp64)
 #define ctou16(_cp_) (*(unsigned short *)(_cp_))
 #define ctou32(_cp_) (*(unsigned *)(_cp_))
 #define ctof16(_cp_) (*(_Float16 *)(_cp_))
@@ -225,7 +226,7 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
 #define ltou64(_px_, _cp_) *(_px_) = *(uint64_t *)(_cp_)
-#elif defined(__ARM_FEATURE_UNALIGNED)
+#elif defined(__ARM_FEATURE_UNALIGNED) || defined(__loongarch_lp64)
 struct _PACKED longu { uint64_t l; };
 struct _PACKED doubleu { double d; };
 #define ctou64(_cp_) ((struct longu *)(_cp_))->l
@@ -276,7 +277,7 @@ struct _PACKED doubleu { double d; };
 defined(__aarch64__) ||\
 defined(__mips64) ||\
 defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\
-defined(__s390x__)
+defined(__s390x__) || defined(__loongarch_lp64)
 #define __WORDSIZE 64
 #else
 #define __WORDSIZE 32
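One change in this file is independent of LoongArch: under MSVC, _PACKED used to expand to __attribute__ ((packed)), GCC/Clang syntax that MSVC rejects, so it is now defined away. A sketch of what an MSVC-aware definition could look like; the pragma variant is a suggestion, not what the patch does:

    #ifdef _MSC_VER
    #define _PACKED   /* patch behavior: no-op, structs keep natural alignment */
    /* alternative: wrap the struct definitions in #pragma pack(push,1) ... #pragma pack(pop) */
    #else
    #define _PACKED __attribute__((packed))
    #endif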


@@ -302,6 +302,324 @@ static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) { uint
 static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))); }
 #endif
#elif defined(__loongarch_lp64)
#include <lsxintrin.h>
// sse instruct to loongarch lsx instruct mapping
#ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ----------------------------------------------------------
#define _mm_set_epi8(u15,u14,u13,u12,\
u11,u10, u9, u8,\
u7,u6,u5,u4,\
u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (v4u32)__lsx_vld( _u, 0);})
#define _mm_set_epi16( u7,u6,u5,u4,\
u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (v4u32)__lsx_vld( _u, 0);})
#define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; __lsx_vld(_u, 0);})
#define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (v4u32)__lsx_vld(_u);})
#else
static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8,
uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4,
uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) {
uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (__m128i)__lsx_vld(u, 0); }
static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4,
uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (__m128i)__lsx_vld(u, 0); }
static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return __lsx_vld(u, 0); }
static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (__m128i)__lsx_vld(u, 0); }
#endif
#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16( u0,u1,u2,u3,u4,u5,u6,u7)
#define _mm_setr_epi32(u3,u2,u1,u0) _mm_set_epi32( u0,u1,u2,u3)
#define _mm_setr_epi64x(u1,u0) _mm_set_epi64x(u0,u0)
#define _mm_set1_epi8( _u8_ ) (__m128i)__lsx_vreplgr2vr_b(_u8_)
#define _mm_set1_epi16( _u16_) (__m128i)__lsx_vreplgr2vr_h(_u16_)
#define _mm_set1_epi32( _u32_) __lsx_vreplgr2vr_w(_u32_)
#define _mm_set1_epi64x(_u64_) (__m128i)__lsx_vreplgr2vr_d(_u64_)
#define _mm_setzero_si128() __lsx_vreplgr2vr_w( 0 )
#define _mm_cvtss_f32(_v_) __lsx_vpickve2gr_s((__m128)(_v_), 0)
#define _mm_setzero_ps() (__m128)__lsx_vldi(0)
#define _mm_set1_ps(_f32_) (__m128)__lsx_vreplfr2vr_s(_f32_)
//---------------------------------------------- Arithmetic -----------------------------------------------------------------------
#define _mm_add_epi8( _u_,_v_) (__m128i)__lsx_vadd_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_add_epi16( _u_,_v_) (__m128i)__lsx_vadd_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_add_epi32( _u_,_v_) __lsx_vadd_w( _u_, _v_ )
#define _mm_add_epi64( _u_,_v_) (__m128i)__lsx_vadd_d((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sub_epi8( _u_,_v_) (__m128i)__lsx_vsub_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sub_epi16( _u_,_v_) (__m128i)__lsx_vsub_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sub_epi32( _u_,_v_) (__m128i)__lsx_vsub_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sub_epi64( _u_,_v_) (__m128i)__lsx_vsub_d((__m128i)(_u_), (__m128i)(_v_))
#define _mm_subs_epu8( _u_,_v_) (__m128i)__lsx_vsub_bu((__m128i)(_u_), (__m128i)(_v_))
#define _mm_mullo_epi16(_u_, _v_) (__m128i)__lsx_vmulwev_h_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_mullo_epi32(_u_,_v_) (__m128i)__lsx_vmulwev_w_w((__m128i)(_u_), (__m128i)(_v_))
#define mm_mullo_epu32(_u_,_v_) (__m128i)__lsx_vmulwev_w_wu((__m128i)(_u_), (__m128i)(_v_))
//#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)__lsx_vmulwh_h_h((__m128i)(_u_), (__m128i)(_v_))
static ALWAYS_INLINE __m128i _mm_mulhi_epi16(__m128i u, __m128i v) {
__m128i evens = __lsx_vmulwev_h_w(u, v); // a[0]*b[0], a[2]*b[2], ...
__m128i odds = __lsx_vmulwod_h_w(u, v); // a[1]*b[1], a[3]*b[3], ...
// 2. shift right by 16 to keep the high 16 bits of each 32-bit product
evens = __lsx_vsrai_w(evens, 16);
odds = __lsx_vsrai_w(odds, 16);
// 3. repack into 16-bit results
__m128i res = __lsx_vpickev_h(odds, evens); // interleaved recombination
return res;
}
#define _mm_mul_epu32(_u_, _v_) (__m128i)__lsx_vmulwev_d_wu((__m128i)(_u_), (__m128i)(_v_))
#define _mm_adds_epu16(_u_, _v_) (__m128i)__lsx_vsadd_hu((__m128i)(_u_), (__m128i)(_v_))
static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) {
// 1. 16-bit signed multiplies widened to 32-bit intermediate results
__m128i mul_even = __lsx_vmulwev_w_h(u, v); // even lanes (0*0, 2*2, 4*4, 6*6)
__m128i mul_odd = __lsx_vmulwod_w_h(u, v); // odd lanes (1*1, 3*3, 5*5, 7*7)
// 2. horizontally add each adjacent pair of 32-bit products
__m128i sum = __lsx_vadd_w(mul_even, mul_odd); // [0*0+1*1, 2*2+3*3, 4*4+5*5, 6*6+7*7]
return sum;
}
//---------------------------------------------- Special math functions -----------------------------------------------------------
#define _mm_min_epu8(_u_, _v_) (__m128i)__lsx_vmin_bu((__m128i)(_u_), (__m128i)(_v_))
#define _mm_min_epu16(_u_, _v_) (__m128i)__lsx_vmin_hu((__m128i)(_u_), (__m128i)(_v_))
#define _mm_min_epi16(_u_, _v_) (__m128i)__lsx_vmin_h((__m128i)(_u_), (__m128i)(_v_))
//---------------------------------------------- Logical --------------------------------------------------------------------------
#define mm_testnz_epu32(_u_) (__lsx_vreplgr2vr_w(__lsx_vsrlri_w((__m128i)(_u_), 31)) != __lsx_vreplgr2vr_w(0))
#define mm_testnz_epu8(_u_) (__lsx_vreplgr2vr_b(__lsx_vsrlri_b((__m128i)(_u_), 7)) != __lsx_vreplgr2vr_b(0))
#define _mm_or_si128(_u_, _v_) (__m128i)__lsx_vor_v((__m128i)(_u_), (__m128i)(_v_))
#define _mm_and_si128(_u_, _v_) (__m128i)__lsx_vand_v((__m128i)(_u_), (__m128i)(_v_))
#define _mm_xor_si128(_u_, _v_) (__m128i)__lsx_vxor_v((__m128i)(_u_), (__m128i)(_v_))
//---------------------------------------------- Shift ----------------------------------------------------------------------------
#define mm_slli_epi8(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>7?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vslli_b((__m128i)(_u_), (_c_)))
#define mm_slli_epi16(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_h(0):(__m128i)__lsx_vslli_h((__m128i)(_u_), (_c_)))
#define mm_slli_epi32(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>31?(__m128i)__lsx_vreplgr2vr_w(0):(__m128i)__lsx_vslli_w((__m128i)(_u_), (_c_)))
#define mm_slli_epi64(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>63?(__m128i)__lsx_vreplgr2vr_d(0):(__m128i)__lsx_vslli_d((__m128i)(_u_), (_c_)))
#define _mm_slli_si128(_v_, _c_) (__m128i)((_c_)<1?(_v_):(_c_)>15?__lsx_vreplgr2vr_b(0):__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), (__m128i)(_v_), __lsx_vreplgr2vr_b((16 - (_c_)) + ((16 - (_c_)) << 8))))
#define mm_srli_epi8(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)> 7?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vsrlri_b((__m128i)(_u_), (_c_)))
#define mm_srli_epi16(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_h(0):(__m128i)__lsx_vsrlri_h((__m128i)(_u_), (_c_)))
#define mm_srli_epi32(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>31?(__m128i)__lsx_vreplgr2vr_w(0):(__m128i)__lsx_vsrlri_w((__m128i)(_u_), (_c_)))
#define mm_srli_epi64(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>63?(__m128i)__lsx_vreplgr2vr_d(0):(__m128i)__lsx_vsrlri_d((__m128i)(_u_), (_c_)))
#define _mm_srli_si128(_v_, _c_) (__m128i)((_c_)<1?(_v_):(_c_)>15?__lsx_vreplgr2vr_b(0): __lsx_vsrlr_b((__m128i)(_v_), __lsx_vreplgr2vr_b((_c_) * 8)) )
#define mm_srai_epi8(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_b((__m128i)(_v_), (_c_)) )
#define mm_srai_epi16(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_h((__m128i)(_v_), (_c_)) )
#define mm_srai_epi32(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_w((__m128i)(_v_), (_c_)) )
#define mm_srai_epi64(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_d((__m128i)(_v_), (_c_)) )
#define _mm_slli_epi8(_u_, _m_) (__m128i)__lsx_vsll_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
#define _mm_slli_epi16(_u_, _m_) (__m128i)__lsx_vsll_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
#define _mm_slli_epi32(_u_, _m_) (__m128i)__lsx_vsll_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
#define _mm_slli_epi64(_u_, _m_) (__m128i)__lsx_vsll_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))
#define _mm_srli_epi8( _u_, _m_) (__m128i)__lsx_vsrl_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
#define _mm_srli_epi16(_u_, _m_) (__m128i)__lsx_vsrl_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
#define _mm_srli_epi32(_u_, _m_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
#define _mm_srli_epi64(_u_, _m_) (__m128i)__lsx_vsrl_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))
#define _mm_srai_epi8( _u_, _m_) (__m128i)__lsx_vsra_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
#define _mm_srai_epi16(_u_, _m_) (__m128i)__lsx_vsra_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
#define _mm_srai_epi32(_u_, _m_) (__m128i)__lsx_vsra_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
#define _mm_srai_epi64(_u_, _m_) (__m128i)__lsx_vsra_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))
#define _mm_sll_epi8(_u_, _v_) (__m128i)__lsx_vsll_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sll_epi16(_u_, _v_) (__m128i)__lsx_vsll_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sll_epi32(_u_, _v_) (__m128i)__lsx_vsll_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sll_epi64(_u_, _v_) (__m128i)__lsx_vsll_d((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srl_epi8( _u_, _v_) (__m128i)__lsx_vsrl_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srl_epi16(_u_, _v_) (__m128i)__lsx_vsrl_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srl_epi32(_u_, _v_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srl_epi64(_u_, _v_) (__m128i)__lsx_vsrl_d((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sllv_epi32(_u_, _v_) (__m128i)__lsx_vsll_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srlv_epi32(_u_, _v_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), (__m128i)(_v_))
//---------------------------------------------- Compare --------- true/false->1/0 (all bits set) ---------------------------------
#define _mm_cmpeq_epi8( _u_, _v_) (__m128i)__lsx_vseq_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_cmpeq_epi16(_u_, _v_) (__m128i)__lsx_vseq_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_cmpeq_epi32(_u_, _v_) (__m128i)__lsx_vseq_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_cmpgt_epi8( _u_, _v_) (__m128i)__lsx_vslt_b((__m128i)(_v_), (__m128i)(_u_)) // note the swapped operand order
#define _mm_cmpgt_epi16(_u_, _v_) (__m128i)__lsx_vslt_h((__m128i)(_v_), (__m128i)(_u_))
#define _mm_cmpgt_epi32(_u_, _v_) (__m128i)__lsx_vslt_w((__m128i)(_v_), (__m128i)(_u_))
#define _mm_cmpgt_epu16(_u_, _v_) (__m128i)__lsx_vslt_hu((__m128i)(_v_), (__m128i)(_u_))
#define mm_cmpgt_epu32(_u_, _v_) (__m128i)__lsx_vslt_wu((__m128i)(_v_), (__m128i)(_u_))
//---------------------------------------------- Load -----------------------------------------------------------------------------
#define _mm_loadl_epi64(_u64p_) (__m128i)__lsx_vldrepl_d(_u64p_, 0) // load and broadcast into the low 64 bits
#define mm_loadu_epi64p(_u64p_, _u_) (__m128i)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_u64p_), 0)
#define _mm_loadu_si128(_ip_) (__m128i)__lsx_vldx((const __m128i*)(_ip_), 0)
#define _mm_load_si128(_ip_) (__m128i)__lsx_vld((const __m128i*)(_ip_), 0)
#define _mm_load_ps(_ip_) (__m128)__lsx_vld((const float*)(_ip_), 0)
#define _mm_loadu_ps(_ip_) (__m128)__lsx_vldx((const float*)(_ip_), 0)
#define _mm_load1_ps(_ip_) (__m128)__lsx_vreplfr2vr_s(*(const float*)(_ip_))
#define _mm_loadl_pi(_u_, _ip_) (__m128)__lsx_vpickve2gr_w((__m128)(_u_), 1); (__m128)__lsx_vinsgr2vr_w((__m128)(_u_), *(const float*)(_ip_), 0)
#define _mm_loadh_pi(_u_, _ip_) (__m128)__lsx_vpickve2gr_w((__m128)(_u_), 0); (__m128)__lsx_vinsgr2vr_w((__m128)(_u_), *(const float*)(_ip_), 1)
//---------------------------------------------- Store ----------------------------------------------------------------------------
#define _mm_storel_epi64(_ip_, _u_) __lsx_vstelm_d((__m128i)(_u_), (uint64_t*)(_ip_), 0, 0)
#define _mm_storeu_si128(_ip_, _u_) __lsx_vstx((__m128i)(_u_), (__m128i*)(_ip_), 0)
#define _mm_store_ps(_ip_, _u_) __lsx_vst((float32x4_t)(_u_), (float*)(_ip_), 0)
#define _mm_storeu_ps(_ip_, _u_) __lsx_vstx((float32x4_t)(_u_), (float*)(_ip_), 0)
#define _mm_store_ss(_ip_, _u_) __lsx_vstelm_w((__m128)(_u_), (float*)(_ip_), 0, 0)
//---------------------------------------------- Convert --------------------------------------------------------------------------
#define mm_cvtsi64_si128p(_u64p_, _u_) (__m128i)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_u64p_), 0)
#define _mm_cvtsi64_si128(_u_) (__m128i)__lsx_vreplgr2vr_d(_u_)
//---------------------------------------------- Reverse bits/bytes ---------------------------------------------------------------
#define mm_rbit_epi8(_v_) (__m128i)__lsx_vbitrev_b((__m128i)(_v_), (__m128i)(_v_))
#define mm_rev_epi16(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i){0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E})
#define mm_rev_epi32(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i){0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C})
#define mm_rev_epi64(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i){0x0706050403020100, 0x0F0E0D0C0B0A0908})
//--------------------------------------------- Insert/extract --------------------------------------------------------------------
#define mm_extract_epi32x(_u_, _u32_, _id_) (*(uint32_t*)&(_u32_) = __lsx_vpickve2gr_wu((__m128i)(_u_), (_id_))
#define _mm_extract_epi64x(_u_, _u64_, _id_) (*(uint64_t*)&(_u64_) = __lsx_vpickve2gr_du((__m128i)(_u_), (_id_))
#define _mm_extract_epi8(_u_, _id_) __lsx_vpickve2gr_b((__m128i)(_u_), (_id_))
#define _mm_extract_epi16(_u_, _id_) __lsx_vpickve2gr_h((__m128i)(_u_), (_id_))
#define _mm_extract_epi32(_u_, _id_) __lsx_vpickve2gr_w((__m128i)(_u_), (_id_))
#define mm_extract_epu32(_u_, _id_) __lsx_vpickve2gr_wu((__m128i)(_u_), (_id_))
#define _mm_cvtsi128_si32(_u_) __lsx_vpickve2gr_w((__m128i)(_u_), 0)
#define _mm_cvtsi128_si64(_u_) __lsx_vpickve2gr_d((__m128i)(_u_), 0)
#define _mm_insert_epu32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const uint32_t*)(_u32p_), (_id_))
#define mm_insert_epi32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const int32_t*)(_u32p_), (_id_))
#define _mm_cvtsi32_si128(_x_) (__m128i)__lsx_vinsgr2vr_w(__lsx_vldi(0), (_x_), 0)
#define _mm_blendv_epi8(_u_, _v_, _m_) (__m128i)__lsx_vbitsel_v((__m128i)(_u_), (__m128i)(_v_), (__m128i)(_m_))
//---------------------------------------------- Miscellaneous --------------------------------------------------------------------
#define _mm_alignr_epi8(_u_, _v_, _m_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_u_), (__m128i){_m_,_m_+1,_m_+2,_m_+3,_m_+4,_m_+5,_m_+6,_m_+7, _m_+8,_m_+9,_m_+10,_m_+11,_m_+12,_m_+13,_m_+14,_m_+15})
#define _mm_packs_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0))
#define _mm_packs_epi32(_u_, _v_) (__m128i)__lsx_vpickev_h(__lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0))
#define _mm_packs_epu16(_u_, _v_) (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_))
#define _mm_packus_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0))
/* static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { */
/* const __m128i zero = __lsx_vldi(0); */
/* const __m128i mask = __lsx_vldi(0x0102040810204080); */
/* __m128i signs = __lsx_vsrli_b(v, 7); // extract the sign bits into bit 0 */
/* __m128i masked = __lsx_vand_v(signs, mask); // apply the bit weights */
/* __m128i sum = __lsx_vhaddw_wu_hu(__lsx_vhaddw_hu_bu(masked, zero), zero); */
/* return __lsx_vpickve2gr_hu(sum, 0) & 0xFFFF; */
/* } */
static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
// step 1: extract each byte's most significant (sign) bit
__m128i signs = __lsx_vsrli_b(v, 7); // shift every byte right by 7; the sign bit lands in bit 0
// step 2: build the bit-weight mask (LSB-first: 0x01, 0x02, 0x04, ...)
const __m128i mask = __lsx_vld((void*)(uint64_t[]){0x0102040810204080}, 0);
// step 3: apply the bit weights
__m128i masked = __lsx_vand_v(signs, mask);
// step 4: horizontal adds (8-bit -> 16-bit -> 32-bit)
__m128i sum16 = __lsx_vhaddw_hu_bu(masked, __lsx_vldi(0));
__m128i sum32 = __lsx_vhaddw_wu_hu(sum16, __lsx_vldi(0));
// step 5: extract the low 16-bit result
return __lsx_vpickve2gr_hu(sum32, 0) & 0xFFFF;
}
//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff)
static ALWAYS_INLINE uint8_t mm_movemask_epi8s(__m128i sv) {
const __m128i mask = __lsx_vldi(0x0102040810204080);
__m128i tmp = __lsx_vand_v(sv, mask);
tmp = __lsx_vhaddw_hu_bu(tmp, __lsx_vldi(0));
tmp = __lsx_vhaddw_wu_hu(tmp, __lsx_vldi(0));
return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, __lsx_vldi(0)), 0);
}
static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) {
const __m128i mask = __lsx_vldi(0x0102040810204080);
__m128i tmp = __lsx_vand_v(v, mask);
tmp = __lsx_vhaddw_wu_hu(tmp, __lsx_vldi(0));
return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, __lsx_vldi(0)), 0);
}
static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) {
// 1. load the bit-weight constants (0x00000001, 0x00000002, 0x00000004, 0x00000008)
const __m128i mask = __lsx_vldi(0x0000000100000002);
__lsx_vinsgr2vr_d(mask, 0x0000000400000008, 1); // set the weights for the high 64 bits
// 2. apply the bit weights
__m128i masked = __lsx_vand_v(v, mask);
// 3. horizontal add
__m128i sum2 = __lsx_vhaddw_du_wu(masked, __lsx_vldi(0)); // 4x32 -> 2x64
__m128i sum1 = __lsx_vhaddw_qu_du(sum2, __lsx_vldi(0)); // 2x64 -> 1x128
// 4. extract the result
return (uint32_t)__lsx_vpickve2gr_d(sum1, 0);
}
static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) {
// 1. bit-weight constants (0x0000000000000001, 0x0000000000000002)
const __m128i mask = {1ULL, 2ULL};
// 2. apply the weights and extract the result directly
__m128i masked = __lsx_vand_v(v, mask);
return __lsx_vpickve2gr_d(masked, 0) | __lsx_vpickve2gr_d(masked, 1);
}
// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_))
#define _mm_shuffle_epi8(_u_, _v_) (__m128i)__lsx_vshuf_b((__m128i)(_u_), (__m128i)(_u_), (__m128i)(_v_))
#define mm_shuffle_nnnn_epi32(_v_, _m_) (__m128i)__lsx_vreplvei_w((__m128i)(_v_), (_m_))
#ifdef USE_MACROS
#define mm_shuffle_2031_epi32(_u_) ({__m128i _rev = __lsx_vshuf4i_w((__m128i)(_u_), 0x1B); __lsx_vshuf4i_w(_rev, 0xD8);})
#define mm_shuffle_3120_epi32(_u_) __lsx_vshuf4i_w((__m128i)(_u_), 0xD8)
#else
static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) {__m128i rev = __lsx_vshuf4i_w(v, 0x1B); return __lsx_vshuf4i_w(rev, 0xD8);}
static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0xD8);}
#endif
#if defined(USE_MACROS) || defined(__clang__)
#define _mm_shuffle_epi32(_u_, _m_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), (_m_))
#define _mm_shuffle_epi32s(_u_, _m_) ({ __m128i _idx = __lsx_vinsgr2vr_w(__lsx_vldi(0), (_m_)&3, 0);\
        _idx = __lsx_vinsgr2vr_w(_idx, ((_m_)>>2)&3, 1); _idx = __lsx_vinsgr2vr_w(_idx, ((_m_)>>4)&3, 2);\
        _idx = __lsx_vinsgr2vr_w(_idx, ((_m_)>>6)&3, 3); (__m128i)__lsx_vshuf_w(_idx, (__m128i)(_u_), (__m128i)(_u_)); })
#else
static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _u_, const unsigned _m_) {return (__m128i)__lsx_vshuf4i_w((__m128i)_u_, _m_);}
static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _u_, const unsigned _m_) {
  // build the 32-bit index vector; vshuf.w takes the index vector as its first operand
  __m128i idx = __lsx_vinsgr2vr_w(__lsx_vldi(0),  _m_       & 0x3, 0);
          idx = __lsx_vinsgr2vr_w(idx,           (_m_ >> 2) & 0x3, 1);
          idx = __lsx_vinsgr2vr_w(idx,           (_m_ >> 4) & 0x3, 2);
          idx = __lsx_vinsgr2vr_w(idx,           (_m_ >> 6) & 0x3, 3);
  return (__m128i)__lsx_vshuf_w(idx, _u_, _u_);
}
#endif
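/* e.g. _MM_SHUFFLE(3,2,1,0) = 0xE4 is the identity permutation and
   _MM_SHUFFLE(0,1,2,3) = 0x1B reverses the four 32-bit lanes:
   _mm_shuffle_epi32(v, 0x1B) -> { v3, v2, v1, v0 } */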
#ifdef USE_MACROS
#define _mm_unpacklo_epi8(_u_,_v_) (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpacklo_epi16(_u_,_v_) (__m128i)__lsx_vilvl_h((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpacklo_epi32(_u_,_v_) (__m128i)__lsx_vilvl_w((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpacklo_epi64(_u_,_v_) (__m128i)__lsx_vilvl_d((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpackhi_epi8(_u_,_v_) (__m128i)__lsx_vilvh_b((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpackhi_epi16(_u_,_v_) (__m128i)__lsx_vilvh_h((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpackhi_epi32(_u_,_v_) (__m128i)__lsx_vilvh_w((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpackhi_epi64(_u_,_v_) (__m128i)__lsx_vilvh_d((__m128i)(_v_), (__m128i)(_u_))
#else
static ALWAYS_INLINE __m128i _mm_unpacklo_epi8(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_b((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_h((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_w((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_d((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpackhi_epi8(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_b((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_h((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_w((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_d((__m128i)_v_, (__m128i)_u_);}
#endif
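/* the vilvl/vilvh operand swap reproduces the SSE2 interleave order, e.g.
   _mm_unpacklo_epi8(u, v) -> { u0,v0,u1,v1, ..., u7,v7 } and
   _mm_unpackhi_epi8(u, v) -> { u8,v8, ..., u15,v15 } */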
#else //----------------- intel SSE2/SSSE3 (wrapper functions compatible with intel/arm; permits one source code version for arm+intel) --------------
#define mm_movemask_epu32(_v_) _mm_movemask_ps(_mm_castsi128_ps(_v_))
#define mm_movemask_epu16(_v_) _mm_movemask_epi8(_v_)

View File

@ -45,6 +45,8 @@
#include <emmintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#include "include_/sse_neon.h"
#elif defined(__loongarch_lp64)
#include "include_/sse_neon.h"
#endif
@ -728,7 +730,7 @@ void T2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
#endif //TPDEC256V
#else //__AVX2__
#if (defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (ESIZE == 2 || ESIZE == 4 || ESIZE == 8)
#define ST(_p_,_v_,_i_) _mm_storeu_si128((__m128i *)SIE(_p_,_i_), _v_)
#define ST0(_p_,_v_) _mm_storeu_si128((__m128i *)(_p_), _v_)
@ -737,7 +739,7 @@ void T2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
unsigned stride = v/STRIDE;
unsigned char *op,*ip;
#if defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
#if ESIZE == 2
__m128i sf = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                          14, 12, 10, 8, 6, 4, 2, 0);
@ -764,7 +766,7 @@ void T2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
for(ip = in, op = out; ip != in+v; ip+=ESIZE*16,op += ESIZE*16/STRIDE) { unsigned char *p = op; PREFETCH(ip+(ESIZE*16)*ESIZE,0);
__m128i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
#if ESIZE == 2
#ifdef __ARM_NEON
uint8x16x2_t w = vld2q_u8(ip);

View File

@ -26,11 +26,15 @@
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#ifdef HAVE_MALLOC_MALLOC
#include <sys/malloc.h>
#else
#include <malloc/malloc.h>
#endif
#else
#include <malloc.h>
#endif
#ifdef _MSC_VER
#include "vs/getopt.h"
#else

View File

@ -43,6 +43,8 @@
#include <emmintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#include "include_/sse_neon.h"
#elif defined(__loongarch_lp64)
#include "include_/sse_neon.h"
#endif
#include "include_/conf.h"
@ -74,7 +76,7 @@ unsigned _srled8(const unsigned char *__restrict in, unsigned char *__restrict o
#endif
if(outlen >= SRLE8)
while(op < out+(outlen-SRLE8)) {
#if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON) || defined(__loongarch_lp64)
uint32_t mask;
#ifdef __AVX2__
__m256i v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, ev)); if(mask) goto a; op += 32; ip += 32;
@ -127,10 +129,12 @@ static inline unsigned _srled8x(const unsigned char *__restrict in, unsigned cha
__m128i ev = _mm_set1_epi8(e);
#elif defined(__ARM_NEON)
uint8x8_t ev = vdup_n_u8(e);
#elif defined(__loongarch_lp64)
__m128i ev = __lsx_vreplgr2vr_b(e);
#endif
if(outlen >= SRLE8)
while(op < out+(outlen-SRLE8)) {
#if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON) || defined(__loongarch_lp64)
uint32_t mask;
#ifdef __AVX2__
__m256i v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, ev)); if(mask) goto a; op += 32; ip += 32;
@ -326,7 +330,7 @@ unsigned T2(_srled, USIZE)(const unsigned char *__restrict in, unsigned char *__
#ifdef __AVX2__
#define _mm256_set1_epi64 _mm256_set1_epi64x
__m256i ev = T2(_mm256_set1_epi, USIZE)(e);
#elif (defined(__SSE__) /*|| defined(__ARM_NEON) || defined(__loongarch_lp64)*/)
// #if USIZE != 64
#define _mm_set1_epi64 _mm_set1_epi64x
__m128i ev = T2(_mm_set1_epi, USIZE)(e);

View File

@ -1007,7 +1007,7 @@ unsigned char *T2(V8ENC,32)(uint32_t *__restrict in, unsigned n, unsigned char *
_mm_storeu_si128((__m128i *)op, _mm256_castsi256_si128( ov3)); op += LEN32(m1,2);
_mm_storeu_si128((__m128i *)op, _mm256_extracti128_si256(ov3,1)); op += LEN32(m1,3);
}
#elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) // https://gist.github.com/aqrit/746d2f5e4ad1909230e2283272333dc1
VEINI128v32; const __m128i cv1_8 = _mm_set1_epi8(1), cv7f00 = _mm_set1_epi16(0x7f00);
for(ip = in; ip != in+(n&~(32-1)); ip += 32, PNEXT(out,op,8) ) {
__m128i iv0 = _mm_loadu_si128(ip ),
@ -1149,7 +1149,7 @@ unsigned char *T2(V8DEC,32)(unsigned char *__restrict in, unsigned n, uint32_t
}
}
}
#elif defined(__ARM_NEON) || defined(__loongarch_lp64) || defined(__SSSE3__) // optimized for ARM ----------------------------------------------------------
VDINI128v32;
for(; op != out+(n&~(32-1)); op += 32) { //PREFETCH(ip+384,0);
uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4);
@ -1223,7 +1223,7 @@ unsigned char *T2(V8ENC,16)(uint16_t *__restrict in, unsigned n, unsigned char *
uint16_t *ip,v;
unsigned char *op = DATABEG(out,n,16);
#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //--------------------------------
VEINI128v16; const __m128i cv1_8 = _mm_set1_epi8(1);
for(ip = in; ip != in+(n&~(64-1)); ip += 64, PNEXT(out,op,8)) { //PREFETCH(ip+512,0);
__m128i iv0 = _mm_loadu_si128(ip ),
@ -1291,7 +1291,7 @@ unsigned char *T2(V8DEC,16)(unsigned char *__restrict in, unsigned n, uint16_t
unsigned char *ip = DATABEG(in,n,16);
uint16_t v;
#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)//-----------------------
VDINI128v16;
for(op = out; op != out+(n&~(64-1)); op += 64) { PREFETCH(ip+512,0);
uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4);

View File

@ -328,7 +328,7 @@ unsigned char *T2(VBDDEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_
if(in[0] == 0xfe) {
in++;
#if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
#if VDELTA == 0
if(n) T2(BITZERO, USIZE)(out, n, start);
#else

View File

@ -210,7 +210,7 @@ size_t p4nsdec64(unsigned char *in, size_t n, uint64_t *out) { uint64_t *op,sta
#undef _P4ENC
#undef P4ENC
#undef BITPACK
#elif defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //--------------------------------------------------
#define BITDELTA bitdienc
#define HYBRID 1
#define P4BITS _p4bits

View File

@ -40,7 +40,7 @@
#define P4DELTA(a)
#define P4DELTA_(a)
#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
extern char _shuffle_32[16][16]; // defined in bitunpack.c
extern char _shuffle_16[256][16];
#endif
@ -94,7 +94,7 @@ extern char _shuffle_16[256][16];
#undef BITUNDD
// #elif !defined(__SSE3__)
#elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
#define _P4DEC _p4dec
#define P4DEC p4dec
#define P4NDEC p4ndec
@ -175,7 +175,7 @@ extern char _shuffle_16[256][16];
#undef USIZE
#undef DELTA
// #elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
#define VSIZE 128
#define P4DELTA(a)
#define P4DELTA_(a)
@ -305,7 +305,7 @@ ALWAYS_INLINE unsigned char *T2(_P4DEC, USIZE)(unsigned char *__restrict in, uns
} //out += 64;
}
}
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
{ uint_t *_op = out,*op,*pex = ex;
for(i = 0; i < p4dn; i++) {
for(op=_op; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf;
@ -313,7 +313,7 @@ ALWAYS_INLINE unsigned char *T2(_P4DEC, USIZE)(unsigned char *__restrict in, uns
} _op+=64;
}
}
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 16
{ uint_t *_op = out, *op, *pex = ex;
for(i = 0; i < p4dn; i++) {
for(op = _op; bb[i]; bb[i] >>= 8,op += 8) { const unsigned char m = bb[i];

View File

@ -27,6 +27,8 @@
#include <emmintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#include "include_/sse_neon.h"
#elif defined(__loongarch_lp64)
#include "include_/sse_neon.h"
#endif
#include <stdio.h>
@ -343,7 +345,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t *
uint_t *q = op;
unsigned r = (w>>4)&0xf;
if(!r) { memcpy(op,ip+1, n*(USIZE/8)); return ip+n*(USIZE/8); }
#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
__m128i zv = _mm_setzero_si128();
#endif
ip++;
@ -355,7 +357,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t *
r -= 1;
op += r+1;
while(q < op) {
#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
_mm_storeu_si128((__m128i *)q,zv); q = (uint_t *)((unsigned char *)q+16);
_mm_storeu_si128((__m128i *)q,zv); q = (uint_t *)((unsigned char *)q+16);
#else
@ -475,7 +477,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t *
else { vbxget32(ip, r); }
}
op += r+1; T2(vbxget, USIZE)(ip,u);
#if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
{ __m128i v = _mm_set1_epi32(u);
while(q < op) {
_mm_storeu_si128((__m128i *)q,v); q += 4;

View File

@ -81,9 +81,13 @@ ifeq (,$(findstring clang, $(CC)))
OPT+=-falign-loops
endif
ifeq ($(ARCH),loongarch64)
CFLAGS+=-mlsx
CXXFLAGS+=-mlsx
endif
CFLAGS+=$(DEBUG) $(OPT)
#CFLAGS+=-Wno-macro-redefined -Wno-incompatible-pointer-types -Wno-tautological-constant-out-of-range-compare -Wno-discarded-qualifiers
CFLAGS+=-w -Wall -pedantic -Wno-macro-redefined -Wno-incompatible-pointer-types
CXXFLAGS+=-w
#-Wall -Wincompatible-pointer-types
ifeq ($(OS),$(filter $(OS),Linux GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT Haiku))