diff --git a/include/ic_.h b/include/ic_.h index 2f2fe89..49b92d0 100644 --- a/include/ic_.h +++ b/include/ic_.h @@ -187,7 +187,8 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\ defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \ defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \ - defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) + defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__loongarch_lp64) #define ctou16(_cp_) (*(unsigned short *)(_cp_)) #define ctou32(_cp_) (*(unsigned *)(_cp_)) #define ctof32(_cp_) (*(float *)(_cp_)) @@ -207,7 +208,7 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void #define ltou64(_px_, _cp_) *(_px_) = *(uint64_t *)(_cp_) - #elif defined(__ARM_FEATURE_UNALIGNED) + #elif defined(__ARM_FEATURE_UNALIGNED) || defined(__loongarch_lp64) struct _PACKED longu { uint64_t l; }; struct _PACKED doubleu { double d; }; #define ctou64(_cp_) ((struct longu *)(_cp_))->l @@ -253,7 +254,7 @@ struct _PACKED doubleu { double d; }; defined(__aarch64__) ||\ defined(__mips64) ||\ defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\ - defined(__s390x__) + defined(__s390x__) || defined(__loongarch_lp64) #define __WORDSIZE 64 #else #define __WORDSIZE 32 @@ -659,4 +660,4 @@ static ALWAYS_INLINE void vbget32(unsigned char **_ip, unsigned *_x) { unsigned static ALWAYS_INLINE unsigned vlget32(unsigned char **_ip) { unsigned char *ip = *_ip; unsigned x; _vbget(ip, x, 32, VB_MAX, 4, 3, ;); *_ip = ip; return x; } static ALWAYS_INLINE unsigned vllen32(unsigned x) { return _vblen( x, 32, VB_MAX, 4, 3); } #endif - \ No newline at end of file + diff --git a/lib/bitpack.c b/lib/bitpack.c index d894b0d..0d2e883 100644 --- a/lib/bitpack.c +++ b/lib/bitpack.c @@ -385,7 +385,7 @@ size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__rest size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitf, bitfpack); } size_t bitnxpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitx256v, bitxpack256v, bitx, bitxpack); } - #elif defined(__SSE3__) || defined(__ARM_NEON) //----------------------------- SSE / AVX --------------------------------------------------------------- + #elif defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //----------------------------- SSE / AVX --------------------------------------------------------------- #define OPPE(__op) #define IPPE(__op) diff --git a/lib/bitunpack.c b/lib/bitunpack.c index 4506008..f0d0565 100644 --- a/lib/bitunpack.c +++ b/lib/bitunpack.c @@ -651,7 +651,7 @@ size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__re size_t bitnxunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitxunpack256v, bitxunpack); } size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitfunpack256v, 
bitfunpack); } - #elif defined(__SSE2__) || defined(__ARM_NEON) //------------------------------ SSE2/SSSE3 --------------------------------------------------------- + #elif defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //------------------------------ SSE2/SSSE3 --------------------------------------------------------- #define BITMAX16 16 #define BITMAX32 32 @@ -760,7 +760,7 @@ unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n, #undef VOZ16 #undef BITUNPACK0 - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) #define _ 0x80 ALIGNED(char, _shuffle_32[16][16],16) = { { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ }, @@ -1178,7 +1178,7 @@ unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n, return (unsigned char *)ip; } - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) #define BITMAX16 15 #define BITMAX32 31 @@ -1315,7 +1315,7 @@ unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n return (unsigned char *)ip; } - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) #define BITMAX16 15 #define BITMAX32 31 diff --git a/lib/bitutil.c b/lib/bitutil.c index 0721544..9f0e17a 100755 --- a/lib/bitutil.c +++ b/lib/bitutil.c @@ -228,7 +228,7 @@ uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px) { uint64_t o,x,u0,*ip; BI uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) { uint16_t o, x, u0 = in[0], *ip = in; - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi16(u0); @@ -254,7 +254,7 @@ uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) { uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px) { uint32_t o,x,u0 = in[0], *ip = in; - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi32(u0); @@ -290,7 +290,7 @@ uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint6 uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o, x, *ip = in, u0 = in[0] - start; - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start); @@ -321,7 +321,7 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { uint32_t o = 0, x=0, *ip = in, u0 = in[0] - start; - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vb0 = _mm_set1_epi32(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start); @@ -361,7 +361,7 @@ void bitddec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in, n void bitddec64(uint64_t *in, unsigned n, uint64_t start) { 
BITDD(uint64_t, in, n, 0); } void bitddec32(uint32_t *in, unsigned n, unsigned start) { - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vs = _mm_set1_epi32(start); unsigned *ip = in; for(; ip != in+(n&~(8-1)); ip += 8) { @@ -417,7 +417,7 @@ uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint6 uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { uint32_t o = 0, x=0, *ip = in, u0 = in[0]-start-1; - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vb0 = _mm_set1_epi32(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); @@ -446,7 +446,7 @@ uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { } uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) uint16_t *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi16(start), cv = _mm_set1_epi16(8); for(; ip != in+(n&~(8-1)); ip += 8) { __m128i iv = _mm_loadu_si128((__m128i *)ip); @@ -461,7 +461,7 @@ uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { } unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) { - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) unsigned *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(4); for(; ip != in+(n&~(4-1)); ip += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); @@ -480,7 +480,7 @@ void bitd1dec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in, void bitd1dec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 1); } void bitd1dec32(uint32_t *in, unsigned n, uint32_t start) { - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vs = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); unsigned *ip = in; for(; ip != in+(n&~(4-1)); ip += 4) { @@ -515,7 +515,7 @@ uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uin uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x=0,*op = out,u; BITDE(uint16_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; } uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x=0,*op = out,u; BITDE(uint64_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; } uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) { - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) unsigned *ip = in,b,*op = out; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(mindelta), dv; for(; ip != in+(n&~(4-1)); ip += 4,op += 4) { @@ -534,6 +534,7 @@ uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin } #else uint32_t b = 0,*op = out, x, *_ip; + //uint32_t b = 0, *op = out, x = 0, o = 0, u = 0, *_ip; BITDE(uint32_t, in, n, mindelta, b |= x; *op++ = x); #endif return b; @@ -569,7 +570,7 @@ uint16_t 
bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o, x, *ip = in; uint32_t u0 = zigzagenc16((int)in[0] - (int)start); - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start); for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0); @@ -599,7 +600,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) { uint32_t o, x, *ip=in, u0 = zigzagenc32((int)in[0] - (int)start); - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vb0 = _mm_set1_epi32(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start); @@ -631,7 +632,7 @@ uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x,u,*op = out; BITZENC(uint64_t, int64_t,64,in, n, o |= u; *op++ = u); return o; } uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) { - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) unsigned *ip = in,b,*op = out; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start); for(; ip != in+(n&~(8-1)); ip += 8,op += 8) { @@ -672,7 +673,7 @@ void bitzdec8( uint8_t *in, unsigned n, uint8_t start) { BITZDEC(uint8_t, 8, void bitzdec64(uint64_t *in, unsigned n, uint64_t start) { BITZDEC(uint64_t, 64,in, n); } void bitzdec16(uint16_t *in, unsigned n, uint16_t start) { - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vs = _mm_set1_epi16(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128(); uint16_t *ip = in; for(; ip != in+(n&~(8-1)); ip += 8) { @@ -692,7 +693,7 @@ void bitzdec16(uint16_t *in, unsigned n, uint16_t start) { } void bitzdec32(unsigned *in, unsigned n, unsigned start) { - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vs = _mm_set1_epi32(start); unsigned *ip = in; for(; ip != in+(n&~(8-1)); ip += 8) { @@ -729,7 +730,7 @@ uint64_t bitx64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64 uint16_t bitx16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint16_t o = 0, *ip = in; - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vo0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vs = _mm_set1_epi16(start); @@ -752,7 +753,7 @@ uint16_t bitx16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) { uint32_t bitx32(unsigned *in, unsigned n, uint32_t *px, uint32_t start) { uint32_t o = 0, *ip = in; - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vo0 = _mm_setzero_si128(), vo1 = _mm_setzero_si128(), vs = _mm_set1_epi32(start); @@ -787,7 +788,7 @@ void bitxdec8( uint8_t *in, unsigned n, uint8_t start) { BITXDEC(uint8_t, in, void bitxdec64(uint64_t *in, unsigned n, uint64_t start) { 
BITXDEC(uint64_t, in, n); } void bitxdec16(uint16_t *in, unsigned n, uint16_t start) { - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vs = _mm_set1_epi16(start); uint16_t *ip = in; for(; ip != in+(n&~(8-1)); ip += 8) { @@ -806,7 +807,7 @@ void bitxdec16(uint16_t *in, unsigned n, uint16_t start) { } void bitxdec32(unsigned *in, unsigned n, unsigned start) { - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i vs = _mm_set1_epi32(start); unsigned *ip = in; for(; ip != in+(n&~(8-1)); ip += 8) { diff --git a/lib/fp.c b/lib/fp.c index e742527..a1f4638 100644 --- a/lib/fp.c +++ b/lib/fp.c @@ -174,7 +174,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) #if defined(__AVX2__) && USIZE >= 32 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a) __m256i sv = T2(_mm256_set1_epi, USIZE)(start); - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32) #define _mm_set1_epi64(a) _mm_set1_epi64x(a) __m128i sv = T2(_mm_set1_epi, USIZE)(start); #endif @@ -191,7 +191,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) } start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1); b = T2(mm256_hor_epi, USIZE)(bv); - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32) __m128i bv = _mm_setzero_si128(); for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),ip+=32/(USIZE/8)) { __m128i v0 = _mm_loadu_si128((__m128i *) ip); @@ -217,7 +217,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start) _mm256_storeu_si256((__m256i *) p, v0); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1); } - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32) for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) { __m128i v0 = _mm_loadu_si128((__m128i *) p); __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8))); @@ -255,7 +255,7 @@ size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) #if defined(__AVX2__) && USIZE >= 32 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a) __m256i sv = T2(_mm256_set1_epi, USIZE)(start); - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32) #define _mm_set1_epi64(a) _mm_set1_epi64x(a) __m128i sv = T2(_mm_set1_epi, USIZE)(start); #endif @@ -277,7 +277,7 @@ size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) _mm256_storeu_si256((__m256i *)(op+32/(USIZE/8)), sv); } start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1); - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32) for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),op+=32/(USIZE/8)) { __m128i v0 = _mm_loadu_si128((__m128i *)p); __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8))); @@ -318,7 +318,7 @@ size_t T2(fpfcmenc,USIZE)(uint_t *in, size_t n, 
unsigned char *out, uint_t start #if defined(__AVX2__) && USIZE >= 32 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a) __m256i sv = T2(_mm256_set1_epi, USIZE)(start); - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32) #define _mm_set1_epi64(a) _mm_set1_epi64x(a) __m128i sv = T2(_mm_set1_epi, USIZE)(start); #endif @@ -339,7 +339,7 @@ size_t T2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start _mm256_storeu_si256((__m256i *) p, v0); _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1); } - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32) + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32) for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) { __m128i v0 = _mm_loadu_si128((__m128i *) p); __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8))); @@ -823,7 +823,7 @@ size_t T2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) return 1+n*sizeof(out[0]); } BITGET32(bw,br,3,b); bitget32(bw,br,(b+1)<<3,r,ip); bitdnorm(bw,br,ip);//RLE //r+=NL; while(r--) *op++=(start+=pd); - #if (defined(__SSE2__) /*|| defined(__ARM_NEON)*/) && USIZE == 32 + #if (defined(__SSE2__) /*|| defined(__ARM_NEON) || defined(__loongarch_lp64)*/) && USIZE == 32 __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4*pd,3*pd,2*pd,1*pd); for(r += NL, _op = op; op != _op+(r&~7);) { sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; //_mm_shuffle_epi32(sv, _MM_SHUFFLE(3, 3, 3, 3))->mm_shuffle_nnnn_epi32(sv, 3) @@ -926,7 +926,7 @@ size_t T2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start) return 1+n*sizeof(out[0]); } BITGET32(bw,br,3,b); bitget32(bw,br,(b+1)<<3,r,ip); bitdnorm(bw,br,ip);//RLE //r+=NL; while(r--) *op++=(start+=pd); - #if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32 + #if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32 __m128i sv = _mm_set1_epi32(start); for(r += NL, _op = op; op != _op+(r&~7);) { _mm_storeu_si128((__m128i *)op, sv); op += 4; diff --git a/lib/include_/bitutil_.h b/lib/include_/bitutil_.h index 6985043..254f131 100644 --- a/lib/include_/bitutil_.h +++ b/lib/include_/bitutil_.h @@ -40,6 +40,8 @@ #include #elif defined(__ARM_NEON) #include + #elif defined(__loongarch_lp64) +#include #endif #if defined(_MSC_VER) && _MSC_VER < 1600 #include "vs/stdint.h" @@ -112,7 +114,7 @@ static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) { } #endif - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) #define mm_srai_epi64_63(_v_, _s_) _mm_srai_epi32(_mm_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 1, 1)), 31) static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16( v,15)); } @@ -230,7 +232,7 @@ static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_ _sv = _mm256_add_epi32(_sv, _cv); _cv = _mm256_set1_epi32(4*_mindelta_); do { _mm256_storeu_si256(_ov++, _sv), _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\ } while(0) - #elif defined(__SSE2__) || defined(__ARM_NEON) // ------------- + #elif defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) // ------------- // SIMD set value (memset) #define BITZERO32(_out_, _n_, _v_) do {\ __m128i _sv_ = 
_mm_set1_epi32(_v_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\ @@ -322,7 +324,7 @@ static ALWAYS_INLINE uint64_t rbit64(uint64_t x) { } #endif - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) static ALWAYS_INLINE __m128i mm_rbit_epi16(__m128i v) { return mm_rbit_epi8(mm_rev_epi16(v)); } static ALWAYS_INLINE __m128i mm_rbit_epi32(__m128i v) { return mm_rbit_epi8(mm_rev_epi32(v)); } static ALWAYS_INLINE __m128i mm_rbit_epi64(__m128i v) { return mm_rbit_epi8(mm_rev_epi64(v)); } diff --git a/lib/include_/conf.h b/lib/include_/conf.h index 1808b98..4e9a82e 100644 --- a/lib/include_/conf.h +++ b/lib/include_/conf.h @@ -203,7 +203,8 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\ defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \ defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \ - defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) + defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__loongarch_lp64) #define ctou16(_cp_) (*(unsigned short *)(_cp_)) #define ctou32(_cp_) (*(unsigned *)(_cp_)) #define ctof16(_cp_) (*(_Float16 *)(_cp_)) @@ -225,7 +226,7 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void #define ltou64(_px_, _cp_) *(_px_) = *(uint64_t *)(_cp_) - #elif defined(__ARM_FEATURE_UNALIGNED) + #elif defined(__ARM_FEATURE_UNALIGNED) || defined(__loongarch_lp64) struct _PACKED longu { uint64_t l; }; struct _PACKED doubleu { double d; }; #define ctou64(_cp_) ((struct longu *)(_cp_))->l @@ -276,7 +277,7 @@ struct _PACKED doubleu { double d; }; defined(__aarch64__) ||\ defined(__mips64) ||\ defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\ - defined(__s390x__) + defined(__s390x__) || defined(__loongarch_lp64) #define __WORDSIZE 64 #else #define __WORDSIZE 32 diff --git a/lib/include_/sse_neon.h b/lib/include_/sse_neon.h index 0cc4e9c..32941e2 100644 --- a/lib/include_/sse_neon.h +++ b/lib/include_/sse_neon.h @@ -302,6 +302,324 @@ static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) { uint static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))); } #endif +#elif defined(__loongarch_lp64) + +#include +// sse instruct to loongarch lsx instruct mapping + + #ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ---------------------------------------------------------- +#define _mm_set_epi8(u15,u14,u13,u12,\ + u11,u10, u9, u8,\ + u7,u6,u5,u4,\ + u3,u2,u1,u0) ({ uint8_t __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (v4u32)__lsx_vld( _u, 0);}) +#define _mm_set_epi16( u7,u6,u5,u4,\ + u3,u2,u1,u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (v4u32)__lsx_vld( _u, 0);}) + +#define _mm_set_epi32( u3,u2,u1,u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; __lsx_vld(_u, 0);}) +#define _mm_set_epi64x( u1,u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (v4u32)__lsx_vld(_u);}) + + 
#else +static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8, + uint8_t u7, uint8_t u6, uint8_t u5, uint8_t u4, + uint8_t u3, uint8_t u2, uint8_t u1, uint8_t u0) { + uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (__m128i)__lsx_vld(u, 0); } +static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4, + uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (__m128i)__lsx_vld(u, 0); } +static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return __lsx_vld(u, 0); } +static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (__m128i)__lsx_vld(u, 0); } + #endif + +#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16( u0,u1,u2,u3,u4,u5,u6,u7) +#define _mm_setr_epi32(u3,u2,u1,u0) _mm_set_epi32( u0,u1,u2,u3) +#define _mm_setr_epi64x(u1,u0) _mm_set_epi64x(u0,u1) + +#define _mm_set1_epi8( _u8_ ) (__m128i)__lsx_vreplgr2vr_b(_u8_) +#define _mm_set1_epi16( _u16_) (__m128i)__lsx_vreplgr2vr_h(_u16_) +#define _mm_set1_epi32( _u32_) __lsx_vreplgr2vr_w(_u32_) +#define _mm_set1_epi64x(_u64_) (__m128i)__lsx_vreplgr2vr_d(_u64_) +#define _mm_setzero_si128() __lsx_vreplgr2vr_w( 0 ) + +#define _mm_cvtss_f32(_v_) __lsx_vpickve2gr_s((__m128)(_v_), 0) +#define _mm_setzero_ps() (__m128)__lsx_vldi(0) +#define _mm_set1_ps(_f32_) (__m128)__lsx_vreplfr2vr_s(_f32_) +//---------------------------------------------- Arithmetic ----------------------------------------------------------------------- +#define _mm_add_epi8( _u_,_v_) (__m128i)__lsx_vadd_b((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_add_epi16( _u_,_v_) (__m128i)__lsx_vadd_h((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_add_epi32( _u_,_v_) __lsx_vadd_w( _u_, _v_ ) +#define _mm_add_epi64( _u_,_v_) (__m128i)__lsx_vadd_d((__m128i)(_u_), (__m128i)(_v_)) + +#define _mm_sub_epi8( _u_,_v_) (__m128i)__lsx_vsub_b((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_sub_epi16( _u_,_v_) (__m128i)__lsx_vsub_h((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_sub_epi32( _u_,_v_) (__m128i)__lsx_vsub_w((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_sub_epi64( _u_,_v_) (__m128i)__lsx_vsub_d((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_subs_epu8( _u_,_v_) (__m128i)__lsx_vsub_bu((__m128i)(_u_), (__m128i)(_v_)) + +#define _mm_mullo_epi16(_u_, _v_) (__m128i)__lsx_vmulwev_h_h((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_mullo_epi32(_u_,_v_) (__m128i)__lsx_vmulwev_w_w((__m128i)(_u_), (__m128i)(_v_)) +#define mm_mullo_epu32(_u_,_v_) (__m128i)__lsx_vmulwev_w_wu((__m128i)(_u_), (__m128i)(_v_)) + +//#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)__lsx_vmulwh_h_h((__m128i)(_u_), (__m128i)(_v_)) + +static ALWAYS_INLINE __m128i _mm_mulhi_epi16(__m128i u, __m128i v) { + __m128i evens = __lsx_vmulwev_w_h(u, v); // a[0]*b[0], a[2]*b[2], ... + __m128i odds = __lsx_vmulwod_w_h(u, v); // a[1]*b[1], a[3]*b[3], ... + + // 2. arithmetic shift right by 16 to extract the high 16 bits + evens = __lsx_vsrai_w(evens, 16); + odds = __lsx_vsrai_w(odds, 16); + + // 3. 
repack into 16-bit results + __m128i res = __lsx_vpickev_h(odds, evens); // interleave/combine + return res; +} + +#define _mm_mul_epu32(_u_, _v_) (__m128i)__lsx_vmulwev_d_wu((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_adds_epu16(_u_, _v_) (__m128i)__lsx_vsadd_hu((__m128i)(_u_), (__m128i)(_v_)) + +static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) { + // 1. 16-bit signed multiplies producing 32-bit intermediate results + __m128i mul_even = __lsx_vmulwev_w_h(u, v); // even lanes (0*0, 2*2, 4*4, 6*6) + __m128i mul_odd = __lsx_vmulwod_w_h(u, v); // odd lanes (1*1, 3*3, 5*5, 7*7) + + // 2. horizontally add the two adjacent 32-bit products + __m128i sum = __lsx_vadd_w(mul_even, mul_odd); // [0*0+1*1, 2*2+3*3, 4*4+5*5, 6*6+7*7] + return sum; +} +//---------------------------------------------- Special math functions ----------------------------------------------------------- +#define _mm_min_epu8(_u_, _v_) (__m128i)__lsx_vmin_bu((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_min_epu16(_u_, _v_) (__m128i)__lsx_vmin_hu((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_min_epi16(_u_, _v_) (__m128i)__lsx_vmin_h((__m128i)(_u_), (__m128i)(_v_)) +//---------------------------------------------- Logical -------------------------------------------------------------------------- +#define mm_testnz_epu32(_u_) (__lsx_vreplgr2vr_w(__lsx_vsrlri_w((__m128i)(_u_), 31)) != __lsx_vreplgr2vr_w(0)) +#define mm_testnz_epu8(_u_) (__lsx_vreplgr2vr_b(__lsx_vsrlri_b((__m128i)(_u_), 7)) != __lsx_vreplgr2vr_b(0)) +#define _mm_or_si128(_u_, _v_) (__m128i)__lsx_vor_v((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_and_si128(_u_, _v_) (__m128i)__lsx_vand_v((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_xor_si128(_u_, _v_) (__m128i)__lsx_vxor_v((__m128i)(_u_), (__m128i)(_v_)) +//---------------------------------------------- Shift ---------------------------------------------------------------------------- +#define mm_slli_epi8(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>7?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vslli_b((__m128i)(_u_), (_c_))) +#define mm_slli_epi16(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_h(0):(__m128i)__lsx_vslli_h((__m128i)(_u_), (_c_))) +#define mm_slli_epi32(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>31?(__m128i)__lsx_vreplgr2vr_w(0):(__m128i)__lsx_vslli_w((__m128i)(_u_), (_c_))) +#define mm_slli_epi64(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>63?(__m128i)__lsx_vreplgr2vr_d(0):(__m128i)__lsx_vslli_d((__m128i)(_u_), (_c_))) +#define _mm_slli_si128(_v_, _c_) (__m128i)((_c_)<1?(_v_):(_c_)>15?__lsx_vreplgr2vr_b(0):__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), (__m128i)(_v_), __lsx_vreplgr2vr_b((16 - (_c_)) + ((16 - (_c_)) << 8)))) + +#define mm_srli_epi8(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)> 7?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vsrlri_b((__m128i)(_u_), (_c_))) +#define mm_srli_epi16(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_h(0):(__m128i)__lsx_vsrlri_h((__m128i)(_u_), (_c_))) +#define mm_srli_epi32(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>31?(__m128i)__lsx_vreplgr2vr_w(0):(__m128i)__lsx_vsrlri_w((__m128i)(_u_), (_c_))) +#define mm_srli_epi64(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>63?(__m128i)__lsx_vreplgr2vr_d(0):(__m128i)__lsx_vsrlri_d((__m128i)(_u_), (_c_))) +#define _mm_srli_si128(_v_, _c_) (__m128i)((_c_)<1?(_v_):(_c_)>15?__lsx_vreplgr2vr_b(0): __lsx_vsrlr_b((__m128i)(_v_), __lsx_vreplgr2vr_b((_c_) * 8)) ) + +#define mm_srai_epi8(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_b((__m128i)(_v_), (_c_)) ) +#define mm_srai_epi16(_v_, _c_) 
(__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_h((__m128i)(_v_), (_c_)) ) +#define mm_srai_epi32(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_w((__m128i)(_v_), (_c_)) ) +#define mm_srai_epi64(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_d((__m128i)(_v_), (_c_)) ) + +#define _mm_slli_epi8(_u_, _m_) (__m128i)__lsx_vsll_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_)) +#define _mm_slli_epi16(_u_, _m_) (__m128i)__lsx_vsll_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_)) +#define _mm_slli_epi32(_u_, _m_) (__m128i)__lsx_vsll_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_)) +#define _mm_slli_epi64(_u_, _m_) (__m128i)__lsx_vsll_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_)) + +#define _mm_srli_epi8( _u_, _m_) (__m128i)__lsx_vsrl_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_)) +#define _mm_srli_epi16(_u_, _m_) (__m128i)__lsx_vsrl_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_)) +#define _mm_srli_epi32(_u_, _m_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_)) +#define _mm_srli_epi64(_u_, _m_) (__m128i)__lsx_vsrl_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_)) + +#define _mm_srai_epi8( _u_, _m_) (__m128i)__lsx_vsra_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_)) +#define _mm_srai_epi16(_u_, _m_) (__m128i)__lsx_vsra_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_)) +#define _mm_srai_epi32(_u_, _m_) (__m128i)__lsx_vsra_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_)) +#define _mm_srai_epi64(_u_, _m_) (__m128i)__lsx_vsra_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_)) + +#define _mm_sll_epi8(_u_, _v_) (__m128i)__lsx_vsll_b((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_sll_epi16(_u_, _v_) (__m128i)__lsx_vsll_h((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_sll_epi32(_u_, _v_) (__m128i)__lsx_vsll_w((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_sll_epi64(_u_, _v_) (__m128i)__lsx_vsll_d((__m128i)(_u_), (__m128i)(_v_)) + +#define _mm_srl_epi8( _u_, _v_) (__m128i)__lsx_vsrl_b((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_srl_epi16(_u_, _v_) (__m128i)__lsx_vsrl_h((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_srl_epi32(_u_, _v_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_srl_epi64(_u_, _v_) (__m128i)__lsx_vsrl_d((__m128i)(_u_), (__m128i)(_v_)) + +#define _mm_sllv_epi32(_u_, _v_) (__m128i)__lsx_vsll_w((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_srlv_epi32(_u_, _v_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), (__m128i)(_v_)) +//---------------------------------------------- Compare --------- true/false->1/0 (all bits set) --------------------------------- +#define _mm_cmpeq_epi8( _u_, _v_) (__m128i)__lsx_vseq_b((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_cmpeq_epi16(_u_, _v_) (__m128i)__lsx_vseq_h((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_cmpeq_epi32(_u_, _v_) (__m128i)__lsx_vseq_w((__m128i)(_u_), (__m128i)(_v_)) + +#define _mm_cmpgt_epi8( _u_, _v_) (__m128i)__lsx_vslt_b((__m128i)(_v_), (__m128i)(_u_)) // note the swapped operand order +#define _mm_cmpgt_epi16(_u_, _v_) (__m128i)__lsx_vslt_h((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_cmpgt_epi32(_u_, _v_) (__m128i)__lsx_vslt_w((__m128i)(_v_), (__m128i)(_u_)) + +#define _mm_cmpgt_epu16(_u_, _v_) (__m128i)__lsx_vslt_hu((__m128i)(_v_), (__m128i)(_u_)) +#define mm_cmpgt_epu32(_u_, _v_) (__m128i)__lsx_vslt_wu((__m128i)(_v_), (__m128i)(_u_)) +//---------------------------------------------- Load ----------------------------------------------------------------------------- +#define _mm_loadl_epi64(_u64p_) (__m128i)__lsx_vldrepl_d(_u64p_, 0) // load a 64-bit value and broadcast it (low 64 bits) +#define mm_loadu_epi64p(_u64p_, _u_) (__m128i)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_u64p_), 0) +#define _mm_loadu_si128(_ip_) 
(__m128i)__lsx_vldx((const __m128i*)(_ip_), 0) +#define _mm_load_si128(_ip_) (__m128i)__lsx_vld((const __m128i*)(_ip_), 0) + +#define _mm_load_ps(_ip_) (__m128)__lsx_vld((const float*)(_ip_), 0) +#define _mm_loadu_ps(_ip_) (__m128)__lsx_vldx((const float*)(_ip_), 0) +#define _mm_load1_ps(_ip_) (__m128)__lsx_vreplfr2vr_s(*(const float*)(_ip_)) +#define _mm_loadl_pi(_u_, _ip_) (__m128)__lsx_vpickve2gr_w((__m128)(_u_), 1); (__m128)__lsx_vinsgr2vr_w((__m128)(_u_), *(const float*)(_ip_), 0) +#define _mm_loadh_pi(_u_, _ip_) (__m128)__lsx_vpickve2gr_w((__m128)(_u_), 0); (__m128)__lsx_vinsgr2vr_w((__m128)(_u_), *(const float*)(_ip_), 1) +//---------------------------------------------- Store ---------------------------------------------------------------------------- +#define _mm_storel_epi64(_ip_, _u_) __lsx_vstelm_d((__m128i)(_u_), (uint64_t*)(_ip_), 0, 0) +#define _mm_storeu_si128(_ip_, _u_) __lsx_vstx((__m128i)(_u_), (__m128i*)(_ip_), 0) + +#define _mm_store_ps(_ip_, _u_) __lsx_vst((float32x4_t)(_u_), (float*)(_ip_), 0) +#define _mm_storeu_ps(_ip_, _u_) __lsx_vstx((float32x4_t)(_u_), (float*)(_ip_), 0) +#define _mm_store_ss(_ip_, _u_) __lsx_vstelm_w((__m128)(_u_), (float*)(_ip_), 0, 0) +//---------------------------------------------- Convert -------------------------------------------------------------------------- +#define mm_cvtsi64_si128p(_u64p_, _u_) (__m128i)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_u64p_), 0) +#define _mm_cvtsi64_si128(_u_) (__m128i)__lsx_vreplgr2vr_d(_u_) +//---------------------------------------------- Reverse bits/bytes --------------------------------------------------------------- +#define mm_rbit_epi8(_v_) (__m128i)__lsx_vbitrev_b((__m128i)(_v_), (__m128i)(_v_)) +#define mm_rev_epi16(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i){0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E}) +#define mm_rev_epi32(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i){0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C}) +#define mm_rev_epi64(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i){0x0706050403020100, 0x0F0E0D0C0B0A0908}) +//--------------------------------------------- Insert/extract -------------------------------------------------------------------- +#define mm_extract_epi32x(_u_, _u32_, _id_) (*(uint32_t*)&(_u32_) = __lsx_vpickve2gr_wu((__m128i)(_u_), (_id_)) +#define _mm_extract_epi64x(_u_, _u64_, _id_) (*(uint64_t*)&(_u64_) = __lsx_vpickve2gr_du((__m128i)(_u_), (_id_)) + +#define _mm_extract_epi8(_u_, _id_) __lsx_vpickve2gr_b((__m128i)(_u_), (_id_)) +#define _mm_extract_epi16(_u_, _id_) __lsx_vpickve2gr_h((__m128i)(_u_), (_id_)) +#define _mm_extract_epi32(_u_, _id_) __lsx_vpickve2gr_w((__m128i)(_u_), (_id_)) +#define mm_extract_epu32(_u_, _id_) __lsx_vpickve2gr_wu((__m128i)(_u_), (_id_)) +#define _mm_cvtsi128_si32(_u_) __lsx_vpickve2gr_w((__m128i)(_u_), 0) +#define _mm_cvtsi128_si64(_u_) __lsx_vpickve2gr_d((__m128i)(_u_), 0) + +#define _mm_insert_epu32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const uint32_t*)(_u32p_), (_id_)) +#define mm_insert_epi32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const int32_t*)(_u32p_), (_id_)) +#define _mm_cvtsi32_si128(_x_) (__m128i)__lsx_vinsgr2vr_w(__lsx_vldi(0), (_x_), 0) + +#define _mm_blendv_epi8(_u_, _v_, _m_) (__m128i)__lsx_vbitsel_v((__m128i)(_u_), (__m128i)(_v_), (__m128i)(_m_)) +//---------------------------------------------- Miscellaneous 
-------------------------------------------------------------------- +#define _mm_alignr_epi8(_u_, _v_, _m_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_u_), (__m128i){_m_,_m_+1,_m_+2,_m_+3,_m_+4,_m_+5,_m_+6,_m_+7, _m_+8,_m_+9,_m_+10,_m_+11,_m_+12,_m_+13,_m_+14,_m_+15}) +#define _mm_packs_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0)) +#define _mm_packs_epi32(_u_, _v_) (__m128i)__lsx_vpickev_h(__lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0)) + +#define _mm_packs_epu16(_u_, _v_) (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_packus_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0)) + +/* static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { */ +/* const __m128i zero = __lsx_vldi(0); */ +/* const __m128i mask = __lsx_vldi(0x0102040810204080); */ +/* __m128i signs = __lsx_vsrli_b(v, 7); // extract the sign bit into bit 0 */ +/* __m128i masked = __lsx_vand_v(signs, mask); // apply the bit weights */ +/* __m128i sum = __lsx_vhaddw_wu_hu(__lsx_vhaddw_hu_bu(masked, zero), zero); */ +/* return __lsx_vpickve2gr_hu(sum, 0) & 0xFFFF; */ +/* } */ + +static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { + // step 1: extract the most significant (sign) bit of each byte + __m128i signs = __lsx_vsrli_b(v, 7); // shift every byte right by 7 so the sign bit lands in bit 0 + + // step 2: build the bit-weight mask (LSB-first: 0x01, 0x02, 0x04,...) + const __m128i mask = __lsx_vld((void*)(uint64_t[]){0x0102040810204080}, 0); + + // step 3: apply the bit-weight mask + __m128i masked = __lsx_vand_v(signs, mask); + + // step 4: horizontal add (8-bit → 16-bit → 32-bit) + __m128i sum16 = __lsx_vhaddw_hu_bu(masked, __lsx_vldi(0)); + __m128i sum32 = __lsx_vhaddw_wu_hu(sum16, __lsx_vldi(0)); + + // step 5: extract the low 16-bit result + return __lsx_vpickve2gr_hu(sum32, 0) & 0xFFFF; +} + +//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff) + +static ALWAYS_INLINE uint8_t mm_movemask_epi8s(__m128i sv) { + const __m128i mask = __lsx_vldi(0x0102040810204080); + __m128i tmp = __lsx_vand_v(sv, mask); + tmp = __lsx_vhaddw_hu_bu(tmp, __lsx_vldi(0)); + tmp = __lsx_vhaddw_wu_hu(tmp, __lsx_vldi(0)); + return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, __lsx_vldi(0)), 0); +} + +static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { + const __m128i mask = __lsx_vldi(0x0102040810204080); + __m128i tmp = __lsx_vand_v(v, mask); + tmp = __lsx_vhaddw_wu_hu(tmp, __lsx_vldi(0)); + return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, __lsx_vldi(0)), 0); +} + +static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { + // 1. load the bit-weight constants (0x00000001, 0x00000002, 0x00000004, 0x00000008) + const __m128i mask = __lsx_vldi(0x0000000100000002); + __lsx_vinsgr2vr_d(mask, 0x0000000400000008, 1); // set the mask for the high 64 bits + + // 2. apply the bit mask + __m128i masked = __lsx_vand_v(v, mask); + + // 3. horizontal add + __m128i sum2 = __lsx_vhaddw_du_wu(masked, __lsx_vldi(0)); // 4x32 -> 2x64 + __m128i sum1 = __lsx_vhaddw_qu_du(sum2, __lsx_vldi(0)); // 2x64 -> 1x128 + + // 4. extract the result + return (uint32_t)__lsx_vpickve2gr_d(sum1, 0); +} + +static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { + // 1. load the bit-weight constants (0x0000000000000001, 0x0000000000000002) + const __m128i mask = {1ULL, 2ULL}; + + // 2. 
apply the bit mask and extract the result directly + __m128i masked = __lsx_vand_v(v, mask); + return __lsx_vpickve2gr_d(masked, 0) | __lsx_vpickve2gr_d(masked, 1); +} +// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack ----------------------------------------- +#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_)) + +#define _mm_shuffle_epi8(_u_, _v_) (__m128i)__lsx_vshuf_b((__m128i)(_u_), (__m128i)(_u_), (__m128i)(_v_)) + +#define mm_shuffle_nnnn_epi32(_v_, _m_) (__m128i)__lsx_vreplvei_w((__m128i)(_v_), (_m_)) + + #ifdef USE_MACROS +#define mm_shuffle_2031_epi32(_u_) ({__m128i rev = __lsx_vshuf4i_w(v, 0x1B); __lsx_vshuf4i_w(rev, 0xD8);}) +#define mm_shuffle_3120_epi32(_u_) __lsx_vshuf4i_w(v, 0xD8) + #else +static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) {__m128i rev = __lsx_vshuf4i_w(v, 0x1B); return __lsx_vshuf4i_w(rev, 0xD8);} +static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0xD8);} + #endif + + #if defined(USE_MACROS) || defined(__clang__) +#define _mm_shuffle_epi32(_u_, _m_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), (_m_)) +#define _mm_shuffle_epi32s(_u_, _m_) (__m128i)__lsx_vshuf_w((__m128i)(_u_), (__m128i)(_u_), (__m128i){(_m_)&3, ((_m_)>>2)&3, ((_m_)>>4)&3, ((_m_)>>6)&3}) + #else +static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _u_, const unsigned _m_) {return (__m128i)__lsx_vshuf4i_w((__m128i)_u_, _m_);} + +static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _u_, const unsigned _m_) { + const uint32_t idx0 = (_m_) & 0x3; + const uint32_t idx1 = ((_m_) >> 2) & 0x3; + const uint32_t idx2 = ((_m_) >> 4) & 0x3; + const uint32_t idx3 = ((_m_) >> 6) & 0x3; + return (__m128i)__lsx_vshuf_w((__m128i)_u_, (__m128i)_u_, (__m128i){idx0, idx1, idx2, idx3}); +} + #endif + #ifdef USE_MACROS +#define _mm_unpacklo_epi8(_u_,_v_) (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_unpacklo_epi16(_u_,_v_) (__m128i)__lsx_vilvl_h((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_unpacklo_epi32(_u_,_v_) (__m128i)__lsx_vilvl_w((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_unpacklo_epi64(_u_,_v_) (__m128i)__lsx_vilvl_d((__m128i)(_v_), (__m128i)(_u_)) + +#define _mm_unpackhi_epi8(_u_,_v_) (__m128i)__lsx_vilvh_b((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_unpackhi_epi16(_u_,_v_) (__m128i)__lsx_vilvh_h((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_unpackhi_epi32(_u_,_v_) (__m128i)__lsx_vilvh_w((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_unpackhi_epi64(_u_,_v_) (__m128i)__lsx_vilvh_d((__m128i)(_v_), (__m128i)(_u_)) + #else +static ALWAYS_INLINE __m128i _mm_unpacklo_epi8(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_b((__m128i)_v_, (__m128i)_u_);} +static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_h((__m128i)_v_, (__m128i)_u_);} +static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_w((__m128i)_v_, (__m128i)_u_);} +static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_d((__m128i)_v_, (__m128i)_u_);} + +static ALWAYS_INLINE __m128i _mm_unpackhi_epi8(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_b((__m128i)_v_, (__m128i)_u_);} +static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_h((__m128i)_v_, (__m128i)_u_);} +static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_w((__m128i)_v_, (__m128i)_u_);} +static 
ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_d((__m128i)_v_, (__m128i)_u_);} + #endif + #else //----------------- intel SSE2/SSSE3 ( wraper functions compatible with intel/arm; permits to have one source code version for arm+intel) -------------- #define mm_movemask_epu32(_v_) _mm_movemask_ps(_mm_castsi128_ps(_v_)) #define mm_movemask_epu16(_v_) _mm_movemask_epi8(_v_) diff --git a/lib/transpose.c b/lib/transpose.c index 14ef087..35be2d9 100644 --- a/lib/transpose.c +++ b/lib/transpose.c @@ -45,6 +45,8 @@ #include #elif defined(__ARM_NEON) #include +#include "include_/sse_neon.h" + #elif defined(__loongarch_lp64) #include "include_/sse_neon.h" #endif @@ -728,7 +730,7 @@ void T2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { #endif //TPDEC256V #else //__AVX2__ - #if (defined(__SSE3__) || defined(__ARM_NEON)) && (ESIZE == 2 || ESIZE == 4 || ESIZE == 8) + #if (defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (ESIZE == 2 || ESIZE == 4 || ESIZE == 8) #define ST(_p_,_v_,_i_) _mm_storeu_si128((__m128i *)SIE(_p_,_i_), _v_) #define ST0(_p_,_v_) _mm_storeu_si128((__m128i *)(_p_), _v_) @@ -737,7 +739,7 @@ void T2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { unsigned stride = v/STRIDE; unsigned char *op,*ip; - #if defined(__SSE3__) || defined(__ARM_NEON) + #if defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) #if ESIZE == 2 __m128i sf = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); @@ -764,7 +766,7 @@ void T2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) { for(ip = in, op = out; ip != in+v; ip+=ESIZE*16,op += ESIZE*16/STRIDE) { unsigned char *p = op; PREFETCH(ip+(ESIZE*16)*ESIZE,0); __m128i iv[ESIZE], ov[ESIZE == 2 ? 
ESIZE + 2 : ESIZE]; - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) #if ESIZE == 2 #ifdef __ARM_NEON uint8x16x2_t w = vld2q_u8(ip); diff --git a/lib/trled.c b/lib/trled.c index 7edf9b7..1f5b64f 100644 --- a/lib/trled.c +++ b/lib/trled.c @@ -43,6 +43,8 @@ #include #elif defined(__ARM_NEON) #include +#include "include_/sse_neon.h" + #elif defined(__loongarch_lp64) #include "include_/sse_neon.h" #endif #include "include_/conf.h" @@ -74,7 +76,7 @@ unsigned _srled8(const unsigned char *__restrict in, unsigned char *__restrict o #endif if(outlen >= SRLE8) while(op < out+(outlen-SRLE8)) { - #if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON) + #if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON) || defined(__loongarch_lp64) uint32_t mask; #ifdef __AVX2__ __m256i v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, ev)); if(mask) goto a; op += 32; ip += 32; @@ -127,10 +129,12 @@ static inline unsigned _srled8x(const unsigned char *__restrict in, unsigned cha __m128i ev = _mm_set1_epi8(e); #elif defined(__ARM_NEON) uint8x8_t ev = vdup_n_u8(e); + #elif defined(__loongarch_lp64) + __m128i ev = __lsx_vreplgr2vr_b(e); #endif if(outlen >= SRLE8) while(op < out+(outlen-SRLE8)) { - #if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON) + #if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON) || defined(__loongarch_lp64) uint32_t mask; #ifdef __AVX2__ __m256i v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, ev)); if(mask) goto a; op += 32; ip += 32; @@ -326,7 +330,7 @@ unsigned T2(_srled, USIZE)(const unsigned char *__restrict in, unsigned char *__ #ifdef __AVX2__ #define _mm256_set1_epi64 _mm256_set1_epi64x __m256i ev = T2(_mm256_set1_epi, USIZE)(e); - #elif (defined(__SSE__) /*|| defined(__ARM_NEON)*/) + #elif (defined(__SSE__) /*|| defined(__ARM_NEON) || defined(__loongarch_lp64)*/) // #if USIZE != 64 #define _mm_set1_epi64 _mm_set1_epi64x __m128i ev = T2(_mm_set1_epi, USIZE)(e); diff --git a/lib/v8.c b/lib/v8.c index d9eb004..38df38b 100644 --- a/lib/v8.c +++ b/lib/v8.c @@ -1007,7 +1007,7 @@ unsigned char *T2(V8ENC,32)(uint32_t *__restrict in, unsigned n, unsigned char * _mm_storeu_si128((__m128i *)op, _mm256_castsi256_si128( ov3)); op += LEN32(m1,2); _mm_storeu_si128((__m128i *)op, _mm256_extracti128_si256(ov3,1)); op += LEN32(m1,3); } - #elif defined(__SSSE3__) || defined(__ARM_NEON) // https://gist.github.com/aqrit/746d2f5e4ad1909230e2283272333dc1 + #elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) // https://gist.github.com/aqrit/746d2f5e4ad1909230e2283272333dc1 VEINI128v32; const __m128i cv1_8 = _mm_set1_epi8(1), cv7f00 = _mm_set1_epi16(0x7f00); for(ip = in; ip != in+(n&~(32-1)); ip += 32, PNEXT(out,op,8) ) { __m128i iv0 = _mm_loadu_si128(ip ), @@ -1149,7 +1149,7 @@ unsigned char *T2(V8DEC,32)(unsigned char *__restrict in, unsigned n, uint32_t } } } - #elif defined(__ARM_NEON) || defined(__SSSE3__) // optimzed for ARM ---------------------------------------------------------- + #elif defined(__ARM_NEON) || defined(__loongarch_lp64) || defined(__SSSE3__) // optimzed for ARM ---------------------------------------------------------- VDINI128v32; for(; op != out+(n&~(32-1)); op += 32) { //PREFETCH(ip+384,0); uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4); @@ -1223,7 +1223,7 @@ unsigned char 
*T2(V8ENC,16)(uint16_t *__restrict in, unsigned n, unsigned char * uint16_t *ip,v; unsigned char *op = DATABEG(out,n,16); - #if defined(__SSSE3__) || defined(__ARM_NEON) //-------------------------------- + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //-------------------------------- VEINI128v16; const __m128i cv1_8 = _mm_set1_epi8(1); for(ip = in; ip != in+(n&~(64-1)); ip += 64, PNEXT(out,op,8)) { //PREFETCH(ip+512,0); __m128i iv0 = _mm_loadu_si128(ip ), @@ -1291,7 +1291,7 @@ unsigned char *T2(V8DEC,16)(unsigned char *__restrict in, unsigned n, uint16_t unsigned char *ip = DATABEG(in,n,16); uint16_t v; - #if defined(__SSSE3__) || defined(__ARM_NEON)//----------------------- + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)//----------------------- VDINI128v16; for(op = out; op != out+(n&~(64-1)); op += 64) { PREFETCH(ip+512,0); uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4); diff --git a/lib/vint.c b/lib/vint.c index 4b0ce4e..8b885b0 100644 --- a/lib/vint.c +++ b/lib/vint.c @@ -328,7 +328,7 @@ unsigned char *T2(VBDDEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_ if(in[0] == 0xfe) { in++; - #if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32 + #if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32 #if VDELTA == 0 if(n) T2(BITZERO, USIZE)(out, n, start); #else diff --git a/lib/vp4c.c b/lib/vp4c.c index 8e0049a..f3a20ae 100644 --- a/lib/vp4c.c +++ b/lib/vp4c.c @@ -210,7 +210,7 @@ size_t p4nsdec64(unsigned char *in, size_t n, uint64_t *out) { uint64_t *op,sta #undef _P4ENC #undef P4ENC #undef BITPACK - #elif defined(__SSE3__) || defined(__ARM_NEON) //-------------------------------------------------- + #elif defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //-------------------------------------------------- #define BITDELTA bitdienc #define HYBRID 1 #define P4BITS _p4bits diff --git a/lib/vp4d.c b/lib/vp4d.c index b693b6d..9334cea 100644 --- a/lib/vp4d.c +++ b/lib/vp4d.c @@ -40,7 +40,7 @@ #define P4DELTA(a) #define P4DELTA_(a) - #if defined(__SSSE3__) || defined(__ARM_NEON) + #if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) extern char _shuffle_32[16][16]; // defined in bitunpack.c extern char _shuffle_16[256][16]; #endif @@ -94,7 +94,7 @@ extern char _shuffle_16[256][16]; #undef BITUNDD // #elif !defined(__SSE3__) - #elif defined(__SSSE3__) || defined(__ARM_NEON) + #elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) #define _P4DEC _p4dec #define P4DEC p4dec #define P4NDEC p4ndec @@ -175,7 +175,7 @@ extern char _shuffle_16[256][16]; #undef USIZE #undef DELTA -// #elif defined(__SSSE3__) || defined(__ARM_NEON) +// #elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) #define VSIZE 128 #define P4DELTA(a) #define P4DELTA_(a) @@ -305,7 +305,7 @@ ALWAYS_INLINE unsigned char *T2(_P4DEC, USIZE)(unsigned char *__restrict in, uns } //out += 64; } } - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && USIZE == 32 + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32 { uint_t *_op = out,*op,*pex = ex; for(i = 0; i < p4dn; i++) { for(op=_op; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf; @@ -313,7 +313,7 @@ ALWAYS_INLINE unsigned char *T2(_P4DEC, USIZE)(unsigned char *__restrict in, uns } _op+=64; } } - #elif (defined(__SSSE3__) || defined(__ARM_NEON)) && USIZE == 16 + #elif (defined(__SSSE3__) || defined(__ARM_NEON) || 
defined(__loongarch_lp64)) && USIZE == 16 { uint_t *_op = out, *op, *pex = ex; for(i = 0; i < p4dn; i++) { for(op = _op; bb[i]; bb[i] >>= 8,op += 8) { const unsigned char m = bb[i]; diff --git a/lib/vsimple.c b/lib/vsimple.c index e9f98dd..b47b62b 100644 --- a/lib/vsimple.c +++ b/lib/vsimple.c @@ -27,6 +27,8 @@ #include #elif defined(__ARM_NEON) #include +#include "include_/sse_neon.h" + #elif defined(__loongarch_lp64) #include "include_/sse_neon.h" #endif #include @@ -343,7 +345,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t * uint_t *q = op; unsigned r = (w>>4)&0xf; if(!r) { memcpy(op,ip+1, n*(USIZE/8)); return ip+n*(USIZE/8); } - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) __m128i zv = _mm_setzero_si128(); #endif ip++; @@ -355,7 +357,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t * r -= 1; op += r+1; while(q < op) { - #if defined(__SSE2__) || defined(__ARM_NEON) + #if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) _mm_storeu_si128((__m128i *)q,zv); q = (uint_t *)((unsigned char *)q+16); _mm_storeu_si128((__m128i *)q,zv); q = (uint_t *)((unsigned char *)q+16); #else @@ -475,7 +477,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t * else { vbxget32(ip, r); } } op += r+1; T2(vbxget, USIZE)(ip,u); - #if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32 + #if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32 { __m128i v = _mm_set1_epi32(u); while(q < op) { _mm_storeu_si128((__m128i *)q,v); q += 4; diff --git a/makefile b/makefile index 6dc5af8..9d2318d 100644 --- a/makefile +++ b/makefile @@ -81,10 +81,14 @@ ifeq (,$(findstring clang, $(CC))) OPT+=-falign-loops endif -CFLAGS+=$(DEBUG) $(OPT) +ifeq ($(ARCH),loongarch64) + CFLAGS+=-mlsx + CXXFLAGS+=-mlsx +endif +CFLAGS+=$(DEBUG) $(OPT) #CFLAGS+=-Wno-macro-redefined -Wno-incompatible-pointer-types -Wno-tautological-constant-out-of-range-compare -Wno-discarded-qualifiers -CFLAGS+=-w -Wall -pedantic -CXXFLAGS+=-w +CFLAGS+=-w -Wall -pedantic -Wno-macro-redefined -Wno-incompatible-pointer-types +CXXFLAGS+=-w #-Wall -Wincompatible-pointer-types ifeq ($(OS),$(filter $(OS),Linux GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT Haiku)) LDFLAGS+=-lrt -lm
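// ---------------------------------------------------------------------------
// Note (not part of the patch): a minimal, self-contained sketch of the
// sign-bit / bit-weight / horizontal-add scheme behind the new LSX
// _mm_movemask_epi8 emulation added to lib/include_/sse_neon.h, checked
// against a scalar reference. Assumptions: a loongarch64 toolchain with -mlsx
// and GCC's <lsxintrin.h>; the weight vector is written out explicitly here
// instead of being loaded from a packed 64-bit constant as in the patch, and
// the main() harness and file name are hypothetical.
#include <lsxintrin.h>
#include <stdint.h>
#include <stdio.h>

// Scalar reference: bit i of the result is the MSB of byte i
// (SSE2 _mm_movemask_epi8 semantics).
static unsigned movemask_u8_ref(const uint8_t *p) {
  unsigned m = 0;
  for (int i = 0; i < 16; i++) m |= (unsigned)(p[i] >> 7) << i;
  return m;
}

// LSX version: isolate the sign bits, weight them with 1,2,4,...,128 per
// 64-bit half, then reduce with widening horizontal adds.
static unsigned movemask_u8_lsx(const uint8_t *p) {
  static const uint8_t w[16] = { 1,2,4,8,16,32,64,128, 1,2,4,8,16,32,64,128 };
  __m128i v     = __lsx_vld((void *)p, 0);
  __m128i signs = __lsx_vsrli_b(v, 7);                          // MSB of each byte -> bit 0
  __m128i bits  = __lsx_vand_v(signs, __lsx_vld((void *)w, 0)); // apply per-lane weights
  __m128i s16   = __lsx_vhaddw_hu_bu(bits, bits);               // pairwise add bytes -> u16
  __m128i s32   = __lsx_vhaddw_wu_hu(s16, s16);                 // pairwise add u16   -> u32
  __m128i s64   = __lsx_vhaddw_du_wu(s32, s32);                 // pairwise add u32   -> u64 per half
  unsigned lo = (unsigned)__lsx_vpickve2gr_du(s64, 0);          // mask bits for bytes 0..7
  unsigned hi = (unsigned)__lsx_vpickve2gr_du(s64, 1);          // mask bits for bytes 8..15
  return lo | hi << 8;
}

int main(void) {
  uint8_t buf[16] = { 0x80,1,2,0xff, 3,0x90,4,5, 0x81,6,7,8, 0xfe,9,10,0xc0 };
  printf("ref=%04x lsx=%04x\n", movemask_u8_ref(buf), movemask_u8_lsx(buf));
  return 0; // both values should print the same 16-bit mask
}
// Build (assumption): gcc -O2 -mlsx movemask_check.c
// ---------------------------------------------------------------------------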