Compare commits
10 commits: e2c2b8a032 ... e4f6853116

SHA1
e4f6853116
06d6aad98b
b0b651ab77
da4fa61f6d
559021fbfe
6f1140d424
c0e876cfb3
009bad8255
72afb3ae30
4c47a778b1
@@ -187,7 +187,8 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
     defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\
     defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \
     defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \
-    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
+    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
+    defined(__loongarch_lp64)
 #define ctou16(_cp_) (*(unsigned short *)(_cp_))
 #define ctou32(_cp_) (*(unsigned *)(_cp_))
 #define ctof32(_cp_) (*(float *)(_cp_))
@@ -207,7 +208,7 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
 
 #define ltou64(_px_, _cp_) *(_px_) = *(uint64_t *)(_cp_)
 
-#elif defined(__ARM_FEATURE_UNALIGNED)
+#elif defined(__ARM_FEATURE_UNALIGNED) || defined(__loongarch_lp64)
 struct _PACKED longu   { uint64_t l; };
 struct _PACKED doubleu { double d; };
 #define ctou64(_cp_) ((struct longu *)(_cp_))->l
@@ -253,7 +254,7 @@ struct _PACKED doubleu { double d; };
     defined(__aarch64__) ||\
     defined(__mips64) ||\
     defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\
-    defined(__s390x__)
+    defined(__s390x__) || defined(__loongarch_lp64)
 #define __WORDSIZE 64
 #else
 #define __WORDSIZE 32
@@ -659,4 +660,4 @@ static ALWAYS_INLINE void vbget32(unsigned char **_ip, unsigned *_x) { unsigned
 static ALWAYS_INLINE unsigned vlget32(unsigned char **_ip) { unsigned char *ip = *_ip; unsigned x; _vbget(ip, x, 32, VB_MAX, 4, 3, ;); *_ip = ip; return x; }
 static ALWAYS_INLINE unsigned vllen32(unsigned x) { return _vblen( x, 32, VB_MAX, 4, 3); }
 #endif
 
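Background for the guards above: on targets that tolerate unaligned loads, the ctou*/ltou* accessors compile to a plain dereference; elsewhere they go through a packed struct so the compiler emits safe byte accesses. A minimal sketch of the two strategies (illustration only, assuming GCC-style packed attributes):

    #include <stdint.h>
    /* fast path: unaligned loads are fine (x86, AArch64, and now LoongArch LP64) */
    #define ctou32_direct(p)  (*(const uint32_t *)(p))
    /* portable path: the packed struct lets the compiler synthesize the load */
    struct __attribute__((packed)) u32u { uint32_t v; };
    #define ctou32_packed(p)  (((const struct u32u *)(p))->v)
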
@@ -385,7 +385,7 @@ size_t bitnzpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__rest
 size_t bitnfpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitf, bitfpack256v, bitf, bitfpack); }
 size_t bitnxpack256v32( uint32_t *__restrict in, size_t n, unsigned char *__restrict out) { uint32_t *ip,start; _BITNDPACKV(in, n, out, 256, 32, bitx256v, bitxpack256v, bitx, bitxpack); }
 
-#elif defined(__SSE3__) || defined(__ARM_NEON) //----------------------------- SSE / AVX ---------------------------------------------------------------
+#elif defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //----------------------------- SSE / AVX ---------------------------------------------------------------
 #define OPPE(__op)
 #define IPPE(__op)
 
@@ -651,7 +651,7 @@ size_t bitnzunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__re
 size_t bitnxunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitxunpack256v, bitxunpack); }
 size_t bitnfunpack256v32( unsigned char *__restrict in, size_t n, uint32_t *__restrict out) { uint32_t *op,start; _BITNDUNPACKV(in, n, out, 256, 32, bitfunpack256v, bitfunpack); }
 
-#elif defined(__SSE2__) || defined(__ARM_NEON) //------------------------------ SSE2/SSSE3 ---------------------------------------------------------
+#elif defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //------------------------------ SSE2/SSSE3 ---------------------------------------------------------
 #define BITMAX16 16
 #define BITMAX32 32
 
@@ -760,7 +760,7 @@ unsigned char *bitunpack128v64( const unsigned char *__restrict in, unsigned n,
 #undef VOZ16
 #undef BITUNPACK0
 
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 #define _ 0x80
 ALIGNED(char, _shuffle_32[16][16],16) = {
 { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ },
@@ -1178,7 +1178,7 @@ unsigned char *bitfunpack128v32( const unsigned char *__restrict in, unsigned n,
   return (unsigned char *)ip;
 }
 
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 #define BITMAX16 15
 #define BITMAX32 31
 
@@ -1315,7 +1315,7 @@ unsigned char *bitf1unpack128v32( const unsigned char *__restrict in, unsigned n
   return (unsigned char *)ip;
 }
 
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 #define BITMAX16 15
 #define BITMAX32 31
 
@@ -228,7 +228,7 @@ uint64_t bit64(uint64_t *in, unsigned n, uint64_t *px) { uint64_t o,x,u0,*ip; BI
 uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
   uint16_t o, x, u0 = in[0], *ip = in;
 
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi16(u0);
 
@@ -254,7 +254,7 @@ uint16_t bit16(uint16_t *in, unsigned n, uint16_t *px) {
 uint32_t bit32(uint32_t *in, unsigned n, uint32_t *px) {
   uint32_t o,x,u0 = in[0], *ip = in;
 
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(), vb0 = _mm_set1_epi32(u0);
 
@@ -290,7 +290,7 @@ uint64_t bitd64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint6
 
 uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
   uint16_t o, x, *ip = in, u0 = in[0] - start;
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vb0 = _mm_set1_epi16(u0),
           vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start);
@@ -321,7 +321,7 @@ uint16_t bitd16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint32_t bitd32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
   uint32_t o = 0, x=0, *ip = in, u0 = in[0] - start;
 
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vb0 = _mm_set1_epi32(u0),
           vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start);
@@ -361,7 +361,7 @@ void bitddec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in, n
 void bitddec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 0); }
 
 void bitddec32(uint32_t *in, unsigned n, unsigned start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vs = _mm_set1_epi32(start);
   unsigned *ip = in;
   for(; ip != in+(n&~(8-1)); ip += 8) {
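For reference, the scalar semantics that the SIMD path above vectorizes is a running prefix sum over deltas (sketch; the exact BITDD macro may differ in details):

    /* delta decode in place: in[i] holds deltas, rewrite as absolute values */
    static void bitddec32_scalar(uint32_t *in, unsigned n, uint32_t start) {
      for(unsigned i = 0; i < n; i++) { start += in[i]; in[i] = start; }
    }
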
@@ -417,7 +417,7 @@ uint64_t bitd164(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint6
 uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
   uint32_t o = 0, x=0, *ip = in, u0 = in[0]-start-1;
 
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vb0 = _mm_set1_epi32(u0),
           vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(1);
@@ -446,7 +446,7 @@ uint32_t bitd132(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
 }
 
 uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   uint16_t *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi16(start), cv = _mm_set1_epi16(8);
   for(; ip != in+(n&~(8-1)); ip += 8) {
     __m128i iv = _mm_loadu_si128((__m128i *)ip);
@@ -461,7 +461,7 @@ uint16_t bits128v16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 }
 
 unsigned bits128v32(uint32_t *in, unsigned n, uint32_t *px, uint32_t start) {
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   unsigned *ip = in,b; __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(4);
   for(; ip != in+(n&~(4-1)); ip += 4) {
     __m128i iv = _mm_loadu_si128((__m128i *)ip);
@@ -480,7 +480,7 @@ void bitd1dec16(uint16_t *in, unsigned n, uint16_t start) { BITDD(uint16_t, in,
 void bitd1dec64(uint64_t *in, unsigned n, uint64_t start) { BITDD(uint64_t, in, n, 1); }
 
 void bitd1dec32(uint32_t *in, unsigned n, uint32_t start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vs = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1);
   unsigned *ip = in;
   for(; ip != in+(n&~(4-1)); ip += 4) {
@@ -515,7 +515,7 @@ uint8_t bitdienc8( uint8_t *in, unsigned n, uint8_t *out, uint8_t start, uin
 uint16_t bitdienc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint16_t mindelta) { uint16_t o=0,x=0,*op = out,u; BITDE(uint16_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
 uint64_t bitdienc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x=0,*op = out,u; BITDE(uint64_t, in, n, mindelta, o |= u; x |= u ^ in[0]; *op++ = u); return o; }
 uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) {
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   unsigned *ip = in,b,*op = out;
   __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start), cv = _mm_set1_epi32(mindelta), dv;
   for(; ip != in+(n&~(4-1)); ip += 4,op += 4) {
@@ -534,6 +534,7 @@ uint32_t bitdienc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uin
   }
 #else
   uint32_t b = 0,*op = out, x, *_ip;
+  //uint32_t b = 0, *op = out, x = 0, o = 0, u = 0, *_ip;
   BITDE(uint32_t, in, n, mindelta, b |= x; *op++ = x);
 #endif
   return b;
@@ -569,7 +570,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
   uint16_t o, x, *ip = in;
   uint32_t u0 = zigzagenc16((int)in[0] - (int)start);
 
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vb0 = _mm_set1_epi16(u0), vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi16(start);
   for(; ip != in+(n&~(16-1)); ip += 16) { PREFETCH(ip+512,0);
@@ -599,7 +600,7 @@ uint16_t bitz16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint32_t bitz32(unsigned *in, unsigned n, uint32_t *px, unsigned start) {
   uint32_t o, x, *ip=in,
            u0 = zigzagenc32((int)in[0] - (int)start);
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vb0 = _mm_set1_epi32(u0),
           vo0 = _mm_setzero_si128(), vx0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(), vx1 = _mm_setzero_si128(); __m128i vs = _mm_set1_epi32(start);
@@ -631,7 +632,7 @@ uint16_t bitzenc16(uint16_t *in, unsigned n, uint16_t *out, uint16_t start, uint
 uint64_t bitzenc64(uint64_t *in, unsigned n, uint64_t *out, uint64_t start, uint64_t mindelta) { uint64_t o=0,x,u,*op = out; BITZENC(uint64_t, int64_t,64,in, n, o |= u; *op++ = u); return o; }
 
 uint32_t bitzenc32(uint32_t *in, unsigned n, uint32_t *out, uint32_t start, uint32_t mindelta) {
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   unsigned *ip = in,b,*op = out;
   __m128i bv = _mm_setzero_si128(), vs = _mm_set1_epi32(start);
   for(; ip != in+(n&~(8-1)); ip += 8,op += 8) {
@@ -672,7 +673,7 @@ void bitzdec8( uint8_t *in, unsigned n, uint8_t start) { BITZDEC(uint8_t, 8,
 void bitzdec64(uint64_t *in, unsigned n, uint64_t start) { BITZDEC(uint64_t, 64,in, n); }
 
 void bitzdec16(uint16_t *in, unsigned n, uint16_t start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vs = _mm_set1_epi16(start); //, c1 = _mm_set1_epi32(1), cz = _mm_setzero_si128();
   uint16_t *ip = in;
   for(; ip != in+(n&~(8-1)); ip += 8) {
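The bitzdec16 loop above undoes zigzag coding, which maps signed deltas to small unsigned values (0,-1,1,-2,... -> 0,1,2,3,...). Scalar sketch of the per-element operation:

    static inline uint16_t zigzagdec16(uint16_t x) { return (uint16_t)((x >> 1) ^ -(x & 1)); }
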
@@ -692,7 +693,7 @@ void bitzdec16(uint16_t *in, unsigned n, uint16_t start) {
 }
 
 void bitzdec32(unsigned *in, unsigned n, unsigned start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vs = _mm_set1_epi32(start);
   unsigned *ip = in;
   for(; ip != in+(n&~(8-1)); ip += 8) {
@@ -729,7 +730,7 @@ uint64_t bitx64(uint64_t *in, unsigned n, uint64_t *px, uint64_t start) { uint64
 uint16_t bitx16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
   uint16_t o = 0, *ip = in;
 
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vo0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(),
           vs  = _mm_set1_epi16(start);
@@ -752,7 +753,7 @@ uint16_t bitx16(uint16_t *in, unsigned n, uint16_t *px, uint16_t start) {
 uint32_t bitx32(unsigned *in, unsigned n, uint32_t *px, uint32_t start) {
   uint32_t o = 0, *ip = in;
 
-#if defined(__SSE2__) || defined(__ARM_NEON)
+#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vo0 = _mm_setzero_si128(),
           vo1 = _mm_setzero_si128(),
           vs  = _mm_set1_epi32(start);
@@ -787,7 +788,7 @@ void bitxdec8( uint8_t *in, unsigned n, uint8_t start) { BITXDEC(uint8_t, in,
 void bitxdec64(uint64_t *in, unsigned n, uint64_t start) { BITXDEC(uint64_t, in, n); }
 
 void bitxdec16(uint16_t *in, unsigned n, uint16_t start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vs = _mm_set1_epi16(start);
   uint16_t *ip = in;
   for(; ip != in+(n&~(8-1)); ip += 8) {
@@ -806,7 +807,7 @@ void bitxdec16(uint16_t *in, unsigned n, uint16_t start) {
 }
 
 void bitxdec32(unsigned *in, unsigned n, unsigned start) {
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   __m128i vs = _mm_set1_epi32(start);
   unsigned *ip = in;
   for(; ip != in+(n&~(8-1)); ip += 8) {
@@ -27,7 +27,11 @@
 #include <string.h>
 #include <stdlib.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
+#include <sys/malloc.h>
+#else
 #include <malloc/malloc.h>
+#endif
 #else
 #include <malloc.h>
 #endif

lib/fp.c (18 changes)
@@ -174,7 +174,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start)
 #if defined(__AVX2__) && USIZE >= 32
 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
   __m256i sv = T2(_mm256_set1_epi, USIZE)(start);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 #define _mm_set1_epi64(a) _mm_set1_epi64x(a)
   __m128i sv = T2(_mm_set1_epi, USIZE)(start);
 #endif
@@ -191,7 +191,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start)
   }
   start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
   b = T2(mm256_hor_epi, USIZE)(bv);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
   __m128i bv = _mm_setzero_si128();
   for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),ip+=32/(USIZE/8)) {
     __m128i v0 = _mm_loadu_si128((__m128i *) ip);
@@ -217,7 +217,7 @@ size_t T2(fpxenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start)
     _mm256_storeu_si256((__m256i *) p, v0);
     _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1);
   }
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
   for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) {
     __m128i v0 = _mm_loadu_si128((__m128i *) p);
     __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
@@ -255,7 +255,7 @@ size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start)
 #if defined(__AVX2__) && USIZE >= 32
 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
   __m256i sv = T2(_mm256_set1_epi, USIZE)(start);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 #define _mm_set1_epi64(a) _mm_set1_epi64x(a)
   __m128i sv = T2(_mm_set1_epi, USIZE)(start);
 #endif
@@ -277,7 +277,7 @@ size_t T2(fpxdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start)
     _mm256_storeu_si256((__m256i *)(op+32/(USIZE/8)), sv);
   }
   start = (uint_t)T2(_mm256_extract_epi,USIZE)(sv, 256/USIZE-1);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
   for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8),op+=32/(USIZE/8)) {
     __m128i v0 = _mm_loadu_si128((__m128i *)p);
     __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
@@ -318,7 +318,7 @@ size_t T2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start
 #if defined(__AVX2__) && USIZE >= 32
 #define _mm256_set1_epi64(a) _mm256_set1_epi64x(a)
   __m256i sv = T2(_mm256_set1_epi, USIZE)(start);
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
 #define _mm_set1_epi64(a) _mm_set1_epi64x(a)
   __m128i sv = T2(_mm_set1_epi, USIZE)(start);
 #endif
@@ -339,7 +339,7 @@ size_t T2(fpfcmenc,USIZE)(uint_t *in, size_t n, unsigned char *out, uint_t start
     _mm256_storeu_si256((__m256i *) p, v0);
     _mm256_storeu_si256((__m256i *)(p+32/(USIZE/8)), v1);
   }
-#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && (USIZE == 16 || USIZE == 32)
+#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (USIZE == 16 || USIZE == 32)
   for(p = _p; p != &_p[VSIZE]; p+=32/(USIZE/8)) {
     __m128i v0 = _mm_loadu_si128((__m128i *) p);
     __m128i v1 = _mm_loadu_si128((__m128i *)(p+16/(USIZE/8)));
@@ -823,7 +823,7 @@ size_t T2(bvzzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start)
     return 1+n*sizeof(out[0]);
   }
   BITGET32(bw,br,3,b); bitget32(bw,br,(b+1)<<3,r,ip); bitdnorm(bw,br,ip);//RLE //r+=NL; while(r--) *op++=(start+=pd);
-#if (defined(__SSE2__) /*|| defined(__ARM_NEON)*/) && USIZE == 32
+#if (defined(__SSE2__) /*|| defined(__ARM_NEON) || defined(__loongarch_lp64)*/) && USIZE == 32
   __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4*pd,3*pd,2*pd,1*pd);
   for(r += NL, _op = op; op != _op+(r&~7);) {
     sv = _mm_add_epi32(sv,cv); _mm_storeu_si128((__m128i *)op, sv); sv = mm_shuffle_nnnn_epi32(sv, 3); op += 4; //_mm_shuffle_epi32(sv, _MM_SHUFFLE(3, 3, 3, 3))->mm_shuffle_nnnn_epi32(sv, 3)
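The guarded block vectorizes the RLE fill noted in the source comment ("while(r--) *op++ = (start += pd)"), i.e. an arithmetic progression written four lanes at a time. Scalar sketch:

    static void rle_delta_fill(uint32_t *op, unsigned r, uint32_t start, uint32_t pd) {
      while(r--) *op++ = (start += pd);
    }
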
@@ -926,7 +926,7 @@ size_t T2(bvzdec,USIZE)(unsigned char *in, size_t n, uint_t *out, uint_t start)
     return 1+n*sizeof(out[0]);
   }
   BITGET32(bw,br,3,b); bitget32(bw,br,(b+1)<<3,r,ip); bitdnorm(bw,br,ip);//RLE //r+=NL; while(r--) *op++=(start+=pd);
-#if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32
+#if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
   __m128i sv = _mm_set1_epi32(start);
   for(r += NL, _op = op; op != _op+(r&~7);) {
     _mm_storeu_si128((__m128i *)op, sv); op += 4;
@@ -31,7 +31,11 @@
 #include <limits.h>
 #include <time.h>     /* time_t, struct tm, time, mktime */
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
+#include <sys/malloc.h>
+#else
 #include <malloc/malloc.h>
+#endif
 #else
 #include <malloc.h>
 #endif
@@ -1029,7 +1033,7 @@ unsigned char *bestr(unsigned id, unsigned b, unsigned char *s, char *prms, int
     "%3d:bitnfpack256v%d      TurboPack256 FOR    ",
     "%3d:bitns1pack128v32     TurboPack256 delt4_1",
     "%3d:bitnxpack128v%d      TurboPack128 xor    ",
-    "%3d:bitnxpack256v%d      TurboPack128 xor    ",
+    "%3d:bitnxpack256v%d      TurboPack256 xor    ",
     "%3d:vsenc%d              TurboVSimple        ",
     "%3d:vszenc%d             TurboVSimple zigzag ",

lib/idxcr.c (10 changes)
@@ -31,11 +31,15 @@
 #include <limits.h>
 #include <getopt.h>
 #include <sys/stat.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
+#include <sys/malloc.h>
+#else
 #include <malloc/malloc.h>
+#endif
 #else
 #include <malloc.h>
 #endif
 
 #include "include_/conf.h"
 #include "include_/vint.h"
lib/idxqry.c (10 changes)
@@ -33,11 +33,15 @@
 #include <unistd.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
+#include <sys/malloc.h>
+#else
 #include <malloc/malloc.h>
+#endif
 #else
 #include <malloc.h>
 #endif
 
 #ifdef _WIN32
 #include <windows.h>
lib/idxseg.c (10 changes)
@@ -27,11 +27,15 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
+#include <sys/malloc.h>
+#else
 #include <malloc/malloc.h>
+#endif
 #else
 #include <malloc.h>
 #endif
 
 #include <getopt.h>
 #include "include_/conf.h"
@@ -40,6 +40,8 @@
 #include <emmintrin.h>
 #elif defined(__ARM_NEON)
 #include <arm_neon.h>
+#elif defined(__loongarch_lp64)
+#include <lsxintrin.h>
 #endif
 #if defined(_MSC_VER) && _MSC_VER < 1600
 #include "vs/stdint.h"
@@ -112,7 +114,7 @@ static ALWAYS_INLINE uint64_t mm256_hor_epi64(__m256i v) {
 }
 #endif
 
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 #define mm_srai_epi64_63(_v_, _s_) _mm_srai_epi32(_mm_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 1, 1)), 31)
 
 static ALWAYS_INLINE __m128i mm_zzage_epi16(__m128i v) { return _mm_xor_si128( mm_slli_epi16(v,1), mm_srai_epi16( v,15)); }
@@ -230,7 +232,7 @@ static ALWAYS_INLINE __m128i mm_xore_epi32( __m128i v, __m128i sv) { return _mm_
   _sv = _mm256_add_epi32(_sv, _cv); _cv = _mm256_set1_epi32(4*_mindelta_); do { _mm256_storeu_si256(_ov++, _sv), _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\
 } while(0)
 
-#elif defined(__SSE2__) || defined(__ARM_NEON) // -------------
+#elif defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64) // -------------
 // SIMD set value (memset)
 #define BITZERO32(_out_, _n_, _v_) do {\
   __m128i _sv_ = _mm_set1_epi32(_v_), *_ov = (__m128i *)(_out_), *_ove = (__m128i *)(_out_ + _n_);\
@@ -322,7 +324,7 @@ static ALWAYS_INLINE uint64_t rbit64(uint64_t x) {
 }
 #endif
 
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
 static ALWAYS_INLINE __m128i mm_rbit_epi16(__m128i v) { return mm_rbit_epi8(mm_rev_epi16(v)); }
 static ALWAYS_INLINE __m128i mm_rbit_epi32(__m128i v) { return mm_rbit_epi8(mm_rev_epi32(v)); }
 static ALWAYS_INLINE __m128i mm_rbit_epi64(__m128i v) { return mm_rbit_epi8(mm_rev_epi64(v)); }
lib/include_/conf.h (9 changes; mode changed: executable file → normal file)
@@ -112,7 +112,7 @@ static ALWAYS_INLINE unsigned ror64(unsigned x, int s) { return x >> s | x << (6
 #define ALIGNED(t,v,n) __declspec(align(n)) t v
 #define ALWAYS_INLINE  __forceinline
 #define NOINLINE       __declspec(noinline)
-#define _PACKED        __attribute__ ((packed))
+#define _PACKED        //__attribute__ ((packed))
 #define THREADLOCAL    __declspec(thread)
 #define likely(x)      (x)
 #define unlikely(x)    (x)
@@ -203,7 +203,8 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
     defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\
     defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \
     defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \
-    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
+    defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \
+    defined(__loongarch_lp64)
 #define ctou16(_cp_) (*(unsigned short *)(_cp_))
 #define ctou32(_cp_) (*(unsigned *)(_cp_))
 #define ctof16(_cp_) (*(_Float16 *)(_cp_))
@@ -225,7 +226,7 @@ static ALWAYS_INLINE void ltou64(unsigned long long *x, const void
 
 #define ltou64(_px_, _cp_) *(_px_) = *(uint64_t *)(_cp_)
 
-#elif defined(__ARM_FEATURE_UNALIGNED)
+#elif defined(__ARM_FEATURE_UNALIGNED) || defined(__loongarch_lp64)
 struct _PACKED longu   { uint64_t l; };
 struct _PACKED doubleu { double d; };
 #define ctou64(_cp_) ((struct longu *)(_cp_))->l
@@ -276,7 +277,7 @@ struct _PACKED doubleu { double d; };
     defined(__aarch64__) ||\
     defined(__mips64) ||\
    defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\
-    defined(__s390x__)
+    defined(__s390x__) || defined(__loongarch_lp64)
 #define __WORDSIZE 64
 #else
 #define __WORDSIZE 32
@@ -302,6 +302,324 @@ static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) { uint
 static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))); }
 #endif
 
+#elif defined(__loongarch_lp64)
+
+#include <lsxintrin.h>
+// mapping of SSE intrinsics to LoongArch LSX intrinsics
+
+#ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ----------------------------------------------------------
+#define _mm_set_epi8(u15,u14,u13,u12,\
+                     u11,u10, u9, u8,\
+                      u7, u6, u5, u4,\
+                      u3, u2, u1, u0) ({ uint8_t  __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (v4u32)__lsx_vld( _u, 0);})
+#define _mm_set_epi16( u7, u6, u5, u4,\
+                       u3, u2, u1, u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (v4u32)__lsx_vld( _u, 0);})
+
+#define _mm_set_epi32( u3, u2, u1, u0) ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; __lsx_vld(_u, 0);})
+#define _mm_set_epi64x(        u1, u0) ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (v4u32)__lsx_vld(_u, 0);})
+
+#else
+static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8,
+                                           uint8_t  u7, uint8_t  u6, uint8_t  u5, uint8_t  u4,
+                                           uint8_t  u3, uint8_t  u2, uint8_t  u1, uint8_t  u0) {
+  uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (__m128i)__lsx_vld(u, 0); }
+static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4,
+                                            uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (__m128i)__lsx_vld(u, 0); }
+static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return __lsx_vld(u, 0); }
+static ALWAYS_INLINE __m128i _mm_set_epi64x( uint64_t u1, uint64_t u0) { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (__m128i)__lsx_vld(u, 0); }
+#endif
+
+#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16( u0,u1,u2,u3,u4,u5,u6,u7)
+#define _mm_setr_epi32(u3,u2,u1,u0)             _mm_set_epi32( u0,u1,u2,u3)
+#define _mm_setr_epi64x(u1,u0)                  _mm_set_epi64x(u0,u1)
+
+#define _mm_set1_epi8( _u8_ )  (__m128i)__lsx_vreplgr2vr_b(_u8_)
+#define _mm_set1_epi16( _u16_) (__m128i)__lsx_vreplgr2vr_h(_u16_)
+#define _mm_set1_epi32( _u32_)          __lsx_vreplgr2vr_w(_u32_)
+#define _mm_set1_epi64x(_u64_) (__m128i)__lsx_vreplgr2vr_d(_u64_)
+#define _mm_setzero_si128()             __lsx_vreplgr2vr_w( 0 )
+
+#define _mm_cvtss_f32(_v_)     __lsx_vpickve2gr_s((__m128)(_v_), 0)
+#define _mm_setzero_ps()       (__m128)__lsx_vldi(0)
+#define _mm_set1_ps(_f32_)     (__m128)__lsx_vreplfr2vr_s(_f32_)
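Lane-order note on the mappings above: _mm_set_epi32 takes its arguments high lane first, which is why the aligned temporary array stores u0 at index 0. A quick self-check (sketch, using the extract mapping defined further down):

    __m128i v = _mm_set_epi32(3, 2, 1, 0);   /* lanes: [0, 1, 2, 3] */
    /* _mm_extract_epi32(v, 0) == 0 and _mm_extract_epi32(v, 3) == 3 */
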
+//---------------------------------------------- Arithmetic -----------------------------------------------------------------------
+#define _mm_add_epi8( _u_,_v_) (__m128i)__lsx_vadd_b((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_add_epi16(_u_,_v_) (__m128i)__lsx_vadd_h((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_add_epi32(_u_,_v_)          __lsx_vadd_w( _u_, _v_ )
+#define _mm_add_epi64(_u_,_v_) (__m128i)__lsx_vadd_d((__m128i)(_u_), (__m128i)(_v_))
+
+#define _mm_sub_epi8( _u_,_v_) (__m128i)__lsx_vsub_b((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_sub_epi16(_u_,_v_) (__m128i)__lsx_vsub_h((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_sub_epi32(_u_,_v_) (__m128i)__lsx_vsub_w((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_sub_epi64(_u_,_v_) (__m128i)__lsx_vsub_d((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_subs_epu8(_u_,_v_) (__m128i)__lsx_vssub_bu((__m128i)(_u_), (__m128i)(_v_))  // saturating unsigned subtract
+
+#define _mm_mullo_epi16(_u_,_v_) (__m128i)__lsx_vmul_h((__m128i)(_u_), (__m128i)(_v_))  // low 16 bits of the product
+#define _mm_mullo_epi32(_u_,_v_) (__m128i)__lsx_vmul_w((__m128i)(_u_), (__m128i)(_v_))
+#define mm_mullo_epu32( _u_,_v_) (__m128i)__lsx_vmul_w((__m128i)(_u_), (__m128i)(_v_))  // low half is sign-agnostic
+
+//#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)__lsx_vmulwh_h_h((__m128i)(_u_), (__m128i)(_v_))
+
+static ALWAYS_INLINE __m128i _mm_mulhi_epi16(__m128i u, __m128i v) {
+  __m128i evens = __lsx_vmulwev_w_h(u, v);  // a[0]*b[0], a[2]*b[2], ... widened to 32 bits
+  __m128i odds  = __lsx_vmulwod_w_h(u, v);  // a[1]*b[1], a[3]*b[3], ...
+
+  // 2. shift right by 16 to keep the high 16 bits of each product
+  evens = __lsx_vsrai_w(evens, 16);
+  odds  = __lsx_vsrai_w(odds, 16);
+
+  // 3. repack into 16-bit lanes, interleaving even and odd results
+  __m128i res = __lsx_vpackev_h(odds, evens);
+  return res;
+}
+
+#define _mm_mul_epu32(_u_, _v_)  (__m128i)__lsx_vmulwev_d_wu((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_adds_epu16(_u_, _v_) (__m128i)__lsx_vsadd_hu((__m128i)(_u_), (__m128i)(_v_))
+
+static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) {
+  // 1. signed 16-bit multiplies widened to 32-bit intermediates
+  __m128i mul_even = __lsx_vmulwev_w_h(u, v);  // even lanes (0*0, 2*2, 4*4, 6*6)
+  __m128i mul_odd  = __lsx_vmulwod_w_h(u, v);  // odd lanes  (1*1, 3*3, 5*5, 7*7)
+
+  // 2. add the neighbouring 32-bit products
+  __m128i sum = __lsx_vadd_w(mul_even, mul_odd);  // [0*0+1*1, 2*2+3*3, 4*4+5*5, 6*6+7*7]
+  return sum;
+}
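Scalar reference for the two widening helpers above (sketch of the SSE2 contracts these mappings have to reproduce):

    /* _mm_mulhi_epi16: per lane, the high 16 bits of the signed 32-bit product */
    static inline int16_t mulhi16(int16_t a, int16_t b) { return (int16_t)(((int32_t)a * b) >> 16); }
    /* _mm_madd_epi16: adjacent lane pairs multiplied and summed into 32 bits */
    static inline int32_t madd16(int16_t a0, int16_t b0, int16_t a1, int16_t b1) {
      return (int32_t)a0 * b0 + (int32_t)a1 * b1;
    }
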
+//---------------------------------------------- Special math functions -----------------------------------------------------------
+#define _mm_min_epu8( _u_, _v_) (__m128i)__lsx_vmin_bu((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_min_epu16(_u_, _v_) (__m128i)__lsx_vmin_hu((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_min_epi16(_u_, _v_) (__m128i)__lsx_vmin_h((__m128i)(_u_), (__m128i)(_v_))
+//---------------------------------------------- Logical --------------------------------------------------------------------------
+#define mm_testnz_epu32(_u_) __lsx_bnz_v(__lsx_vsrli_w((__m128i)(_u_), 31))  // nonzero if any 32-bit sign bit is set
+#define mm_testnz_epu8( _u_) __lsx_bnz_v(__lsx_vsrli_b((__m128i)(_u_),  7))  // nonzero if any 8-bit sign bit is set
+#define _mm_or_si128( _u_, _v_) (__m128i)__lsx_vor_v((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_and_si128(_u_, _v_) (__m128i)__lsx_vand_v((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_xor_si128(_u_, _v_) (__m128i)__lsx_vxor_v((__m128i)(_u_), (__m128i)(_v_))
+//---------------------------------------------- Shift ----------------------------------------------------------------------------
+#define mm_slli_epi8( _u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)> 7?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vslli_b((__m128i)(_u_), (_c_)))
+#define mm_slli_epi16(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_h(0):(__m128i)__lsx_vslli_h((__m128i)(_u_), (_c_)))
+#define mm_slli_epi32(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>31?(__m128i)__lsx_vreplgr2vr_w(0):(__m128i)__lsx_vslli_w((__m128i)(_u_), (_c_)))
+#define mm_slli_epi64(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>63?(__m128i)__lsx_vreplgr2vr_d(0):(__m128i)__lsx_vslli_d((__m128i)(_u_), (_c_)))
+#define _mm_slli_si128(_v_, _c_) (__m128i)((_c_)<1?(_v_):(_c_)>15?__lsx_vreplgr2vr_b(0):__lsx_vbsll_v((__m128i)(_v_), (_c_)))  // whole-register byte shift
+
+#define mm_srli_epi8( _u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)> 7?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vsrli_b((__m128i)(_u_), (_c_)))
+#define mm_srli_epi16(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_h(0):(__m128i)__lsx_vsrli_h((__m128i)(_u_), (_c_)))
+#define mm_srli_epi32(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>31?(__m128i)__lsx_vreplgr2vr_w(0):(__m128i)__lsx_vsrli_w((__m128i)(_u_), (_c_)))
+#define mm_srli_epi64(_u_, _c_) (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>63?(__m128i)__lsx_vreplgr2vr_d(0):(__m128i)__lsx_vsrli_d((__m128i)(_u_), (_c_)))
+#define _mm_srli_si128(_v_, _c_) (__m128i)((_c_)<1?(_v_):(_c_)>15?__lsx_vreplgr2vr_b(0):__lsx_vbsrl_v((__m128i)(_v_), (_c_)))
+
+#define mm_srai_epi8( _v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_b((__m128i)(_v_), (_c_)))
+#define mm_srai_epi16(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_h((__m128i)(_v_), (_c_)))
+#define mm_srai_epi32(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_w((__m128i)(_v_), (_c_)))
+#define mm_srai_epi64(_v_, _c_) (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_d((__m128i)(_v_), (_c_)))
+
+#define _mm_slli_epi8( _u_, _m_) (__m128i)__lsx_vsll_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
+#define _mm_slli_epi16(_u_, _m_) (__m128i)__lsx_vsll_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
+#define _mm_slli_epi32(_u_, _m_) (__m128i)__lsx_vsll_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
+#define _mm_slli_epi64(_u_, _m_) (__m128i)__lsx_vsll_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))
+
+#define _mm_srli_epi8( _u_, _m_) (__m128i)__lsx_vsrl_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
+#define _mm_srli_epi16(_u_, _m_) (__m128i)__lsx_vsrl_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
+#define _mm_srli_epi32(_u_, _m_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
+#define _mm_srli_epi64(_u_, _m_) (__m128i)__lsx_vsrl_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))
+
+#define _mm_srai_epi8( _u_, _m_) (__m128i)__lsx_vsra_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
+#define _mm_srai_epi16(_u_, _m_) (__m128i)__lsx_vsra_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
+#define _mm_srai_epi32(_u_, _m_) (__m128i)__lsx_vsra_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
+#define _mm_srai_epi64(_u_, _m_) (__m128i)__lsx_vsra_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))
+
+#define _mm_sll_epi8( _u_, _v_) (__m128i)__lsx_vsll_b((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_sll_epi16(_u_, _v_) (__m128i)__lsx_vsll_h((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_sll_epi32(_u_, _v_) (__m128i)__lsx_vsll_w((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_sll_epi64(_u_, _v_) (__m128i)__lsx_vsll_d((__m128i)(_u_), (__m128i)(_v_))
+
+#define _mm_srl_epi8( _u_, _v_) (__m128i)__lsx_vsrl_b((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_srl_epi16(_u_, _v_) (__m128i)__lsx_vsrl_h((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_srl_epi32(_u_, _v_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_srl_epi64(_u_, _v_) (__m128i)__lsx_vsrl_d((__m128i)(_u_), (__m128i)(_v_))
+
+#define _mm_sllv_epi32(_u_, _v_) (__m128i)__lsx_vsll_w((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_srlv_epi32(_u_, _v_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), (__m128i)(_v_))
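The mm_slli_*/mm_srli_* wrappers above guard the count because SSE shifts yield zero once the count reaches the element width, while the LSX immediate forms require an in-range constant. E.g. (sketch):

    __m128i z = mm_slli_epi16(v, 16);  /* all-zero lanes, matching _mm_slli_epi16 on x86 */
    __m128i s = mm_slli_epi16(v, 3);   /* expands to __lsx_vslli_h(v, 3) */
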
+//---------------------------------------------- Compare --------- true/false->1/0 (all bits set) ---------------------------------
+#define _mm_cmpeq_epi8( _u_, _v_) (__m128i)__lsx_vseq_b((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_cmpeq_epi16(_u_, _v_) (__m128i)__lsx_vseq_h((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_cmpeq_epi32(_u_, _v_) (__m128i)__lsx_vseq_w((__m128i)(_u_), (__m128i)(_v_))
+
+#define _mm_cmpgt_epi8( _u_, _v_) (__m128i)__lsx_vslt_b((__m128i)(_v_), (__m128i)(_u_))  // note the swapped operand order (lt -> gt)
+#define _mm_cmpgt_epi16(_u_, _v_) (__m128i)__lsx_vslt_h((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_cmpgt_epi32(_u_, _v_) (__m128i)__lsx_vslt_w((__m128i)(_v_), (__m128i)(_u_))
+
+#define _mm_cmpgt_epu16(_u_, _v_) (__m128i)__lsx_vslt_hu((__m128i)(_v_), (__m128i)(_u_))
+#define mm_cmpgt_epu32( _u_, _v_) (__m128i)__lsx_vslt_wu((__m128i)(_v_), (__m128i)(_u_))
+//---------------------------------------------- Load -----------------------------------------------------------------------------
+#define _mm_loadl_epi64(_u64p_)      (__m128i)__lsx_vldrepl_d(_u64p_, 0)  // load 64 bits, replicated into both halves
+#define mm_loadu_epi64p(_u64p_, _u_) (__m128i)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_u64p_), 0)
+#define _mm_loadu_si128(_ip_)        (__m128i)__lsx_vldx((const __m128i*)(_ip_), 0)
+#define _mm_load_si128(_ip_)         (__m128i)__lsx_vld((const __m128i*)(_ip_), 0)
+
+#define _mm_load_ps(_ip_)            (__m128)__lsx_vld((const float*)(_ip_), 0)
+#define _mm_loadu_ps(_ip_)           (__m128)__lsx_vldx((const float*)(_ip_), 0)
+#define _mm_load1_ps(_ip_)           (__m128)__lsx_vreplfr2vr_s(*(const float*)(_ip_))
+#define _mm_loadl_pi(_u_, _ip_)      (__m128)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_ip_), 0)  // replace the low  two floats
+#define _mm_loadh_pi(_u_, _ip_)      (__m128)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_ip_), 1)  // replace the high two floats
+//---------------------------------------------- Store ----------------------------------------------------------------------------
+#define _mm_storel_epi64(_ip_, _u_)  __lsx_vstelm_d((__m128i)(_u_), (uint64_t*)(_ip_), 0, 0)
+#define _mm_storeu_si128(_ip_, _u_)  __lsx_vstx((__m128i)(_u_), (__m128i*)(_ip_), 0)
+
+#define _mm_store_ps(_ip_, _u_)      __lsx_vst((__m128i)(_u_), (float*)(_ip_), 0)
+#define _mm_storeu_ps(_ip_, _u_)     __lsx_vstx((__m128i)(_u_), (float*)(_ip_), 0)
+#define _mm_store_ss(_ip_, _u_)      __lsx_vstelm_w((__m128i)(_u_), (float*)(_ip_), 0, 0)
+//---------------------------------------------- Convert --------------------------------------------------------------------------
+#define mm_cvtsi64_si128p(_u64p_, _u_) (__m128i)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_u64p_), 0)
+#define _mm_cvtsi64_si128(_u_)         (__m128i)__lsx_vreplgr2vr_d(_u_)
+//---------------------------------------------- Reverse bits/bytes ---------------------------------------------------------------
+static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) {  // reverse the bits inside every byte via two nibble lookups
+  const __m128i lut = _mm_set_epi8(0x0f,0x07,0x0b,0x03,0x0d,0x05,0x09,0x01,0x0e,0x06,0x0a,0x02,0x0c,0x04,0x08,0x00);
+  __m128i lo = __lsx_vand_v(v, __lsx_vreplgr2vr_b(0x0f)), hi = __lsx_vsrli_b(v, 4);
+  return __lsx_vor_v(__lsx_vslli_b(__lsx_vshuf_b(lut, lut, lo), 4), __lsx_vshuf_b(lut, lut, hi));
+}
+#define mm_rev_epi16(_v_) _mm_shuffle_epi8(_v_, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1))  // byte swap per 16-bit lane
+#define mm_rev_epi32(_v_) _mm_shuffle_epi8(_v_, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3))  // byte swap per 32-bit lane
+#define mm_rev_epi64(_v_) _mm_shuffle_epi8(_v_, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7))  // byte swap per 64-bit lane
+//--------------------------------------------- Insert/extract --------------------------------------------------------------------
+#define mm_extract_epi32x(_u_, _u32_, _id_)  (*(uint32_t*)&(_u32_) = __lsx_vpickve2gr_wu((__m128i)(_u_), (_id_)))
+#define _mm_extract_epi64x(_u_, _u64_, _id_) (*(uint64_t*)&(_u64_) = __lsx_vpickve2gr_du((__m128i)(_u_), (_id_)))
+
+#define _mm_extract_epi8( _u_, _id_) __lsx_vpickve2gr_b((__m128i)(_u_), (_id_))
+#define _mm_extract_epi16(_u_, _id_) __lsx_vpickve2gr_h((__m128i)(_u_), (_id_))
+#define _mm_extract_epi32(_u_, _id_) __lsx_vpickve2gr_w((__m128i)(_u_), (_id_))
+#define mm_extract_epu32( _u_, _id_) __lsx_vpickve2gr_wu((__m128i)(_u_), (_id_))
+#define _mm_cvtsi128_si32(_u_)       __lsx_vpickve2gr_w((__m128i)(_u_), 0)
+#define _mm_cvtsi128_si64(_u_)       __lsx_vpickve2gr_d((__m128i)(_u_), 0)
+
+#define _mm_insert_epu32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const uint32_t*)(_u32p_), (_id_))
+#define mm_insert_epi32p( _u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const int32_t*)(_u32p_), (_id_))
+#define _mm_cvtsi32_si128(_x_)               (__m128i)__lsx_vinsgr2vr_w(__lsx_vldi(0), (_x_), 0)
+
+#define _mm_blendv_epi8(_u_, _v_, _m_) (__m128i)__lsx_vbitsel_v((__m128i)(_u_), (__m128i)(_v_), (__m128i)(_m_))
+//---------------------------------------------- Miscellaneous --------------------------------------------------------------------
+#define _mm_alignr_epi8(_u_, _v_, _m_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_u_), _mm_set_epi8((_m_)+15,(_m_)+14,(_m_)+13,(_m_)+12,(_m_)+11,(_m_)+10,(_m_)+9,(_m_)+8,(_m_)+7,(_m_)+6,(_m_)+5,(_m_)+4,(_m_)+3,(_m_)+2,(_m_)+1,(_m_)))
+#define _mm_packs_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0))
+#define _mm_packs_epi32(_u_, _v_) (__m128i)__lsx_vpickev_h(__lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0))
+
+#define _mm_packs_epu16(_u_, _v_)  (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_packus_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0))
+/* static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { */
+/*   const __m128i zero = __lsx_vldi(0); */
+/*   const __m128i mask = __lsx_vldi(0x0102040810204080); */
+/*   __m128i signs = __lsx_vsrli_b(v, 7);          // extract the sign bits into bit 0 */
+/*   __m128i masked = __lsx_vand_v(signs, mask);   // apply the bit weights */
+/*   __m128i sum = __lsx_vhaddw_wu_hu(__lsx_vhaddw_hu_bu(masked, zero), zero); */
+/*   return __lsx_vpickve2gr_hu(sum, 0) & 0xFFFF; */
+/* } */
+
+static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
+  // step 1: 0xFF in every byte whose sign bit is set
+  __m128i ones = __lsx_vslti_b(v, 0);
+
+  // step 2: LSB-first byte weights 0x01..0x80, repeated for each 64-bit half
+  const uint64_t w[2] = { 0x8040201008040201ULL, 0x8040201008040201ULL };
+  __m128i masked = __lsx_vand_v(ones, __lsx_vld((void *)w, 0));
+
+  // step 3: horizontal pair sums, 8-bit -> 16-bit -> 32-bit -> 64-bit per half
+  masked = __lsx_vhaddw_hu_bu(masked, masked);
+  masked = __lsx_vhaddw_wu_hu(masked, masked);
+  masked = __lsx_vhaddw_du_wu(masked, masked);
+
+  // step 4: the low half yields bits 0-7, the high half bits 8-15
+  return (uint16_t)(__lsx_vpickve2gr_du(masked, 0) | __lsx_vpickve2gr_du(masked, 1) << 8);
+}
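Contract being implemented above (sketch): bit i of the result is the sign bit of byte i, so a scalar reference is

    static inline unsigned movemask8_ref(const uint8_t *b) {  /* b = the 16 vector bytes */
      unsigned m = 0;
      for(int i = 0; i < 16; i++) m |= (unsigned)(b[i] >> 7) << i;
      return m;
    }
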
+//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff)
+
+static ALWAYS_INLINE uint8_t mm_movemask_epi8s(__m128i sv) {
+  const uint64_t w[2] = { 0x8040201008040201ULL, 0x8040201008040201ULL };  // LSB-first byte weights
+  __m128i tmp = __lsx_vand_v(sv, __lsx_vld((void *)w, 0));
+  tmp = __lsx_vhaddw_hu_bu(tmp, tmp);
+  tmp = __lsx_vhaddw_wu_hu(tmp, tmp);
+  return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, tmp), 0);
+}
+
+static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) {
+  const uint64_t w[2] = { 0x0008000400020001ULL, 0x0080004000200010ULL };  // 16-bit lane weights 1,2,4,...,128
+  __m128i tmp = __lsx_vand_v(v, __lsx_vld((void *)w, 0));
+  tmp = __lsx_vhaddw_wu_hu(tmp, tmp);
+  tmp = __lsx_vhaddw_du_wu(tmp, tmp);
+  return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
+}
+
+static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) {
+  // 1. 32-bit lane weights 1,2,4,8 (little-endian within each 64-bit half)
+  __m128i mask = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0x0000000200000001LL), 0x0000000800000004LL, 1);
+
+  // 2. apply the weights
+  __m128i masked = __lsx_vand_v(v, mask);
+
+  // 3. horizontal sums
+  __m128i sum2 = __lsx_vhaddw_du_wu(masked, masked);  // 4x32 -> 2x64
+  __m128i sum1 = __lsx_vhaddw_qu_du(sum2, sum2);      // 2x64 -> 1x128
+
+  // 4. extract the result
+  return (uint32_t)__lsx_vpickve2gr_d(sum1, 0);
+}
+
+static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) {
+  // 1. 64-bit lane weights 1 and 2
+  const __m128i mask = { 1ULL, 2ULL };
+
+  // 2. apply the weights and combine both lanes directly
+  __m128i masked = __lsx_vand_v(v, mask);
+  return __lsx_vpickve2gr_d(masked, 0) | __lsx_vpickve2gr_d(masked, 1);
+}
+// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
+#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_))
+
+#define _mm_shuffle_epi8(_u_, _v_) (__m128i)__lsx_vshuf_b((__m128i)(_u_), (__m128i)(_u_), (__m128i)(_v_))
+
+#define mm_shuffle_nnnn_epi32(_v_, _m_) (__m128i)__lsx_vreplvei_w((__m128i)(_v_), (_m_))
+
+#ifdef USE_MACROS
+#define mm_shuffle_2031_epi32(_u_) ({__m128i _rev = __lsx_vshuf4i_w((_u_), 0x1B); __lsx_vshuf4i_w(_rev, 0xD8);})
+#define mm_shuffle_3120_epi32(_u_) __lsx_vshuf4i_w((_u_), 0xD8)
+#else
+static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) {__m128i rev = __lsx_vshuf4i_w(v, 0x1B); return __lsx_vshuf4i_w(rev, 0xD8);}
+static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0xD8);}
+#endif
+
+#if defined(USE_MACROS) || defined(__clang__)
+#define _mm_shuffle_epi32(_u_, _m_)  (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), (_m_))
+#define _mm_shuffle_epi32s(_u_, _m_) (__m128i)__lsx_vshuf_w((__m128i)(_u_), (__m128i)(_u_), (__m128i)(v4u32){(_m_)&3, ((_m_)>>2)&3, ((_m_)>>4)&3, ((_m_)>>6)&3})
+#else
+static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _u_, const unsigned _m_) {return (__m128i)__lsx_vshuf4i_w((__m128i)_u_, _m_);}
+
+static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _u_, const unsigned _m_) {
+  const uint32_t idx0 = (_m_)      & 0x3;
+  const uint32_t idx1 = ((_m_)>>2) & 0x3;
+  const uint32_t idx2 = ((_m_)>>4) & 0x3;
+  const uint32_t idx3 = ((_m_)>>6) & 0x3;
+  return (__m128i)__lsx_vshuf_w((__m128i)_u_, (__m128i)_u_, (__m128i)(v4u32){idx0, idx1, idx2, idx3});
+}
+#endif
+#ifdef USE_MACROS
+#define _mm_unpacklo_epi8( _u_,_v_) (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_unpacklo_epi16(_u_,_v_) (__m128i)__lsx_vilvl_h((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_unpacklo_epi32(_u_,_v_) (__m128i)__lsx_vilvl_w((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_unpacklo_epi64(_u_,_v_) (__m128i)__lsx_vilvl_d((__m128i)(_v_), (__m128i)(_u_))
+
+#define _mm_unpackhi_epi8( _u_,_v_) (__m128i)__lsx_vilvh_b((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_unpackhi_epi16(_u_,_v_) (__m128i)__lsx_vilvh_h((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_unpackhi_epi32(_u_,_v_) (__m128i)__lsx_vilvh_w((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_unpackhi_epi64(_u_,_v_) (__m128i)__lsx_vilvh_d((__m128i)(_v_), (__m128i)(_u_))
+#else
+static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_b((__m128i)_v_, (__m128i)_u_);}
+static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_h((__m128i)_v_, (__m128i)_u_);}
+static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_w((__m128i)_v_, (__m128i)_u_);}
+static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_d((__m128i)_v_, (__m128i)_u_);}
+
+static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_b((__m128i)_v_, (__m128i)_u_);}
+static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_h((__m128i)_v_, (__m128i)_u_);}
+static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_w((__m128i)_v_, (__m128i)_u_);}
+static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_d((__m128i)_v_, (__m128i)_u_);}
+#endif
+
 #else //----------------- intel SSE2/SSSE3 ( wraper functions compatible with intel/arm; permits to have one source code version for arm+intel) --------------
 #define mm_movemask_epu32(_v_) _mm_movemask_ps(_mm_castsi128_ps(_v_))
 #define mm_movemask_epu16(_v_) _mm_movemask_epi8(_v_)
@@ -45,6 +45,8 @@
 #include <emmintrin.h>
 #elif defined(__ARM_NEON)
 #include <arm_neon.h>
 #include "include_/sse_neon.h"
+#elif defined(__loongarch_lp64)
+#include "include_/sse_neon.h"
 #endif
 
@@ -728,7 +730,7 @@ void T2(TPDEC256V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
 #endif //TPDEC256V
 #else //__AVX2__
 
-#if (defined(__SSE3__) || defined(__ARM_NEON)) && (ESIZE == 2 || ESIZE == 4 || ESIZE == 8)
+#if (defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && (ESIZE == 2 || ESIZE == 4 || ESIZE == 8)
 #define ST(_p_,_v_,_i_)  _mm_storeu_si128((__m128i *)SIE(_p_,_i_), _v_)
 #define ST0(_p_,_v_)     _mm_storeu_si128((__m128i *)(_p_), _v_)
 
@@ -737,7 +739,7 @@ void T2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
   unsigned      stride = v/STRIDE;
   unsigned char *op,*ip;
 
-#if defined(__SSE3__) || defined(__ARM_NEON)
+#if defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   #if ESIZE == 2
   __m128i sf = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1,
                             14, 12, 10, 8, 6, 4, 2, 0);
@@ -764,7 +766,7 @@ void T2(TPENC128V, ESIZE)(unsigned char *in, unsigned n, unsigned char *out) {
 
   for(ip = in, op = out; ip != in+v; ip+=ESIZE*16,op += ESIZE*16/STRIDE) { unsigned char *p = op; PREFETCH(ip+(ESIZE*16)*ESIZE,0);
     __m128i iv[ESIZE], ov[ESIZE == 2 ? ESIZE + 2 : ESIZE];
-#if defined(__SSSE3__) || defined(__ARM_NEON)
+#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
   #if ESIZE == 2
     #ifdef __ARM_NEON
     uint8x16x2_t w = vld2q_u8(ip);
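For ESIZE == 2 the shuffle constant above performs the byte transpose that splits interleaved 16-bit elements into a low-byte plane and a high-byte plane. Scalar sketch:

    /* src: 16 interleaved uint16 values; lo/hi: the two 16-byte output planes */
    for(int i = 0; i < 16; i++) { lo[i] = src[2*i]; hi[i] = src[2*i+1]; }
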
lib/trle.c (10 changes)
@@ -26,11 +26,15 @@
 #include <string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #ifdef __APPLE__
+#ifdef HAVE_MALLOC_MALLOC
+#include <sys/malloc.h>
+#else
 #include <malloc/malloc.h>
+#endif
 #else
 #include <malloc.h>
 #endif
 #ifdef _MSC_VER
 #include "vs/getopt.h"
 #else
lib/trled.c (10 changes)
@@ -43,6 +43,8 @@
 #include <emmintrin.h>
 #elif defined(__ARM_NEON)
 #include <arm_neon.h>
 #include "include_/sse_neon.h"
+#elif defined(__loongarch_lp64)
+#include "include_/sse_neon.h"
 #endif
 #include "include_/conf.h"
@@ -74,7 +76,7 @@ unsigned _srled8(const unsigned char *__restrict in, unsigned char *__restrict o
 #endif
   if(outlen >= SRLE8)
     while(op < out+(outlen-SRLE8)) {
-#if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON)
+#if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON) || defined(__loongarch_lp64)
       uint32_t mask;
   #ifdef __AVX2__
       __m256i v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, ev)); if(mask) goto a; op += 32; ip += 32;
@@ -127,10 +129,12 @@ static inline unsigned _srled8x(const unsigned char *__restrict in, unsigned cha
   __m128i ev = _mm_set1_epi8(e);
 #elif defined(__ARM_NEON)
   uint8x8_t ev = vdup_n_u8(e);
+#elif defined(__loongarch_lp64)
+  __m128i ev = __lsx_vreplgr2vr_b(e);
 #endif
   if(outlen >= SRLE8)
     while(op < out+(outlen-SRLE8)) {
-#if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON)
+#if defined(__AVX2__) || defined(__SSE__) //|| defined(__ARM_NEON) || defined(__loongarch_lp64)
      uint32_t mask;
   #ifdef __AVX2__
       __m256i v = _mm256_loadu_si256((__m256i*)ip); _mm256_storeu_si256((__m256i *)op, v); mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, ev)); if(mask) goto a; op += 32; ip += 32;
@ -326,7 +330,7 @@ unsigned T2(_srled, USIZE)(const unsigned char *__restrict in, unsigned char *__
|
||||
#ifdef __AVX2__
|
||||
#define _mm256_set1_epi64 _mm256_set1_epi64x
|
||||
__m256i ev = T2(_mm256_set1_epi, USIZE)(e);
|
||||
#elif (defined(__SSE__) /*|| defined(__ARM_NEON)*/)
|
||||
#elif (defined(__SSE__) /*|| defined(__ARM_NEON) || defined(__loongarch_lp64)*/)
|
||||
// #if USIZE != 64
|
||||
#define _mm_set1_epi64 _mm_set1_epi64x
|
||||
__m128i ev = T2(_mm_set1_epi, USIZE)(e);
|
||||
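Note that in _srled8x the escape byte is splatted with the native `__lsx_vreplgr2vr_b`, while the `_mm_movemask_epi8` fast path above it stays commented out for both NEON and LSX. Neither ISA has a single-instruction byte movemask, but LSX can emulate one cheaply; a hedged sketch of such an emulation (an untested assumption, not what the repository currently does):

/* Hedged sketch: an LSX stand-in for _mm_movemask_epi8. vmskltz.b gathers
   the sign bit of each of the 16 bytes into bits 0..15 of element 0, which
   vpickve2gr.hu then extracts. Assumes GCC's lsxintrin.h; untested. */
#if defined(__loongarch_lp64)
static inline unsigned lsx_movemask_epi8(__m128i v) {
  return __lsx_vpickve2gr_hu(__lsx_vmskltz_b(v), 0);
}
/* usage in the loop above would be:
   mask = lsx_movemask_epi8(__lsx_vseq_b(v, ev)); if(mask) goto a; */
#endif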

lib/v8.c
@ -1007,7 +1007,7 @@ unsigned char *T2(V8ENC,32)(uint32_t *__restrict in, unsigned n, unsigned char *
_mm_storeu_si128((__m128i *)op, _mm256_castsi256_si128( ov3)); op += LEN32(m1,2);
_mm_storeu_si128((__m128i *)op, _mm256_extracti128_si256(ov3,1)); op += LEN32(m1,3);
}
#elif defined(__SSSE3__) || defined(__ARM_NEON) // https://gist.github.com/aqrit/746d2f5e4ad1909230e2283272333dc1
#elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) // https://gist.github.com/aqrit/746d2f5e4ad1909230e2283272333dc1
VEINI128v32; const __m128i cv1_8 = _mm_set1_epi8(1), cv7f00 = _mm_set1_epi16(0x7f00);
for(ip = in; ip != in+(n&~(32-1)); ip += 32, PNEXT(out,op,8) ) {
__m128i iv0 = _mm_loadu_si128(ip ),
@ -1149,7 +1149,7 @@ unsigned char *T2(V8DEC,32)(unsigned char *__restrict in, unsigned n, uint32_t
}
}
}
#elif defined(__ARM_NEON) || defined(__SSSE3__) // optimzed for ARM ----------------------------------------------------------
#elif defined(__ARM_NEON) || defined(__loongarch_lp64) || defined(__SSSE3__) // optimzed for ARM ----------------------------------------------------------
VDINI128v32;
for(; op != out+(n&~(32-1)); op += 32) { //PREFETCH(ip+384,0);
uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4);
@ -1223,7 +1223,7 @@ unsigned char *T2(V8ENC,16)(uint16_t *__restrict in, unsigned n, unsigned char *
uint16_t *ip,v;
unsigned char *op = DATABEG(out,n,16);

#if defined(__SSSE3__) || defined(__ARM_NEON) //--------------------------------
#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //--------------------------------
VEINI128v16; const __m128i cv1_8 = _mm_set1_epi8(1);
for(ip = in; ip != in+(n&~(64-1)); ip += 64, PNEXT(out,op,8)) { //PREFETCH(ip+512,0);
__m128i iv0 = _mm_loadu_si128(ip ),
@ -1291,7 +1291,7 @@ unsigned char *T2(V8DEC,16)(unsigned char *__restrict in, unsigned n, uint16_t
unsigned char *ip = DATABEG(in,n,16);
uint16_t v;

#if defined(__SSSE3__) || defined(__ARM_NEON)//-----------------------
#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)//-----------------------
VDINI128v16;
for(op = out; op != out+(n&~(64-1)); op += 64) { PREFETCH(ip+512,0);
uint32_t m0 = ctou32(IP), m1 = ctou32(IP+4);

@ -328,7 +328,7 @@ unsigned char *T2(VBDDEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_

if(in[0] == 0xfe) {
in++;
#if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32
#if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
#if VDELTA == 0
if(n) T2(BITZERO, USIZE)(out, n, start);
#else

@ -210,7 +210,7 @@ size_t p4nsdec64(unsigned char *in, size_t n, uint64_t *out) { uint64_t *op,sta
#undef _P4ENC
#undef P4ENC
#undef BITPACK
#elif defined(__SSE3__) || defined(__ARM_NEON) //--------------------------------------------------
#elif defined(__SSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64) //--------------------------------------------------
#define BITDELTA bitdienc
#define HYBRID 1
#define P4BITS _p4bits

lib/vp4d.c
@ -40,7 +40,7 @@

#define P4DELTA(a)
#define P4DELTA_(a)
#if defined(__SSSE3__) || defined(__ARM_NEON)
#if defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
extern char _shuffle_32[16][16]; // defined in bitunpack.c
extern char _shuffle_16[256][16];
#endif
@ -94,7 +94,7 @@ extern char _shuffle_16[256][16];

#undef BITUNDD
// #elif !defined(__SSE3__)
#elif defined(__SSSE3__) || defined(__ARM_NEON)
#elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
#define _P4DEC _p4dec
#define P4DEC p4dec
#define P4NDEC p4ndec
@ -175,7 +175,7 @@ extern char _shuffle_16[256][16];
#undef USIZE
#undef DELTA

// #elif defined(__SSSE3__) || defined(__ARM_NEON)
// #elif defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
#define VSIZE 128
#define P4DELTA(a)
#define P4DELTA_(a)
@ -305,7 +305,7 @@ ALWAYS_INLINE unsigned char *T2(_P4DEC, USIZE)(unsigned char *__restrict in, uns
} //out += 64;
}
}
#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && USIZE == 32
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
{ uint_t *_op = out,*op,*pex = ex;
for(i = 0; i < p4dn; i++) {
for(op=_op; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf;
@ -313,7 +313,7 @@ ALWAYS_INLINE unsigned char *T2(_P4DEC, USIZE)(unsigned char *__restrict in, uns
} _op+=64;
}
}
#elif (defined(__SSSE3__) || defined(__ARM_NEON)) && USIZE == 16
#elif (defined(__SSSE3__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 16
{ uint_t *_op = out, *op, *pex = ex;
for(i = 0; i < p4dn; i++) {
for(op = _op; bb[i]; bb[i] >>= 8,op += 8) { const unsigned char m = bb[i];
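The two patch loops above walk the exception bitmap bb[i] four bits at a time for 32-bit lanes and eight bits at a time for 16-bit lanes; each set bit flags an output slot that takes the next exception value, and the SIMD variants do the same group-at-once via the matching _shuffle_32[m]/_shuffle_16[m] pshufb entry. A hedged scalar model of the 32-bit case (the bitmap type and the plain replacement are illustrative assumptions; the real decoder merges exceptions with the bit-packed low bits):

/* Hedged scalar model of the 4-bit-per-group patch loop above. Each nibble m
   flags which of the next four 32-bit outputs receive the next exception
   from pex; modeled as a plain replacement for illustration only. */
static void p4_patch32_model(uint32_t *op, uint64_t bb, const uint32_t **pex) {
  for(; bb; bb >>= 4, op += 4) {
    const unsigned m = (unsigned)bb & 0xf;
    for(unsigned j = 0; j < 4; j++)
      if(m & (1u << j)) op[j] = *(*pex)++;   /* flagged lane gets exception */
  }
}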

@ -27,6 +27,8 @@
#include <emmintrin.h>
#elif defined(__ARM_NEON)
#include <arm_neon.h>
#include "include_/sse_neon.h"
#elif defined(__loongarch_lp64)
#include "include_/sse_neon.h"
#endif
#include <stdio.h>
@ -343,7 +345,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t *
uint_t *q = op;
unsigned r = (w>>4)&0xf;
if(!r) { memcpy(op,ip+1, n*(USIZE/8)); return ip+n*(USIZE/8); }
#if defined(__SSE2__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
__m128i zv = _mm_setzero_si128();
#endif
ip++;
@ -355,7 +357,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t *
r -= 1;
op += r+1;
while(q < op) {
#if defined(__SSE2__) || defined(__ARM_NEON)
#if defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)
_mm_storeu_si128((__m128i *)q,zv); q = (uint_t *)((unsigned char *)q+16);
_mm_storeu_si128((__m128i *)q,zv); q = (uint_t *)((unsigned char *)q+16);
#else
@ -475,7 +477,7 @@ unsigned char *T2(VSDEC, USIZE)(unsigned char *__restrict ip, size_t n, uint_t *
else { vbxget32(ip, r); }
}
op += r+1; T2(vbxget, USIZE)(ip,u);
#if (defined(__SSE2__) || defined(__ARM_NEON)) && USIZE == 32
#if (defined(__SSE2__) || defined(__ARM_NEON) || defined(__loongarch_lp64)) && USIZE == 32
{ __m128i v = _mm_set1_epi32(u);
while(q < op) {
_mm_storeu_si128((__m128i *)q,v); q += 4;

makefile
@ -81,10 +81,14 @@ ifeq (,$(findstring clang, $(CC)))
OPT+=-falign-loops
endif

ifeq ($(ARCH),loongarch64)
CFLAGS+=-mlsx
CXXFLAGS+=-mlsx
endif
CFLAGS+=$(DEBUG) $(OPT)
#CFLAGS+=-Wno-macro-redefined -Wno-incompatible-pointer-types -Wno-tautological-constant-out-of-range-compare -Wno-discarded-qualifiers
CFLAGS+=-w -Wall -pedantic
CFLAGS+=-w -Wall -pedantic -Wno-macro-redefined -Wno-incompatible-pointer-types
CXXFLAGS+=-w
#-Wall -Wincompatible-pointer-types
ifeq ($(OS),$(filter $(OS),Linux GNU/kFreeBSD GNU OpenBSD FreeBSD DragonFly NetBSD MSYS_NT Haiku))
LDFLAGS+=-lrt -lm
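The makefile only adds -mlsx for ARCH=loongarch64, yet the source gates every SIMD path on `__loongarch_lp64`, which compilers define for the 64-bit ABI regardless of whether LSX is enabled. A hedged guard, not part of this patch, that would surface the mismatch at compile time using the `__loongarch_sx` macro GCC/Clang emit under -mlsx:

/* Hedged sketch: fail loudly if the LP64-gated SIMD paths are compiled
   without LSX. __loongarch_sx is defined when -mlsx is active, while
   __loongarch_lp64 only indicates the 64-bit ABI. Not in this patch. */
#if defined(__loongarch_lp64) && !defined(__loongarch_sx)
#error "LoongArch64 build without -mlsx: enable LSX (see the ARCH=loongarch64 block above)"
#endif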