From 2cb48544d2c9ddc5fb8b4062147ce762f8e5e605 Mon Sep 17 00:00:00 2001 From: yangwenqing <1552539019@qq.com> Date: Tue, 10 Jun 2025 17:57:05 +0800 Subject: [PATCH] fix some SIMD functions Signed-off-by: yangwenqing <1552539019@qq.com> --- lib/include_/sse_neon.h | 62 +++++++++++++---------------------------- 1 file changed, 19 insertions(+), 43 deletions(-) diff --git a/lib/include_/sse_neon.h b/lib/include_/sse_neon.h index 0afbfd8..f78dcc5 100644 --- a/lib/include_/sse_neon.h +++ b/lib/include_/sse_neon.h @@ -503,16 +503,16 @@ static ALWAYS_INLINE __m128i mm_rbit_epi8(_v_) { #define _mm_insert_epu32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const uint32_t*)(_u32p_), (_id_)) #define mm_insert_epi32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const int32_t*)(_u32p_), (_id_)) -#define _mm_cvtsi32_si128(_x_) (__m128i)__lsx_vinsgr2vr_w(__lsx_vldi(0), (_x_), 0) +#define _mm_cvtsi32_si128(_x_) (__m128i)__lsx_vinsgr2vr_w(__lsx_vreplgr2vr_w(0), (_x_), 0) #define _mm_blendv_epi8(_u_, _v_, _m_) (__m128i)__lsx_vbitsel_v((__m128i)(_u_), (__m128i)(_v_), (__m128i)(_m_)) //---------------------------------------------- Miscellaneous -------------------------------------------------------------------- #define _mm_alignr_epi8(_u_, _v_, _m_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_u_), (__m128i){_m_,_m_+1,_m_+2,_m_+3,_m_+4,_m_+5,_m_+6,_m_+7, _m_+8,_m_+9,_m_+10,_m_+11,_m_+12,_m_+13,_m_+14,_m_+15}) -#define _mm_packs_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0)) -#define _mm_packs_epi32(_u_, _v_) (__m128i)__lsx_vpickev_h(__lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0)) +#define _mm_packs_epi16(_u_, _v_) (__m128i)__lsx_vssrani_b_h((__m128i)(_v_), (__m128i)(_u_), 0) /* signed-saturating 16->8 narrow (arithmetic shift by 0): low 8 bytes from _u_, high 8 bytes from _v_ */ +#define _mm_packs_epi32(_u_, _v_) 
(__m128i)__lsx_vssrani_h_w((__m128i)(_v_), (__m128i)(_u_), 0) /* signed-saturating 32->16 narrow (arithmetic shift by 0): low half from _u_, high half from _v_ */ -#define _mm_packs_epu16(_u_, _v_) (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_)) -#define _mm_packus_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0)) +#define _mm_packs_epu16(_u_, _v_) (__m128i)__lsx_vilvl_d((__m128i)(_v_), (__m128i)(_u_)) +#define _mm_packus_epi16(_u_, _v_) (__m128i)__lsx_vssrani_bu_h((__m128i)(_v_), (__m128i)(_u_), 0) /* signed 16-bit input, unsigned-saturating narrow to 8 bits: low 8 bytes from _u_, high 8 bytes from _v_ */ /* static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { */ /* const __m128i zero = __lsx_vldi(0); */ @@ -525,63 +525,39 @@ static ALWAYS_INLINE __m128i mm_rbit_epi8(_v_) { static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { // 步骤1:提取每个字节的最高位(符号位) - __m128i signs = __lsx_vsrli_b(v, 7); // 所有字节右移7位,符号位变bit0 + __m128i signs = __lsx_vsrai_b(v, 7); // arithmetic shift right by 7: each byte becomes 0xFF if its sign bit was set, otherwise 0x00 // 步骤2:创建位掩码 (LSB-first: 0x01, 0x02, 0x04,...) 
- const __m128i mask = __lsx_vld((void*)(uint64_t[]){0x0102040810204080}, 0); + const __m128i mask = (__m128i)(v16u8){1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7}; /* per-byte bit weights, repeated for each 64-bit half; unsigned lanes so 1<<7 fits */ // 步骤3:应用位掩码 __m128i masked = __lsx_vand_v(signs, mask); // 步骤4:水平相加(8-bit → 16-bit → 32-bit) - __m128i sum16 = __lsx_vhaddw_hu_bu(masked, __lsx_vldi(0)); - __m128i sum32 = __lsx_vhaddw_wu_hu(sum16, __lsx_vldi(0)); + __m128i sum16 = __lsx_vhaddw_hu_bu(masked, masked); + __m128i sum32 = __lsx_vhaddw_wu_hu(sum16, sum16); + __m128i sum64 = __lsx_vhaddw_du_wu(sum32, sum32); // 步骤5:提取低16位结果 - return __lsx_vpickve2gr_hu(sum32, 0) & 0xFFFF; + return __lsx_vpickve2gr_hu(sum64, 0) | (__lsx_vpickve2gr_hu(sum64, 4) << 8); /* halfword lane 4 is the start of the upper 64-bit sum */ } -//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff) - -static ALWAYS_INLINE uint8_t mm_movemask_epi8s(__m128i sv) { - const __m128i mask = __lsx_vldi(0x0102040810204080); - __m128i tmp = __lsx_vand_v(sv, mask); - tmp = __lsx_vhaddw_hu_bu(tmp, __lsx_vldi(0)); - tmp = __lsx_vhaddw_wu_hu(tmp, __lsx_vldi(0)); - return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, __lsx_vldi(0)), 0); -} - -static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { - const __m128i mask = __lsx_vldi(0x0102040810204080); - __m128i tmp = __lsx_vand_v(v, mask); - tmp = __lsx_vhaddw_wu_hu(tmp, __lsx_vldi(0)); - return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, __lsx_vldi(0)), 0); -} static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { // 1. 加载位掩码常量 (0x00000001, 0x00000002, 0x00000004, 0x00000008) - const __m128i mask = __lsx_vldi(0x0000000100000002); - __lsx_vinsgr2vr_d(mask, 0x0000000400000008, 1); // 设置高64位掩码 + const __m128i mask = (__m128i)(v4i32){ 1, 1<<1, 1<<2, 1<<3 }; /* one bit weight per 32-bit lane */ // 2. 应用位掩码 __m128i masked = __lsx_vand_v(v, mask); // 3. 
水平相加 - __m128i sum2 = __lsx_vhaddw_du_wu(masked, __lsx_vldi(0)); // 4x32 -> 2x64 - __m128i sum1 = __lsx_vhaddw_qu_du(sum2, __lsx_vldi(0)); // 2x64 -> 1x128 + __m128i sum1 = __lsx_vhaddw_du_wu(masked, masked); // 4x32 -> 2x64 + __m128i sum2 = __lsx_vhaddw_qu_du(sum1, sum1); // 2x64 -> 1x128 // 4. 提取结果 - return (uint32_t)__lsx_vpickve2gr_d(sum1, 0); + return (uint32_t)__lsx_vpickve2gr_b(sum2, 0); } -static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { - // 1. 加载位掩码常量 (0x0000000000000001, 0x0000000000000002) - const __m128i mask = {1ULL, 2ULL}; - - // 2. 应用位掩码并直接提取结果 - __m128i masked = __lsx_vand_v(v, mask); - return __lsx_vpickve2gr_d(masked, 0) | __lsx_vpickve2gr_d(masked, 1); -} // --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack ----------------------------------------- #define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_)) @@ -590,13 +566,13 @@ static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { #define mm_shuffle_nnnn_epi32(_v_, _m_) (__m128i)__lsx_vreplvei_w((__m128i)(_v_), (_m_)) #ifdef USE_MACROS -#define mm_shuffle_2031_epi32(_u_) ({__m128i rev = __lsx_vshuf4i_w(v, 0x1B); __lsx_vshuf4i_w(rev, 0xD8);}) -#define mm_shuffle_3120_epi32(_u_) __lsx_vshuf4i_w(v, 0xD8) +#define mm_shuffle_2031_epi32(_u_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), 0x8D) /* 0x8D == _MM_SHUFFLE(2,0,3,1) */ +#define mm_shuffle_3120_epi32(_u_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), 0xD8) /* 0xD8 == _MM_SHUFFLE(3,1,2,0) */ #else -static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) {__m128i rev = __lsx_vshuf4i_w(v, 0x1B); return __lsx_vshuf4i_w(rev, 0xD8);} +static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0x8D);} static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0xD8);} #endif - +// Macro forms of _mm_shuffle_epi32 / _mm_shuffle_epi32s, used when USE_MACROS is defined or when building with clang #if defined(USE_MACROS) || defined(__clang__) #define _mm_shuffle_epi32(_u_, _m_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), (_m_)) #define _mm_shuffle_epi32s(_u_, _m_) 
(__m128i)__lsx_vshuf_w((__m128i)(_u_), (__m128i)(_u_), (__m128i){(_m_)&3, ((_m_)>>2)&3, ((_m_)>>4)&3, ((_m_)>>6)&3})