@@ -503,16 +503,16 @@ static ALWAYS_INLINE __m128i mm_rbit_epi8(_v_) {
#define _mm_insert_epu32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const uint32_t*)(_u32p_), (_id_))
#define mm_insert_epi32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const int32_t*)(_u32p_), (_id_))
-#define _mm_cvtsi32_si128(_x_) (__m128i)__lsx_vinsgr2vr_w(__lsx_vldi(0), (_x_), 0)
+#define _mm_cvtsi32_si128(_x_) (__m128i)__lsx_vinsgr2vr_w(__lsx_vreplgr2vr_w(0), (_x_), 0)

#define _mm_blendv_epi8(_u_, _v_, _m_) (__m128i)__lsx_vbitsel_v((__m128i)(_u_), (__m128i)(_v_), (__m128i)(_m_))
//---------------------------------------------- Miscellaneous --------------------------------------------------------------------
#define _mm_alignr_epi8(_u_, _v_, _m_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_u_), (__m128i){_m_,_m_+1,_m_+2,_m_+3,_m_+4,_m_+5,_m_+6,_m_+7, _m_+8,_m_+9,_m_+10,_m_+11,_m_+12,_m_+13,_m_+14,_m_+15})
-#define _mm_packs_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0))
-#define _mm_packs_epi32(_u_, _v_) (__m128i)__lsx_vpickev_h(__lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0))
+#define _mm_packs_epi16(_u_, _v_) (__m128i)__lsx_vilvl_d(__lsx_vssrlrni_b_h((__m128i)(_u_), (__m128i)(_v_), 0), __lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0))
+#define _mm_packs_epi32(_u_, _v_) (__m128i)__lsx_vilvl_d(__lsx_vssrlrni_h_w((__m128i)(_u_), (__m128i)(_v_), 0), __lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0))

-#define _mm_packs_epu16(_u_, _v_) (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_))
-#define _mm_packus_epi16(_u_, _v_) (__m128i)__lsx_vpickev_b(__lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0), __lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0))
+#define _mm_packs_epu16(_u_, _v_) (__m128i)__lsx_vilvl_d((__m128i)(_v_), (__m128i)(_u_))
+#define _mm_packus_epi16(_u_, _v_) (__m128i)__lsx_vilvl_d(__lsx_vssrlni_bu_h((__m128i)(_u_), (__m128i)(_v_), 0), __lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0))
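
/* Illustrative sketch (hypothetical helper, assumptions: LSX narrow-with-saturation path above):
   a quick check of the pack emulation. Values are kept small so signed and logical saturation
   agree; the SSE convention puts the first operand in the low half of the result. */
static ALWAYS_INLINE int mm_packs_epi32_check(void) {
  const v4i32 lo = { 1, 2, 3, 4 }, hi = { 5, 6, 7, 8 };
  __m128i p = _mm_packs_epi32((__m128i)lo, (__m128i)hi);              // expected halfwords: 1..8
  return __lsx_vpickve2gr_h(p, 0) == 1 && __lsx_vpickve2gr_h(p, 7) == 8;
}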
/* static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { */
/* const __m128i zero = __lsx_vldi(0); */
@@ -525,63 +525,39 @@
static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
// Step 1: extract the most significant bit (sign bit) of each byte
-__m128i signs = __lsx_vsrli_b(v, 7); // logical right shift of every byte by 7: the sign bit becomes bit 0
+__m128i signs = __lsx_vsrai_b(v, 7); // arithmetic right shift of every byte by 7: each byte becomes 0x00 or 0xFF

// Step 2: build the bit mask (LSB-first: 0x01, 0x02, 0x04, ...)
-const __m128i mask = __lsx_vld((void*)(uint64_t[]){0x0102040810204080}, 0);
+static const v16i8 mask = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7};

// Step 3: apply the bit mask
__m128i masked = __lsx_vand_v(signs, mask);

// Step 4: horizontal pairwise add (8-bit → 16-bit → 32-bit → 64-bit)
-__m128i sum16 = __lsx_vhaddw_hu_bu(masked, __lsx_vldi(0));
-__m128i sum32 = __lsx_vhaddw_wu_hu(sum16, __lsx_vldi(0));
+__m128i sum16 = __lsx_vhaddw_hu_bu(masked, masked);
+__m128i sum32 = __lsx_vhaddw_wu_hu(sum16, sum16);
+__m128i sum64 = __lsx_vhaddw_du_wu(sum32, sum32);

// Step 5: assemble the 16-bit result
-return __lsx_vpickve2gr_hu(sum32, 0) & 0xFFFF;
+return __lsx_vpickve2gr_hu(sum64, 0) | __lsx_vpickve2gr_hu(sum64, 4) << 8; // bits 0..7 sit in d[0], bits 8..15 in d[1] (halfword index 4)
}
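
/* Illustrative sketch (hypothetical helper): a minimal check of the movemask emulation above,
   assuming __lsx_vseq_b for the byte compare. Lanes 0,2,...,14 compare equal, so the expected
   mask is 0x5555 (LSB = lane 0). */
static ALWAYS_INLINE int mm_movemask_epi8_check(void) {
  const v16i8 a = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
  const v16i8 b = { 1, 0, 3, 0, 5, 0, 7, 0, 9,  0, 11,  0, 13,  0, 15,  0 };
  __m128i eq = __lsx_vseq_b((__m128i)a, (__m128i)b);                  // 0xFF where a[i] == b[i]
  return _mm_movemask_epi8(eq) == 0x5555;
}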
//-------- Neon movemask ------ All lanes must be 0 or -1 (=0xff, 0xffff or 0xffffffff)
static ALWAYS_INLINE uint8_t mm_movemask_epi8s(__m128i sv) {
const __m128i mask = __lsx_vld((void*)(uint64_t[]){0x0102040810204080, 0x0102040810204080}, 0); // vldi cannot encode this 64-bit pattern, load it instead
__m128i tmp = __lsx_vand_v(sv, mask);
tmp = __lsx_vhaddw_hu_bu(tmp, __lsx_vldi(0));
tmp = __lsx_vhaddw_wu_hu(tmp, __lsx_vldi(0));
return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, __lsx_vldi(0)), 0);
}

static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) {
const __m128i mask = __lsx_vld((void*)(uint64_t[]){0x0102040810204080, 0x0102040810204080}, 0); // vldi cannot encode this 64-bit pattern, load it instead
__m128i tmp = __lsx_vand_v(v, mask);
tmp = __lsx_vhaddw_wu_hu(tmp, __lsx_vldi(0));
return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, __lsx_vldi(0)), 0);
}

static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) {
// 1. Bit-mask constants (0x00000001, 0x00000002, 0x00000004, 0x00000008)
-const __m128i mask = __lsx_vldi(0x0000000100000002);
-__lsx_vinsgr2vr_d(mask, 0x0000000400000008, 1); // set the upper 64 bits of the mask
+const v4i32 mask = { 1, 1<<1, 1<<2, 1<<3 };

// 2. Apply the bit mask
__m128i masked = __lsx_vand_v(v, mask);

// 3. Horizontal add
-__m128i sum2 = __lsx_vhaddw_du_wu(masked, __lsx_vldi(0)); // 4x32 -> 2x64
-__m128i sum1 = __lsx_vhaddw_qu_du(sum2, __lsx_vldi(0)); // 2x64 -> 1x128
+__m128i sum1 = __lsx_vhaddw_du_wu(masked, masked); // 4x32 -> 2x64
+__m128i sum2 = __lsx_vhaddw_qu_du(sum1, sum1); // 2x64 -> 1x128

// 4. Extract the result
-return (uint32_t)__lsx_vpickve2gr_d(sum1, 0);
+return (uint32_t)__lsx_vpickve2gr_b(sum2, 0);
}
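
/* Illustrative sketch (hypothetical helper): mm_movemask_epu32 applied to a signed compare,
   assuming __lsx_vslt_w. a < b holds in lanes 0 and 2, so the expected mask is 0b0101. */
static ALWAYS_INLINE int mm_movemask_epu32_check(void) {
  const v4i32 a = { 1, 5, 3, 7 };
  const v4i32 b = { 2, 4, 9, 6 };
  __m128i lt = __lsx_vslt_w((__m128i)a, (__m128i)b);                  // all-ones where a[i] < b[i]
  return mm_movemask_epu32(lt) == 0x5;
}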
static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) {
// 1. Bit-mask constants (0x0000000000000001, 0x0000000000000002)
const __m128i mask = {1ULL, 2ULL};

// 2. Apply the bit mask and extract the result directly
__m128i masked = __lsx_vand_v(v, mask);
return __lsx_vpickve2gr_d(masked, 0) | __lsx_vpickve2gr_d(masked, 1);
}
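
/* Illustrative sketch (hypothetical helper): with 64-bit lanes that are all-ones or zero,
   the two mask bits are simply ORed together. The test vector is built with
   vreplgr2vr/vinsgr2vr to avoid assuming how __m128i is typedef'ed. */
static ALWAYS_INLINE int mm_movemask_epu64_check(void) {
  __m128i m = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(-1), 0, 1);        // lane 0 = all-ones, lane 1 = 0
  return mm_movemask_epu64(m) == 1;
}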
// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_))
@@ -590,13 +566,13 @@ static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) {
#define mm_shuffle_nnnn_epi32(_v_, _m_) (__m128i)__lsx_vreplvei_w((__m128i)(_v_), (_m_))
#ifdef USE_MACROS
-#define mm_shuffle_2031_epi32(_u_) ({__m128i rev = __lsx_vshuf4i_w(v, 0x1B); __lsx_vshuf4i_w(rev, 0xD8);})
-#define mm_shuffle_3120_epi32(_u_) __lsx_vshuf4i_w(v, 0xD8)
+#define mm_shuffle_2031_epi32(_u_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), 0x8D)
+#define mm_shuffle_3120_epi32(_u_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), 0xD8)
#else
-static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) {__m128i rev = __lsx_vshuf4i_w(v, 0x1B); return __lsx_vshuf4i_w(rev, 0xD8);}
+static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0x8D);}
static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0xD8);}
#endif
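
/* Illustrative sketch (hypothetical helper) of the lane orderings above: reading result lanes
   from high to low, mm_shuffle_3120_epi32 yields {3,1,2,0} and mm_shuffle_2031_epi32 yields
   {2,0,3,1}. */
static ALWAYS_INLINE int mm_shuffle_3120_check(void) {
  const v4i32 x = { 0, 1, 2, 3 };
  __m128i r = mm_shuffle_3120_epi32((__m128i)x);                      // lanes (low..high): 0,2,1,3
  return __lsx_vpickve2gr_w(r, 1) == 2 && __lsx_vpickve2gr_w(r, 2) == 1;
}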
// tomorrow
#if defined(USE_MACROS) || defined(__clang__)
#define _mm_shuffle_epi32(_u_, _m_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), (_m_))
#define _mm_shuffle_epi32s(_u_, _m_) (__m128i)__lsx_vshuf_w((__m128i)(_u_), (__m128i)(_u_), (__m128i){(_m_)&3, ((_m_)>>2)&3, ((_m_)>>4)&3, ((_m_)>>6)&3})
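
/* Illustrative sketch (hypothetical helper): _mm_shuffle_epi32 combined with the _MM_SHUFFLE
   helper defined above. _MM_SHUFFLE(0,1,2,3) = 0x1B reverses the lane order. */
static ALWAYS_INLINE int mm_shuffle_epi32_check(void) {
  const v4i32 x = { 10, 11, 12, 13 };
  __m128i r = _mm_shuffle_epi32((__m128i)x, _MM_SHUFFLE(0, 1, 2, 3)); // reversed: 13,12,11,10
  return __lsx_vpickve2gr_w(r, 0) == 13 && __lsx_vpickve2gr_w(r, 3) == 10;
}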