/**
  Copyright (C) powturbo 2013-2023
  GPL v2 License

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

  - homepage : https://sites.google.com/site/powturbo/
  - github   : https://github.com/powturbo
  - twitter  : https://twitter.com/powturbo
  - email    : powturbo [_AT_] gmail [_DOT_] com
**/
// Intel SSE intrinsics mapped to ARM NEON (and LoongArch LSX), optimized for maximum speed and gcc/clang compatibility; minor changes to the source code may be required

#ifndef _SSE_NEON_H_
#define _SSE_NEON_H_
#include "../include_/conf.h"

#ifdef __ARM_NEON //------------------------------------------------------------------------------------------------------------------
#include <arm_neon.h>
#define __m128i uint32x4_t      // int32x4_t can also be used
#define __m128  float32x4_t

//#define USE_MACROS
#define uint8x16_to_8x8x2(_v_) ((uint8x8x2_t) { vget_low_u8(_v_), vget_high_u8(_v_) })

#ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ----------------------------------------------------------
#define _mm_set_epi8(u15,u14,u13,u12,\
                     u11,u10, u9, u8,\
                      u7, u6, u5, u4,\
                      u3, u2, u1, u0) ({ uint8_t  __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (uint32x4_t)vld1q_u8( _u);})
#define _mm_set_epi16( u7, u6, u5, u4,\
                       u3, u2, u1, u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (uint32x4_t)vld1q_u16(_u);})
//#define _mm_set_epi32( u3,u2,u1,u0)  ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; vld1q_u32(_u);})
//#define _mm_set_epi64x( u1,u0)       ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (uint32x4_t)vld1q_u64(_u);})
#define _mm_set_epi32(u3, u2, u1, u0)  vcombine_u32(vcreate_u32((uint64_t)(u1) << 32 | (u0)), vcreate_u32((uint64_t)(u3) << 32 | (u2)))
#define _mm_set_epi64x(u1, u0)         (__m128i)vcombine_u64(vcreate_u64(u0), vcreate_u64(u1))

#else
static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8,
                                           uint8_t u7,  uint8_t u6,  uint8_t u5,  uint8_t u4,
                                           uint8_t u3,  uint8_t u2,  uint8_t u1,  uint8_t u0) {
  uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (uint32x4_t)vld1q_u8( u); }
static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4,
                                            uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (uint32x4_t)vld1q_u16(u); }
static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return vld1q_u32(u); }
static ALWAYS_INLINE __m128i _mm_set_epi64x(uint64_t u1, uint64_t u0)                           { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (uint32x4_t)vld1q_u64(u); }
#endif

#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16(u0,u1,u2,u3,u4,u5,u6,u7)
#define _mm_setr_epi32(u3,u2,u1,u0)             _mm_set_epi32(u0,u1,u2,u3)
#define _mm_setr_epi64x(u1,u0)                  _mm_set_epi64x(u0,u1)

#define _mm_set1_epi8( _u8_ )  (__m128i)vdupq_n_u8( _u8_ )
#define _mm_set1_epi16( _u16_) (__m128i)vdupq_n_u16(_u16_)
#define _mm_set1_epi32( _u32_) (__m128i)vdupq_n_u32(_u32_)
#define _mm_set1_epi64x(_u64_) (__m128i)vdupq_n_u64(_u64_)
#define _mm_setzero_si128()    (__m128i)vdupq_n_u32( 0 )

#define _mm_cvtss_f32(_v_)     vgetq_lane_f32((float32x4_t)(_v_), 0)
#define _mm_setzero_ps()       (__m128)vdupq_n_f32(0)
#define _mm_set1_ps(_f32_)     (__m128)vdupq_n_f32(_f32_)
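
/* Note on argument order (illustrative example, not part of the original header):
   _mm_set_epi32 takes its arguments from the highest lane down to lane 0, while
   _mm_setr_epi32 takes them in memory (reversed) order, e.g.

     __m128i a = _mm_set_epi32( 3, 2, 1, 0);   // lane0=0, lane1=1, lane2=2, lane3=3
     __m128i b = _mm_setr_epi32(3, 2, 1, 0);   // lane0=3, lane1=2, lane2=1, lane3=0
     // _mm_extract_epi32(a, 0) == 0,  _mm_extract_epi32(b, 0) == 3
*/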
//---------------------------------------------- Arithmetic -----------------------------------------------------------------------
#define _mm_add_epi8(   _u_,_v_) (__m128i)vaddq_u8( (uint8x16_t)(_u_), (uint8x16_t)(_v_))
#define _mm_add_epi16(  _u_,_v_) (__m128i)vaddq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define _mm_add_epi32(  _u_,_v_) (__m128i)vaddq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_))
#define _mm_add_epi64(  _u_,_v_) (__m128i)vaddq_u64((uint64x2_t)(_u_), (uint64x2_t)(_v_))

#define _mm_sub_epi8(   _u_,_v_) (__m128i)vsubq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
#define _mm_sub_epi16(  _u_,_v_) (__m128i)vsubq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define _mm_sub_epi32(  _u_,_v_) (__m128i)vsubq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_))
#define _mm_sub_epi64(  _u_,_v_) (__m128i)vsubq_u64((uint64x2_t)(_u_), (uint64x2_t)(_v_))
#define _mm_subs_epu8(  _u_,_v_) (__m128i)vqsubq_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))

#define _mm_mullo_epi16(_u_,_v_) (__m128i)vmulq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
#define _mm_mullo_epi32(_u_,_v_) (__m128i)vmulq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))
#define mm_mullo_epu32( _u_,_v_)          vmulq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_))

#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)vqdmulhq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_)) // only for small values ??
static ALWAYS_INLINE __m128i _mm_mulhi_epi16(__m128i u, __m128i v) {
  int32x4_t    lo = vmull_s16(vget_low_s16( (int16x8_t)(u)), vget_low_s16( (int16x8_t)(v)));
  int32x4_t    hi = vmull_s16(vget_high_s16((int16x8_t)(u)), vget_high_s16((int16x8_t)(v)));
  uint16x8x2_t a  = vuzpq_u16((uint16x8_t)(lo), (uint16x8_t)(hi));
  return (__m128i)(vreinterpretq_s32_u16(a.val[1]));
}
#define _mm_mul_epu32(  _u_,_v_) (__m128i)vmull_u32(vget_low_u32(_u_),vget_low_u32(_v_))
#define _mm_adds_epu16( _u_,_v_) (__m128i)vqaddq_u16((uint16x8_t)(_u_),(uint16x8_t)(_v_))
static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) {
  int32x4_t mlo = vmull_s16(vget_low_s16( (int16x8_t)u), vget_low_s16( (int16x8_t)v)),
            mhi = vmull_s16(vget_high_s16((int16x8_t)u), vget_high_s16((int16x8_t)v));
  int32x2_t alo = vpadd_s32(vget_low_s32(mlo), vget_high_s32(mlo)),
            ahi = vpadd_s32(vget_low_s32(mhi), vget_high_s32(mhi));
  return (__m128i)vcombine_s32(alo, ahi);
}
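
/* Illustrative note (not in the original source): _mm_madd_epi16 multiplies the eight
   signed 16-bit lanes pairwise and adds adjacent products into four 32-bit lanes, e.g.

     __m128i a = _mm_set1_epi16(3), b = _mm_set1_epi16(4);
     __m128i s = _mm_madd_epi16(a, b);      // every 32-bit lane holds 3*4 + 3*4 = 24
*/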
//---------------------------------------------- Special math functions -----------------------------------------------------------
#define _mm_min_epu8(  _u_,_v_) (__m128i)vminq_u8( (uint8x16_t)(_u_), (uint8x16_t)(_v_))
#define _mm_min_epu16( _u_,_v_) (__m128i)vminq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define _mm_min_epi16( _u_,_v_) (__m128i)vminq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
//---------------------------------------------- Logical --------------------------------------------------------------------------
#define mm_testnz_epu32(_u_)    vmaxvq_u32(_u_) //vaddvq_u32(_u_)
#define mm_testnz_epu8( _u_)    vmaxv_u8(_u_)
#define _mm_or_si128(  _u_,_v_) (__m128i)vorrq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
#define _mm_and_si128( _u_,_v_) (__m128i)vandq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
#define _mm_xor_si128( _u_,_v_) (__m128i)veorq_u32( (uint32x4_t)(_u_), (uint32x4_t)(_v_))
//---------------------------------------------- Shift ----------------------------------------------------------------------------
#define mm_slli_epi8( _v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshlq_n_u8( (uint8x16_t)(_v_), (_c_)))  // c <= 7  (vshlq_n:0-7); c must be a constant
#define mm_slli_epi16(_v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshlq_n_u16((uint16x8_t)(_v_), (_c_)))  // c <= 15
#define mm_slli_epi32(_v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshlq_n_u32((uint32x4_t)(_v_), (_c_)))  // c <= 31
#define mm_slli_epi64(_v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshlq_n_u64((uint64x2_t)(_v_), (_c_)))  // c <= 63
#define _mm_slli_si128(_v_,_c_) (__m128i)vextq_u8(vdupq_n_u8(0), (uint8x16_t)(_v_), 16-(_c_))               // 1 <= c <= 15

#define mm_srli_epi8( _v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshrq_n_u8( (uint8x16_t)(_v_), (_c_)))  // c <= 8  (vshrq_n:1-N)
#define mm_srli_epi16(_v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshrq_n_u16((uint16x8_t)(_v_), (_c_)))  // c <= 16
#define mm_srli_epi32(_v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshrq_n_u32((uint32x4_t)(_v_), (_c_)))  // c <= 32
#define mm_srli_epi64(_v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshrq_n_u64((uint64x2_t)(_v_), (_c_)))  // c <= 64
#define _mm_srli_si128(_v_,_c_) (__m128i)vextq_u8((uint8x16_t)(_v_), vdupq_n_u8(0), (_c_))                  // 1 <= c <= 15

#define mm_srai_epi8( _v_,_c_)  (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshrq_n_s8( (int8x16_t)(_v_), (_c_)))   // c <= 8  (vshrq_n:1-N)
#define mm_srai_epi16( _v_,_c_) (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshrq_n_s16((int16x8_t)(_v_), (_c_)))   // c <= 16
#define mm_srai_epi32( _v_,_c_) (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshrq_n_s32((int32x4_t)(_v_), (_c_)))   // c <= 32
#define mm_srai_epi64( _v_,_c_) (__m128i)((_c_)<1?(_v_):(uint32x4_t)vshrq_n_s64((int64x2_t)(_v_), (_c_)))   // c <= 64

#define _mm_slli_epi8( _v_,_m_)  (__m128i)vshlq_u8( (uint8x16_t)(_v_), vdupq_n_s8( (_m_)))                  // parameter m: integer constant or variable
#define _mm_slli_epi16( _v_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_v_), vdupq_n_s16( (_m_)))
#define _mm_slli_epi32( _v_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_v_), vdupq_n_s32( (_m_)))
#define _mm_slli_epi64( _v_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_v_), vdupq_n_s64( (_m_)))

#define _mm_srli_epi8( _v_,_m_)  (__m128i)vshlq_u8( (uint8x16_t)(_v_), vdupq_n_s8( -(_m_)))
#define _mm_srli_epi16( _v_,_m_) (__m128i)vshlq_u16((uint16x8_t)(_v_), vdupq_n_s16(-(_m_)))
#define _mm_srli_epi32( _v_,_m_) (__m128i)vshlq_u32((uint32x4_t)(_v_), vdupq_n_s32(-(_m_)))
#define _mm_srli_epi64( _v_,_m_) (__m128i)vshlq_u64((uint64x2_t)(_v_), vdupq_n_s64(-(_m_)))

#define _mm_srai_epi8( _v_,_m_)  (__m128i)vshlq_s8( (int8x16_t)(_v_), vdupq_n_s8( -(_m_)))
#define _mm_srai_epi16( _v_,_m_) (__m128i)vshlq_s16((int16x8_t)(_v_), vdupq_n_s16(-(_m_)))
#define _mm_srai_epi32( _v_,_m_) (__m128i)vshlq_s32((int32x4_t)(_v_), vdupq_n_s32(-(_m_)))
#define _mm_srai_epi64( _v_,_m_) (__m128i)vshlq_s64((int64x2_t)(_v_), vdupq_n_s64(-(_m_)))

#define _mm_sll_epi8(  _u_,_v_) (__m128i)vshlq_s8(  (int8x16_t)(_u_), (int8x16_t)(_v_))                     // _v_: all lanes equal
#define _mm_sll_epi16( _u_,_v_) (__m128i)vshlq_s16( (int16x8_t)(_u_), (int16x8_t)(_v_))
#define _mm_sll_epi32( _u_,_v_) (__m128i)vshlq_s32( (int32x4_t)(_u_), (int32x4_t)(_v_))
#define _mm_sll_epi64( _u_,_v_) (__m128i)vshlq_s64( (int64x2_t)(_u_), (int64x2_t)(_v_))

#define _mm_srl_epi8(  _u_,_v_) (__m128i)vshlq_u8( (uint8x16_t)(_u_), vnegq_s8( (int8x16_t)(_v_)))          // logical right shift: left shift by negated count
#define _mm_srl_epi16( _u_,_v_) (__m128i)vshlq_u16((uint16x8_t)(_u_), vnegq_s16((int16x8_t)(_v_)))
#define _mm_srl_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vnegq_s32((int32x4_t)(_v_)))
#define _mm_srl_epi64( _u_,_v_) (__m128i)vshlq_u64((uint64x2_t)(_u_), vnegq_s64((int64x2_t)(_v_)))

#define _mm_sllv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_))                   // variable per-lane shift
#define _mm_srlv_epi32( _u_,_v_) (__m128i)vshlq_u32((uint32x4_t)(_u_), vnegq_s32((int32x4_t)(_v_)))
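
/* Usage note (illustrative, not from the original source): the lowercase mm_slli/mm_srli/mm_srai
   variants above require a compile-time constant shift count, while the _mm_slli/_mm_srli/_mm_srai
   forms also accept a runtime variable, e.g.

     __m128i v = _mm_set1_epi32(0x100);
     __m128i a = mm_srli_epi32(v, 4);        // constant count: lanes become 0x10
     int     n = 4;
     __m128i b = _mm_srli_epi32(v, n);       // variable count: same result
*/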
//---------------------------------------------- Compare ---------- result lanes: all bits set (true) / 0 (false) -------------------
#define _mm_cmpeq_epi8(  _u_,_v_) (__m128i)vceqq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
#define _mm_cmpeq_epi16( _u_,_v_) (__m128i)vceqq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
#define _mm_cmpeq_epi32( _u_,_v_) (__m128i)vceqq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))

#define _mm_cmpgt_epi8(  _u_,_v_) (__m128i)vcgtq_s8( ( int8x16_t)(_u_), ( int8x16_t)(_v_))
#define _mm_cmpgt_epi16( _u_,_v_) (__m128i)vcgtq_s16(( int16x8_t)(_u_), ( int16x8_t)(_v_))
#define _mm_cmpgt_epi32( _u_,_v_) (__m128i)vcgtq_s32(( int32x4_t)(_u_), ( int32x4_t)(_v_))

#define _mm_cmpgt_epu16( _u_,_v_) (__m128i)vcgtq_u16((uint16x8_t)(_u_), (uint16x8_t)(_v_))
#define mm_cmpgt_epu32(  _u_,_v_) (__m128i)vcgtq_u32((uint32x4_t)(_u_), (uint32x4_t)(_v_))
//---------------------------------------------- Load -----------------------------------------------------------------------------
#define _mm_loadl_epi64( _u64p_)    (__m128i)vcombine_s32(vld1_s32((int32_t const *)(_u64p_)), vcreate_s32(0))
#define mm_loadu_epi64p(_u64p_,_u_) (__m128i)vld1q_lane_u64((uint64_t *)(_u64p_), (uint64x2_t)(_u_), 0)
#define _mm_loadu_si128( _ip_)      vld1q_u32(_ip_)
#define _mm_load_si128(  _ip_)      vld1q_u32(_ip_)

#define _mm_load_ps(  _ip_)    (__m128)vld1q_f32((float32_t *)(_ip_))
#define _mm_loadu_ps( _ip_)    (__m128)vld1q_f32((float32_t *)(_ip_))
#define _mm_load1_ps( _ip_)    (__m128)vld1q_dup_f32((float32_t *)(_ip_))
#define _mm_loadl_pi(_v_,_ip_) (__m128)vcombine_f32((float32x2_t)vld1_f32((float32_t *)(_ip_)), (float32x2_t)vget_high_f32(_v_))
#define _mm_loadh_pi(_v_,_ip_) (__m128)vcombine_f32((float32x2_t)vget_low_f32(_v_), (float32x2_t)vld1_f32((const float *)(_ip_)))
//---------------------------------------------- Store ----------------------------------------------------------------------------
#define _mm_storel_epi64(_ip_,_v_) vst1q_lane_u64((uint64_t *)(_ip_), (uint64x2_t)(_v_), 0)
#define _mm_storeu_si128(_ip_,_v_) vst1q_u32((uint32_t *)(_ip_), _v_)

#define _mm_store_ps(  _ip_,_v_)   vst1q_f32(     (float32_t *)(_ip_), (float32x4_t)(_v_))
#define _mm_storeu_ps( _ip_,_v_)   vst1q_f32(     (float32_t *)(_ip_), (float32x4_t)(_v_))
#define _mm_store_ss(  _ip_,_v_)   vst1q_lane_f32((float32_t *)(_ip_), (float32x4_t)(_v_), 0)
//---------------------------------------------- Convert --------------------------------------------------------------------------
#define mm_cvtsi64_si128p(_u64p_,_v_) mm_loadu_epi64p(_u64p_,_v_)
#define _mm_cvtsi64_si128(_v_)        (__m128i)vdupq_n_u64(_v_)  //vld1q_s64(_v_)
//---------------------------------------------- Reverse bits/bytes ---------------------------------------------------------------
#define mm_rbit_epi8(_v_) (__m128i)vrbitq_u8( (uint8x16_t)(_v_))  // reverse bits within each byte
#define mm_rev_epi16(_v_)          vrev16q_u8((uint8x16_t)(_v_))  // reverse bytes within each element
#define mm_rev_epi32(_v_)          vrev32q_u8((uint8x16_t)(_v_))
#define mm_rev_epi64(_v_)          vrev64q_u8((uint8x16_t)(_v_))
//--------------------------------------------- Insert/extract --------------------------------------------------------------------
#define mm_extract_epi32x(_v_,_u32_,_id_)  vst1q_lane_u32((uint32_t *)&(_u32_), _v_, _id_)
#define _mm_extract_epi64x(_v_,_u64_,_id_) vst1q_lane_u64((uint64_t *)&(_u64_), (uint64x2_t)(_v_), _id_)

#define _mm_extract_epi8( _v_, _id_) vgetq_lane_u8( (uint8x16_t)(_v_), _id_)
#define _mm_extract_epi16(_v_, _id_) vgetq_lane_u16((uint16x8_t)(_v_), _id_)
#define _mm_extract_epi32(_v_, _id_) vgetq_lane_u32(_v_, _id_)
#define mm_extract_epu32( _v_, _id_) vgetq_lane_u32(_v_, _id_)
#define _mm_cvtsi128_si32(_v_)       vgetq_lane_u32((uint32x4_t)(_v_),0)
#define _mm_cvtsi128_si64(_v_)       vgetq_lane_u64((uint64x2_t)(_v_),0)

#define _mm_insert_epu32p(_v_,_u32p_,_id_) vsetq_lane_u32(ctou32(_u32p_), _v_, _id_)
#define mm_insert_epi32p( _v_,_u32p_,_id_) vld1q_lane_u32(_u32p_, (uint32x4_t)(_v_), _id_)
#define _mm_cvtsi32_si128(_x_)             (__m128i)vsetq_lane_s32(_x_, vdupq_n_s32(0), 0)

#define _mm_blendv_epi8(_u_,_v_,_m_) vbslq_u32(_m_,_v_,_u_)
//---------------------------------------------- Miscellaneous --------------------------------------------------------------------
#define _mm_alignr_epi8(_u_,_v_,_m_) (__m128i)vextq_u8( (uint8x16_t)(_v_), (uint8x16_t)(_u_), _m_)
#define _mm_packs_epi16( _u_,_v_) (__m128i)vcombine_s8( vqmovn_s16((int16x8_t)(_u_)), vqmovn_s16((int16x8_t)(_v_)))
#define _mm_packs_epi32( _u_,_v_) (__m128i)vcombine_s16(vqmovn_s32((int32x4_t)(_u_)), vqmovn_s32((int32x4_t)(_v_)))

#define _mm_packs_epu16( _u_,_v_) (__m128i)vcombine_u8( vqmovn_u16((uint16x8_t)(_u_)), vqmovn_u16((uint16x8_t)(_v_)))
#define _mm_packus_epi16(_u_,_v_) (__m128i)vcombine_u8( vqmovun_s16((int16x8_t)(_u_)), vqmovun_s16((int16x8_t)(_v_)))

static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
  const uint8x16_t __attribute__ ((aligned (16))) m = {1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7, 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7};
  uint8x16_t mv = (uint8x16_t)vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vandq_u8(vcltq_s8((int8x16_t)v, vdupq_n_s8(0)), m))));
  return vgetq_lane_u8(mv, 8) << 8 | vgetq_lane_u8(mv, 0);
}
//-------- Neon movemask ------ all lanes must be 0 or -1 (= 0xff, 0xffff or 0xffffffff)
#ifdef __aarch64__
static ALWAYS_INLINE uint8_t  mm_movemask_epi8s(uint8x8_t sv) { const uint8x8_t  m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddv_u8( vand_u8( sv, m)); } // short (64-bit) version, ARM only
//static ALWAYS_INLINE uint16_t mm_movemask_epu16(uint32x4_t v) { const uint16x8_t m = { 1, 1<<2, 1<<4, 1<<6, 1<<8, 1<<10, 1<<12, 1<<14}; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) { const uint16x8_t m = { 1, 1<<1, 1<<2, 1<<3, 1<<4, 1<< 5, 1<< 6, 1<<7 }; return vaddvq_u16(vandq_u16((uint16x8_t)v, m)); }
static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) { const uint32x4_t m = { 1, 1<<1, 1<<2, 1<<3 };                            return vaddvq_u32(vandq_u32((uint32x4_t)v, m)); }
static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) { const uint64x2_t m = { 1, 1<<1 };                                        return vaddvq_u64(vandq_u64((uint64x2_t)v, m)); }

//static ALWAYS_INLINE uint64_t mm_movemask4_epu8(__m128i v) { return vgetq_lane_u64((uint64x2_t)vshrn_n_u16((uint8x16_t)v, 4), 0); } //uint8x16_t
#else
static ALWAYS_INLINE uint32_t mm_movemask_epu32(uint32x4_t v) { const uint32x4_t mask = {1,2,4,8}, av = vandq_u32(v, mask), xv = vextq_u32(av, av, 2), ov = vorrq_u32(av, xv); return vgetq_lane_u32(vorrq_u32(ov, vextq_u32(ov, ov, 3)), 0); }
#endif
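
/* Example (illustrative only): mm_movemask_epu32 collapses an all-ones/all-zeros compare
   result into one bit per 32-bit lane, which makes "any lane equal?" tests cheap:

     __m128i  a = _mm_set1_epi32(5), b = _mm_setr_epi32(1, 5, 7, 5);
     uint32_t m = mm_movemask_epu32(_mm_cmpeq_epi32(a, b));   // m == 0xA: lanes 1 and 3 matched
*/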
// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_))

#define _mm_shuffle_epi8(_u_, _v_) (__m128i)vqtbl1q_u8((uint8x16_t)(_u_), (uint8x16_t)(_v_))
#if defined(__aarch64__)
#define mm_shuffle_nnnn_epi32(_v_,_m_) (__m128i)vdupq_laneq_u32(_v_, _m_)
#else
#define mm_shuffle_nnnn_epi32(_v_,_m_) (__m128i)vdupq_n_u32(vgetq_lane_u32(_v_, _m_))
#endif

#ifdef USE_MACROS
#define mm_shuffle_2031_epi32(_v_) ({ uint32x4_t _zv = (uint32x4_t)vrev64q_u32(_v_); uint32x2x2_t _zw = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zw.val[0], _zw.val[1]);})
#define mm_shuffle_3120_epi32(_v_) ({ uint32x4_t _zv = (_v_);                        uint32x2x2_t _zw = vtrn_u32(vget_low_u32(_zv), vget_high_u32(_zv)); vcombine_u32(_zw.val[0], _zw.val[1]);})
#else
static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) { uint32x4_t a = (uint32x4_t)vrev64q_u32(v); uint32x2x2_t z = vtrn_u32(vget_low_u32(a), vget_high_u32(a)); return vcombine_u32(z.val[0], z.val[1]);}
static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) { uint32x2x2_t z = vtrn_u32(vget_low_u32(v), vget_high_u32(v)); return vcombine_u32(z.val[0], z.val[1]);}
#endif

#if defined(USE_MACROS) || defined(__clang__)
#define _mm_shuffle_epi32(_v_, _m_) ({ const uint32x4_t _av =_v_;\
  uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));\
  _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);\
  _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);\
  _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3); _v;\
})
#define _mm_shuffle_epi32s(_v_, _m_) _mm_set_epi32(vgetq_lane_u32(_v_, ((_m_)     ) & 0x3),\
                                                   vgetq_lane_u32(_v_, ((_m_) >> 2) & 0x3),\
                                                   vgetq_lane_u32(_v_, ((_m_) >> 4) & 0x3),\
                                                   vgetq_lane_u32(_v_, ((_m_) >> 6) & 0x3))
#else
static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _v_, const unsigned _m_) { const uint32x4_t _av =_v_;
  uint32x4_t _v = vmovq_n_u32(vgetq_lane_u32(_av, (_m_) & 0x3));
  _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 2) & 0x3), _v, 1);
  _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 4) & 0x3), _v, 2);
  _v = vsetq_lane_u32(vgetq_lane_u32(_av, ((_m_) >> 6) & 0x3), _v, 3);
  return _v;
}
static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _v_, const unsigned _m_) {
  return _mm_set_epi32(vgetq_lane_u32(_v_, ((_m_)     ) & 0x3),
                       vgetq_lane_u32(_v_, ((_m_) >> 2) & 0x3),
                       vgetq_lane_u32(_v_, ((_m_) >> 4) & 0x3),
                       vgetq_lane_u32(_v_, ((_m_) >> 6) & 0x3));
}
#endif
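
/* Example (illustrative only): _MM_SHUFFLE packs four 2-bit lane selectors, highest lane first,
   so _mm_shuffle_epi32(v, _MM_SHUFFLE(0,1,2,3)) reverses the four 32-bit lanes:

     __m128i v = _mm_setr_epi32(10, 11, 12, 13);
     __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0,1,2,3));  // lanes 0..3 become 13,12,11,10
*/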
#ifdef USE_MACROS
#define _mm_unpacklo_epi8( _u_,_v_) ({ uint8x8x2_t  _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
#define _mm_unpacklo_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
#define _mm_unpacklo_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
#define _mm_unpacklo_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_)))

#define _mm_unpackhi_epi8( _u_,_v_) ({ uint8x8x2_t  _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);})
#define _mm_unpackhi_epi16(_u_,_v_) ({ uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);})
#define _mm_unpackhi_epi32(_u_,_v_) ({ uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); vcombine_u32(_zv.val[0], _zv.val[1]);})
#define _mm_unpackhi_epi64(_u_,_v_) (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_)))
#else
static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t  _zv = vzip_u8 ( vget_low_u8( (uint8x16_t)(_u_)), vget_low_u8 ((uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16( vget_low_u16((uint16x8_t)(_u_)), vget_low_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32( vget_low_u32( _u_ ), vget_low_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_low_u64((uint64x2_t)(_u_)), vget_low_u64((uint64x2_t)(_v_))); }

static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _u_, __m128i _v_) { uint8x8x2_t  _zv = vzip_u8 (vget_high_u8( (uint8x16_t)(_u_)), vget_high_u8( (uint8x16_t)(_v_))); return (uint32x4_t)vcombine_u8( _zv.val[0], _zv.val[1]); }
static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) { uint16x4x2_t _zv = vzip_u16(vget_high_u16((uint16x8_t)(_u_)), vget_high_u16((uint16x8_t)(_v_))); return (uint32x4_t)vcombine_u16(_zv.val[0], _zv.val[1]); }
static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) { uint32x2x2_t _zv = vzip_u32(vget_high_u32( _u_ ), vget_high_u32( _v_ )); return vcombine_u32(_zv.val[0], _zv.val[1]); }
static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) { return (uint32x4_t)vcombine_u64(vget_high_u64((uint64x2_t)(_u_)), vget_high_u64((uint64x2_t)(_v_))); }
#endif

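/* Example (illustrative only): the unpack helpers interleave lanes from the low or high halves
   of the two inputs, matching the SSE semantics:

     __m128i a  = _mm_setr_epi32(0, 1, 2, 3), b = _mm_setr_epi32(4, 5, 6, 7);
     __m128i lo = _mm_unpacklo_epi32(a, b);   // lanes: 0,4,1,5
     __m128i hi = _mm_unpackhi_epi32(a, b);   // lanes: 2,6,3,7
*/
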
#elif defined(__loongarch_lp64)

#include <lsxintrin.h>
// SSE intrinsics mapped to LoongArch LSX intrinsics

#ifdef USE_MACROS //---------------------------- Set : _mm_set_epi/_mm_set1_epi ----------------------------------------------------------
#define _mm_set_epi8(u15,u14,u13,u12,\
                     u11,u10, u9, u8,\
                      u7, u6, u5, u4,\
                      u3, u2, u1, u0) ({ uint8_t  __attribute__((aligned(16))) _u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; (v4u32)__lsx_vld( _u, 0);})
#define _mm_set_epi16( u7, u6, u5, u4,\
                       u3, u2, u1, u0) ({ uint16_t __attribute__((aligned(16))) _u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; (v4u32)__lsx_vld( _u, 0);})

#define _mm_set_epi32( u3,u2,u1,u0)    ({ uint32_t __attribute__((aligned(16))) _u[ 4] = { u0,u1,u2,u3 }; __lsx_vld(_u, 0);})
#define _mm_set_epi64x( u1,u0)         ({ uint64_t __attribute__((aligned(16))) _u[ 2] = { u0,u1 }; (v4u32)__lsx_vld(_u, 0);})

#else
static ALWAYS_INLINE __m128i _mm_set_epi8( uint8_t u15, uint8_t u14, uint8_t u13, uint8_t u12, uint8_t u11, uint8_t u10, uint8_t u9, uint8_t u8,
                                           uint8_t u7,  uint8_t u6,  uint8_t u5,  uint8_t u4,
                                           uint8_t u3,  uint8_t u2,  uint8_t u1,  uint8_t u0) {
  uint8_t __attribute__((aligned(16))) u[16] = { u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15 }; return (__m128i)__lsx_vld(u, 0); }
static ALWAYS_INLINE __m128i _mm_set_epi16( uint16_t u7, uint16_t u6, uint16_t u5, uint16_t u4,
                                            uint16_t u3, uint16_t u2, uint16_t u1, uint16_t u0) { uint16_t __attribute__((aligned(16))) u[ 8] = { u0,u1,u2,u3,u4,u5,u6,u7 }; return (__m128i)__lsx_vld(u, 0); }
static ALWAYS_INLINE __m128i _mm_set_epi32( uint32_t u3, uint32_t u2, uint32_t u1, uint32_t u0) { uint32_t __attribute__((aligned(16))) u[ 4] = { u0,u1,u2,u3 }; return __lsx_vld(u, 0); }
static ALWAYS_INLINE __m128i _mm_set_epi64x(uint64_t u1, uint64_t u0)                           { uint64_t __attribute__((aligned(16))) u[ 2] = { u0,u1 }; return (__m128i)__lsx_vld(u, 0); }
#endif

#define _mm_setr_epi16(u7,u6,u5,u4,u3,u2,u1,u0) _mm_set_epi16(u0,u1,u2,u3,u4,u5,u6,u7)
#define _mm_setr_epi32(u3,u2,u1,u0)             _mm_set_epi32(u0,u1,u2,u3)
#define _mm_setr_epi64x(u1,u0)                  _mm_set_epi64x(u0,u1)

#define _mm_set1_epi8( _u8_ )  (__m128i)__lsx_vreplgr2vr_b(_u8_)
#define _mm_set1_epi16( _u16_) (__m128i)__lsx_vreplgr2vr_h(_u16_)
#define _mm_set1_epi32( _u32_) (__m128i)__lsx_vreplgr2vr_w(_u32_)
#define _mm_set1_epi64x(_u64_) (__m128i)__lsx_vreplgr2vr_d(_u64_)
#define _mm_setzero_si128()    __lsx_vreplgr2vr_w( 0 )

#define _mm_cvtss_f32(_v_)     ((__m128)(_v_))[0]                          // extract lane 0 as a float value
#define _mm_setzero_ps()       (__m128)__lsx_vldi(0)
#define _mm_set1_ps(_f32_)     (__m128){(_f32_),(_f32_),(_f32_),(_f32_)}   // replicate the float value (no float->int conversion)
//---------------------------------------------- Arithmetic -----------------------------------------------------------------------
#define _mm_add_epi8(   _u_,_v_) (__m128i)__lsx_vadd_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_add_epi16(  _u_,_v_) (__m128i)__lsx_vadd_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_add_epi32(  _u_,_v_) (__m128i)__lsx_vadd_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_add_epi64(  _u_,_v_) (__m128i)__lsx_vadd_d((__m128i)(_u_), (__m128i)(_v_))

#define _mm_sub_epi8(   _u_,_v_) (__m128i)__lsx_vsub_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sub_epi16(  _u_,_v_) (__m128i)__lsx_vsub_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sub_epi32(  _u_,_v_) (__m128i)__lsx_vsub_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sub_epi64(  _u_,_v_) (__m128i)__lsx_vsub_d((__m128i)(_u_), (__m128i)(_v_))
#define _mm_subs_epu8(  _u_,_v_) (__m128i)__lsx_vssub_bu((__m128i)(_u_), (__m128i)(_v_))

#define _mm_mullo_epi16(_u_,_v_) (__m128i)__lsx_vmul_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_mullo_epi32(_u_,_v_) (__m128i)__lsx_vmul_w((__m128i)(_u_), (__m128i)(_v_))
#define mm_mullo_epu32( _u_,_v_) (__m128i)__lsx_vmul_w((__m128i)(_u_), (__m128i)(_v_))

#define _mm_mulhi_epi16s(_u_,_v_) (__m128i)__lsx_vmuh_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_mulhi_epi16( _u_,_v_) (__m128i)__lsx_vmuh_h((__m128i)(_u_), (__m128i)(_v_))

#define _mm_mul_epu32(  _u_,_v_) (__m128i)__lsx_vmulwev_d_wu((__m128i)(_u_), (__m128i)(_v_))
#define _mm_adds_epu16( _u_,_v_) (__m128i)__lsx_vsadd_hu((__m128i)(_u_), (__m128i)(_v_))

static ALWAYS_INLINE __m128i _mm_madd_epi16(__m128i u, __m128i v) {
  // 1. widening 16-bit signed multiplies producing 32-bit intermediate products
  __m128i mul_even = __lsx_vmulwev_w_h(u, v);    // even lanes (0*0, 2*2, 4*4, 6*6)
  __m128i mul_odd  = __lsx_vmulwod_w_h(u, v);    // odd  lanes (1*1, 3*3, 5*5, 7*7)

  // 2. add the adjacent 32-bit products
  __m128i sum = __lsx_vadd_w(mul_even, mul_odd); // [0*0+1*1, 2*2+3*3, 4*4+5*5, 6*6+7*7]
  return sum;
}
//---------------------------------------------- Special math functions -----------------------------------------------------------
#define _mm_min_epu8( _u_,_v_) (__m128i)__lsx_vmin_bu((__m128i)(_u_), (__m128i)(_v_))
#define _mm_min_epu16(_u_,_v_) (__m128i)__lsx_vmin_hu((__m128i)(_u_), (__m128i)(_v_))
#define _mm_min_epi16(_u_,_v_) (__m128i)__lsx_vmin_h( (__m128i)(_u_), (__m128i)(_v_))
//---------------------------------------------- Logical --------------------------------------------------------------------------
#define mm_testnz_epu32(_u_)   __lsx_bnz_v((__m128i)(_u_))      // nonzero if any lane is set
#define mm_testnz_epu8( _u_)   __lsx_bnz_v((__m128i)(_u_))
#define _mm_or_si128( _u_,_v_) (__m128i)__lsx_vor_v( (__m128i)(_u_), (__m128i)(_v_))
#define _mm_and_si128(_u_,_v_) (__m128i)__lsx_vand_v((__m128i)(_u_), (__m128i)(_v_))
#define _mm_xor_si128(_u_,_v_) (__m128i)__lsx_vxor_v((__m128i)(_u_), (__m128i)(_v_))
//---------------------------------------------- Shift ----------------------------------------------------------------------------
#define mm_slli_epi8(_u_, _c_)   (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)> 7?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vslli_b((__m128i)(_u_), (_c_)))
#define mm_slli_epi16(_u_, _c_)  (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_h(0):(__m128i)__lsx_vslli_h((__m128i)(_u_), (_c_)))
#define mm_slli_epi32(_u_, _c_)  (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>31?(__m128i)__lsx_vreplgr2vr_w(0):(__m128i)__lsx_vslli_w((__m128i)(_u_), (_c_)))
#define mm_slli_epi64(_u_, _c_)  (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>63?(__m128i)__lsx_vreplgr2vr_d(0):(__m128i)__lsx_vslli_d((__m128i)(_u_), (_c_)))
#define _mm_slli_si128(_v_, _c_) (__m128i)((_c_)<1?(__m128i)(_v_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vbsll_v((__m128i)(_v_), (_c_)))

#define mm_srli_epi8(_u_, _c_)   (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)> 7?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vsrli_b((__m128i)(_u_), (_c_)))
#define mm_srli_epi16(_u_, _c_)  (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_h(0):(__m128i)__lsx_vsrli_h((__m128i)(_u_), (_c_)))
#define mm_srli_epi32(_u_, _c_)  (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>31?(__m128i)__lsx_vreplgr2vr_w(0):(__m128i)__lsx_vsrli_w((__m128i)(_u_), (_c_)))
#define mm_srli_epi64(_u_, _c_)  (__m128i)((_c_)<1?(__m128i)(_u_):(_c_)>63?(__m128i)__lsx_vreplgr2vr_d(0):(__m128i)__lsx_vsrli_d((__m128i)(_u_), (_c_)))
#define _mm_srli_si128(_v_, _c_) (__m128i)((_c_)<1?(__m128i)(_v_):(_c_)>15?(__m128i)__lsx_vreplgr2vr_b(0):(__m128i)__lsx_vbsrl_v((__m128i)(_v_), (_c_)))

#define mm_srai_epi8(_v_, _c_)   (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_b((__m128i)(_v_), (_c_)))
#define mm_srai_epi16(_v_, _c_)  (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_h((__m128i)(_v_), (_c_)))
#define mm_srai_epi32(_v_, _c_)  (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_w((__m128i)(_v_), (_c_)))
#define mm_srai_epi64(_v_, _c_)  (__m128i)((_c_)<1?(_v_):(__m128i)__lsx_vsrai_d((__m128i)(_v_), (_c_)))

#define _mm_slli_epi8(_u_, _m_)  (__m128i)__lsx_vsll_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
#define _mm_slli_epi16(_u_, _m_) (__m128i)__lsx_vsll_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
#define _mm_slli_epi32(_u_, _m_) (__m128i)__lsx_vsll_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
#define _mm_slli_epi64(_u_, _m_) (__m128i)__lsx_vsll_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))

#define _mm_srli_epi8( _u_, _m_) (__m128i)__lsx_vsrl_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
#define _mm_srli_epi16(_u_, _m_) (__m128i)__lsx_vsrl_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
#define _mm_srli_epi32(_u_, _m_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
#define _mm_srli_epi64(_u_, _m_) (__m128i)__lsx_vsrl_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))

#define _mm_srai_epi8( _u_, _m_) (__m128i)__lsx_vsra_b((__m128i)(_u_), __lsx_vreplgr2vr_b(_m_))
#define _mm_srai_epi16(_u_, _m_) (__m128i)__lsx_vsra_h((__m128i)(_u_), __lsx_vreplgr2vr_h(_m_))
#define _mm_srai_epi32(_u_, _m_) (__m128i)__lsx_vsra_w((__m128i)(_u_), __lsx_vreplgr2vr_w(_m_))
#define _mm_srai_epi64(_u_, _m_) (__m128i)__lsx_vsra_d((__m128i)(_u_), __lsx_vreplgr2vr_d(_m_))

#define _mm_sll_epi8( _u_, _v_)  (__m128i)__lsx_vsll_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sll_epi16(_u_, _v_)  (__m128i)__lsx_vsll_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sll_epi32(_u_, _v_)  (__m128i)__lsx_vsll_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_sll_epi64(_u_, _v_)  (__m128i)__lsx_vsll_d((__m128i)(_u_), (__m128i)(_v_))

#define _mm_srl_epi8( _u_, _v_)  (__m128i)__lsx_vsrl_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srl_epi16(_u_, _v_)  (__m128i)__lsx_vsrl_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srl_epi32(_u_, _v_)  (__m128i)__lsx_vsrl_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srl_epi64(_u_, _v_)  (__m128i)__lsx_vsrl_d((__m128i)(_u_), (__m128i)(_v_))

#define _mm_sllv_epi32(_u_, _v_) (__m128i)__lsx_vsll_w((__m128i)(_u_), (__m128i)(_v_))
#define _mm_srlv_epi32(_u_, _v_) (__m128i)__lsx_vsrl_w((__m128i)(_u_), (__m128i)(_v_))
//---------------------------------------------- Compare ---------- result lanes: all bits set (true) / 0 (false) -------------------
#define _mm_cmpeq_epi8( _u_, _v_) (__m128i)__lsx_vseq_b((__m128i)(_u_), (__m128i)(_v_))
#define _mm_cmpeq_epi16(_u_, _v_) (__m128i)__lsx_vseq_h((__m128i)(_u_), (__m128i)(_v_))
#define _mm_cmpeq_epi32(_u_, _v_) (__m128i)__lsx_vseq_w((__m128i)(_u_), (__m128i)(_v_))

#define _mm_cmpgt_epi8( _u_, _v_) (__m128i)__lsx_vslt_b((__m128i)(_v_), (__m128i)(_u_))   // note the swapped argument order: u > v  <=>  v < u
#define _mm_cmpgt_epi16(_u_, _v_) (__m128i)__lsx_vslt_h((__m128i)(_v_), (__m128i)(_u_))
#define _mm_cmpgt_epi32(_u_, _v_) (__m128i)__lsx_vslt_w((__m128i)(_v_), (__m128i)(_u_))

#define _mm_cmpgt_epu16(_u_, _v_) (__m128i)__lsx_vslt_hu((__m128i)(_v_), (__m128i)(_u_))
#define mm_cmpgt_epu32( _u_, _v_) (__m128i)__lsx_vslt_wu((__m128i)(_v_), (__m128i)(_u_))
//---------------------------------------------- Load -----------------------------------------------------------------------------
#define _mm_loadl_epi64(_u64p_)      (__m128i)__lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0), *(const uint64_t*)(_u64p_), 0)  // load 64 bits into the low half, zero the high half
#define mm_loadu_epi64p(_u64p_, _u_) (__m128i)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_u64p_), 0)
#define _mm_loadu_si128(_ip_)        (__m128i)__lsx_vld((const int32_t*)(_ip_), 0)
#define _mm_load_si128(_ip_)         (__m128i)__lsx_vld((const int32_t*)(_ip_), 0)

#define _mm_load_ps(_ip_)            (__m128)__lsx_vld((const float*)(_ip_), 0)
#define _mm_loadu_ps(_ip_)           (__m128)__lsx_vld((const float*)(_ip_), 0)
#define _mm_load1_ps(_ip_)           (__m128)__lsx_vreplgr2vr_w(*(const int32_t*)(_ip_))                               // replicate the 32-bit pattern, not a float->int conversion
#define _mm_loadl_pi(_u_, _ip_)      (__m128)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_ip_), 0)
#define _mm_loadh_pi(_u_, _ip_)      (__m128)__lsx_vinsgr2vr_d((__m128i)(_u_), *(const uint64_t*)(_ip_), 1)

//---------------------------------------------- Store ----------------------------------------------------------------------------
#define _mm_storel_epi64(_ip_, _u_)  __lsx_vstelm_d((__m128i)(_u_), (uint64_t*)(_ip_), 0, 0)
#define _mm_storeu_si128(_ip_, _u_)  __lsx_vstx((__m128i)(_u_), (int32_t*)(_ip_), 0)

#define _mm_store_ps(_ip_, _u_)      __lsx_vst( (__m128i)(_u_), (float*)(_ip_), 0)
#define _mm_storeu_ps(_ip_, _u_)     __lsx_vstx((__m128i)(_u_), (float*)(_ip_), 0)
#define _mm_store_ss(_ip_, _u_)      __lsx_vstelm_w((__m128i)(_u_), (float*)(_ip_), 0, 0)
//---------------------------------------------- Convert --------------------------------------------------------------------------
#define mm_cvtsi64_si128p(_u64p_,_v_) mm_loadu_epi64p(_u64p_,_v_)
#define _mm_cvtsi64_si128(_u_)        (__m128i)__lsx_vreplgr2vr_d(_u_)
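
/* Example (illustrative only): _mm_loadl_epi64 / _mm_storel_epi64 move the low 64 bits
   between memory and a vector register without touching the upper half:

     uint64_t x = 0x1122334455667788ull, y;
     __m128i  v = _mm_loadl_epi64(&x);        // low 64 bits = x, high 64 bits = 0
     _mm_storel_epi64(&y, v);                 // y == x
*/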
//---------------------------------------------- Reverse bits/bytes ---------------------------------------------------------------
static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i _v_) {        // reverse bits within each byte
  uint64_t low_src = __lsx_vpickve2gr_du(_v_, 0);
  uint64_t low = 0;
  asm volatile(
    "bitrev.8b %[out], %[in]\n\t"
    :[out]"+r"(low)
    :[in]"r"(low_src)
  );
  uint64_t high_src = __lsx_vpickve2gr_du(_v_, 1);
  uint64_t high = 0;
  asm volatile(
    "bitrev.8b %[out], %[in]\n\t"
    :[out]"+r"(high)
    :[in]"r"(high_src)
  );
  return __lsx_vinsgr2vr_d(__lsx_vinsgr2vr_d(__lsx_vreplgr2vr_w(0), low, 0), high, 1);
}

#define mm_rev_epi16(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i)(v16u8){ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14})  // reverse bytes within each element
#define mm_rev_epi32(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i)(v16u8){ 3, 2, 1, 0, 7, 6, 5, 4,11,10, 9, 8,15,14,13,12})
#define mm_rev_epi64(_v_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_v_), (__m128i)(v16u8){ 7, 6, 5, 4, 3, 2, 1, 0,15,14,13,12,11,10, 9, 8})
//--------------------------------------------- Insert/extract --------------------------------------------------------------------
#define mm_extract_epi32x(_u_, _u32_, _id_)  __lsx_vstelm_w((__m128i)(_u_), (uint32_t*)&(_u32_), 0, (_id_))
#define _mm_extract_epi64x(_u_, _u64_, _id_) __lsx_vstelm_d((__m128i)(_u_), (uint64_t*)&(_u64_), 0, (_id_))

#define _mm_extract_epi8(_u_, _id_)  __lsx_vpickve2gr_b((__m128i)(_u_), (_id_))
#define _mm_extract_epi16(_u_, _id_) __lsx_vpickve2gr_h((__m128i)(_u_), (_id_))
#define _mm_extract_epi32(_u_, _id_) __lsx_vpickve2gr_w((__m128i)(_u_), (_id_))
#define mm_extract_epu32(_u_, _id_)  __lsx_vpickve2gr_wu((__m128i)(_u_), (_id_))
#define _mm_cvtsi128_si32(_u_)       __lsx_vpickve2gr_w((__m128i)(_u_), 0)
#define _mm_cvtsi128_si64(_u_)       __lsx_vpickve2gr_d((__m128i)(_u_), 0)

#define _mm_insert_epu32p(_u_, _u32p_, _id_) (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const uint32_t*)(_u32p_), (_id_))
#define mm_insert_epi32p(_u_, _u32p_, _id_)  (__m128i)__lsx_vinsgr2vr_w((__m128i)(_u_), *(const int32_t*)(_u32p_), (_id_))
#define _mm_cvtsi32_si128(_x_)               (__m128i)__lsx_vinsgr2vr_w(__lsx_vreplgr2vr_w(0), (_x_), 0)

#define _mm_blendv_epi8(_u_, _v_, _m_) (__m128i)__lsx_vbitsel_v((__m128i)(_u_), (__m128i)(_v_), (__m128i)(_m_))
//---------------------------------------------- Miscellaneous --------------------------------------------------------------------
#define _mm_alignr_epi8(_u_, _v_, _m_) (__m128i)__lsx_vshuf_b((__m128i)(_v_), (__m128i)(_u_), (__m128i)(v16u8){(_m_),(_m_)+1,(_m_)+2,(_m_)+3,(_m_)+4,(_m_)+5,(_m_)+6,(_m_)+7,(_m_)+8,(_m_)+9,(_m_)+10,(_m_)+11,(_m_)+12,(_m_)+13,(_m_)+14,(_m_)+15})
#define _mm_packs_epi16(_u_, _v_) (__m128i)__lsx_vilvl_d(__lsx_vssrlrni_b_h((__m128i)(_u_), (__m128i)(_v_), 0), __lsx_vssrlrni_b_h((__m128i)(_v_), (__m128i)(_u_), 0))
#define _mm_packs_epi32(_u_, _v_) (__m128i)__lsx_vilvl_d(__lsx_vssrlrni_h_w((__m128i)(_u_), (__m128i)(_v_), 0), __lsx_vssrlrni_h_w((__m128i)(_v_), (__m128i)(_u_), 0))

#define _mm_packs_epu16(_u_, _v_)  (__m128i)__lsx_vilvl_d((__m128i)(_v_), (__m128i)(_u_))
#define _mm_packus_epi16(_u_, _v_) (__m128i)__lsx_vilvl_d(__lsx_vssrlni_bu_h((__m128i)(_u_), (__m128i)(_v_), 0), __lsx_vssrlni_bu_h((__m128i)(_v_), (__m128i)(_u_), 0))

static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
  // step 1: extract the sign bit of every byte (arithmetic shift right by 7 keeps only the sign)
  __m128i signs = __lsx_vsrai_b(v, 7);

  // step 2: per-byte bit masks (LSB first: 0x01, 0x02, 0x04, ...)
  static const uint8_t mask_data[16] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,   // low  8 bytes
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80    // high 8 bytes
  };
  __m128i mask = __lsx_vld((const void*)mask_data, 0); // load the mask from memory

  // step 3: keep one mask bit per negative byte
  __m128i masked = __lsx_vand_v(signs, mask);

  // step 4: horizontal add (8-bit -> 16-bit -> 32-bit -> 64-bit)
  __m128i sum16 = __lsx_vhaddw_hu_bu(masked, masked);
  __m128i sum32 = __lsx_vhaddw_wu_hu(sum16, sum16);
  __m128i sum64 = __lsx_vhaddw_du_wu(sum32, sum32);

  // step 5: combine the low byte of each 64-bit half into the 16-bit result
  return (uint16_t)__lsx_vpickve2gr_bu(sum64, 0) | (((uint16_t)__lsx_vpickve2gr_bu(sum64, 8)) << 8);
}

//-------- LSX movemask ------ all lanes must be 0 or -1 (= 0xff, 0xffff or 0xffffffff)

static ALWAYS_INLINE uint8_t mm_movemask_epi8s(__m128i sv) {                            // low 8 byte lanes only (short version)
  static const uint64_t mask_data[2] = {0x8040201008040201ULL, 0};                      // byte lane i -> bit i, high half ignored
  const __m128i mask = __lsx_vld((const void*)mask_data, 0);
  __m128i tmp = __lsx_vand_v(sv, mask);
  tmp = __lsx_vhaddw_hu_bu(tmp, tmp);                                                   // pairwise 8-bit  -> 16-bit
  tmp = __lsx_vhaddw_wu_hu(tmp, tmp);                                                   // pairwise 16-bit -> 32-bit
  return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_du_wu(tmp, tmp), 0);
}

static ALWAYS_INLINE uint16_t mm_movemask_epu16(__m128i v) {
  static const uint64_t mask_data[2] = {0x0008000400020001ULL, 0x0080004000200010ULL};  // 16-bit lane i -> bit i
  const __m128i mask = __lsx_vld((const void*)mask_data, 0);
  __m128i tmp = __lsx_vand_v(v, mask);
  tmp = __lsx_vhaddw_wu_hu(tmp, tmp);                                                   // pairwise 16-bit -> 32-bit
  tmp = __lsx_vhaddw_du_wu(tmp, tmp);                                                   // pairwise 32-bit -> 64-bit
  return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
}

static ALWAYS_INLINE uint32_t mm_movemask_epu32(__m128i v) {
  // 1. bit mask constants (0x00000001, 0x00000002, 0x00000004, 0x00000008)
  static const uint32_t mask_data[4] = {
    0x00000001, 0x00000002, 0x00000004, 0x00000008
  };
  __m128i mask = __lsx_vld((const void*)mask_data, 0);  // load the 4 x 32-bit mask

  // 2. keep one mask bit per all-ones lane
  __m128i masked = __lsx_vand_v(v, mask);

  // 3. horizontal add
  __m128i sum1 = __lsx_vhaddw_du_wu(masked, masked);    // 4x32 -> 2x64
  __m128i sum2 = __lsx_vhaddw_qu_du(sum1, sum1);        // 2x64 -> 1x128

  // 4. extract the result
  return (uint32_t)__lsx_vpickve2gr_b(sum2, 0);
}

static ALWAYS_INLINE uint64_t mm_movemask_epu64(__m128i v) {
  // 1. bit mask constants (0x0000000000000001, 0x0000000000000002)
  const __m128i mask = {1ULL, 2ULL};

  // 2. apply the mask and combine the two halves
  __m128i masked = __lsx_vand_v(v, mask);
  return __lsx_vpickve2gr_d(masked, 0) | __lsx_vpickve2gr_d(masked, 1);
}

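/* Example (illustrative only): combining a compare with a movemask helper gives one bit per lane:

     __m128i  a = _mm_set1_epi16(9), b = _mm_setr_epi16(9, 0, 9, 0, 0, 0, 0, 9);
     uint16_t m = mm_movemask_epu16(_mm_cmpeq_epi16(a, b));   // m == 0x85: lanes 0, 2 and 7 matched
*/
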
// --------------------------------------------- Swizzle : _mm_shuffle_epi8 / _mm_shuffle_epi32 / Pack/Unpack -----------------------------------------
#define _MM_SHUFFLE(_u3_,_u2_,_u1_,_u0_) ((_u3_) << 6 | (_u2_) << 4 | (_u1_) << 2 | (_u0_))

#define _mm_shuffle_epi8(_u_, _v_) (__m128i)__lsx_vshuf_b((__m128i)(_u_), (__m128i)(_u_), (__m128i)(_v_))

#define mm_shuffle_nnnn_epi32(_v_, _m_) (__m128i)__lsx_vreplvei_w((__m128i)(_v_), (_m_))

#ifdef USE_MACROS
#define mm_shuffle_2031_epi32(_u_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), 0x8D)
#define mm_shuffle_3120_epi32(_u_) (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), 0xD8)
#else
static ALWAYS_INLINE __m128i mm_shuffle_2031_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0x8D);}
static ALWAYS_INLINE __m128i mm_shuffle_3120_epi32(__m128i v) {return __lsx_vshuf4i_w(v, 0xD8);}
#endif

#if defined(USE_MACROS) || defined(__clang__)
#define _mm_shuffle_epi32(_u_, _m_)  (__m128i)__lsx_vshuf4i_w((__m128i)(_u_), (_m_))
#define _mm_shuffle_epi32s(_u_, _m_) (__m128i)__lsx_vshuf_w((__m128i)(_u_), (__m128i)(_u_), (__m128i)(v4u32){(_m_)&3, ((_m_)>>2)&3, ((_m_)>>4)&3, ((_m_)>>6)&3})
#else
static ALWAYS_INLINE __m128i _mm_shuffle_epi32(__m128i _u_, const unsigned _m_) {return (__m128i)__lsx_vshuf4i_w((__m128i)_u_, _m_);}

static ALWAYS_INLINE __m128i _mm_shuffle_epi32s(__m128i _u_, const unsigned _m_) {
  const uint32_t idx0 = (_m_)        & 0x3;
  const uint32_t idx1 = ((_m_) >> 2) & 0x3;
  const uint32_t idx2 = ((_m_) >> 4) & 0x3;
  const uint32_t idx3 = ((_m_) >> 6) & 0x3;
  return (__m128i)__lsx_vshuf_w((__m128i)_u_, (__m128i)_u_, (__m128i)(v4u32){idx0, idx1, idx2, idx3});
}
#endif
#ifdef USE_MACROS
#define _mm_unpacklo_epi8( _u_,_v_) (__m128i)__lsx_vilvl_b((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpacklo_epi16(_u_,_v_) (__m128i)__lsx_vilvl_h((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpacklo_epi32(_u_,_v_) (__m128i)__lsx_vilvl_w((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpacklo_epi64(_u_,_v_) (__m128i)__lsx_vilvl_d((__m128i)(_v_), (__m128i)(_u_))

#define _mm_unpackhi_epi8( _u_,_v_) (__m128i)__lsx_vilvh_b((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpackhi_epi16(_u_,_v_) (__m128i)__lsx_vilvh_h((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpackhi_epi32(_u_,_v_) (__m128i)__lsx_vilvh_w((__m128i)(_v_), (__m128i)(_u_))
#define _mm_unpackhi_epi64(_u_,_v_) (__m128i)__lsx_vilvh_d((__m128i)(_v_), (__m128i)(_u_))
#else
static ALWAYS_INLINE __m128i _mm_unpacklo_epi8( __m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_b((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi16(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_h((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi32(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_w((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpacklo_epi64(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvl_d((__m128i)_v_, (__m128i)_u_);}

static ALWAYS_INLINE __m128i _mm_unpackhi_epi8( __m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_b((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpackhi_epi16(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_h((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpackhi_epi32(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_w((__m128i)_v_, (__m128i)_u_);}
static ALWAYS_INLINE __m128i _mm_unpackhi_epi64(__m128i _u_, __m128i _v_) {return (__m128i)__lsx_vilvh_d((__m128i)_v_, (__m128i)_u_);}
#endif

#else //----------------- Intel SSE2/SSSE3 (wrapper macros compatible with the arm/loongarch definitions above; permits a single source-code version for intel+arm) --------------
#define mm_movemask_epu32(_v_) _mm_movemask_ps(_mm_castsi128_ps(_v_))
#define mm_movemask_epu16(_v_) _mm_movemask_epi8(_v_)
#define mm_loadu_epi64p( _u64p_,_v_) _v_ = _mm_cvtsi64_si128(ctou64(_u64p_))

#define mm_extract_epu32( _v_, _id_)       _mm_extract_epi32(_v_, _id_)
#define mm_extract_epi32x(_v_,_u32_, _id_) _u32_ = _mm_extract_epi32(_v_, _id_)
#define mm_extract_epi64x(_v_,_u64_, _id_) _u64_ = _mm_extract_epi64(_v_, _id_)
#define mm_insert_epi32p( _v_,_u32p_,_c_)  _mm_insert_epi32( _v_,ctou32(_u32p_),_c_)

#define mm_mullo_epu32(   _u_,_v_)    _mm_mullo_epi32(_u_,_v_)
#define mm_cvtsi64_si128p(_u64p_,_v_) _v_ = _mm_cvtsi64_si128(ctou64(_u64p_))

#define cv80000000_def            _mm_set1_epi32((int)0x80000000)
#define mm_cmplt_epu32( _u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000)) // __m128i cv80000000 = _mm_set1_epi32(0x80000000); must be declared by the caller
#define mm_cmpgt_epu32( _u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, cv80000000), _mm_xor_si128(_v_, cv80000000))
#define _mm_cmplt_epu32(_u_, _v_) _mm_cmplt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000)))
#define _mm_cmpgt_epu32(_u_, _v_) _mm_cmpgt_epi32(_mm_xor_si128(_u_, _mm_set1_epi32(0x80000000)), _mm_xor_si128(_v_, _mm_set1_epi32(0x80000000)))

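/* Illustrative note (not from the original source): SSE2 only has signed 32-bit compares, so the
   unsigned variants above flip the sign bit of both operands first; x ^ 0x80000000 maps the
   unsigned order onto the signed order, e.g.

     __m128i cv80000000 = cv80000000_def;                     // required by mm_cmpgt_epu32
     __m128i a = _mm_set1_epi32((int)0xFFFFFFF0);             // large unsigned value
     __m128i b = _mm_set1_epi32(1);
     __m128i r = mm_cmpgt_epu32(a, b);                        // all lanes true (0xFFFFFFFF)
*/
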
#define mm_shuffle_nnnn_epi32(_v_, _n_) _mm_shuffle_epi32(_v_, _MM_SHUFFLE(_n_,_n_,_n_,_n_))
#define mm_shuffle_2031_epi32(_v_)      _mm_shuffle_epi32(_v_, _MM_SHUFFLE(2,0,3,1))
#define mm_shuffle_3120_epi32(_v_)      _mm_shuffle_epi32(_v_, _MM_SHUFFLE(3,1,2,0))

#define _mm_slli_epi8(_v_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff << (_m_)), _mm_slli_epi32(_v_, _m_ ))
#define _mm_srli_epi8(_v_, _m_ ) _mm_and_si128(_mm_set1_epi8(0xff >> (_m_)), _mm_srli_epi32(_v_, _m_ ))

#define mm_slli_epi8( _v_,_c_) _mm_slli_epi8( _v_,_c_) // parameter c MUST be a constant for compatibility with the arm/loongarch macros above
#define mm_slli_epi16(_v_,_c_) _mm_slli_epi16(_v_,_c_)
#define mm_slli_epi32(_v_,_c_) _mm_slli_epi32(_v_,_c_)
#define mm_slli_epi64(_v_,_c_) _mm_slli_epi64(_v_,_c_)

#define mm_srli_epi8( _v_,_c_) _mm_srli_epi8( _v_,_c_)
#define mm_srli_epi16(_v_,_c_) _mm_srli_epi16(_v_,_c_)
#define mm_srli_epi32(_v_,_c_) _mm_srli_epi32(_v_,_c_)
#define mm_srli_epi64(_v_,_c_) _mm_srli_epi64(_v_,_c_)

#define mm_srai_epi8( _v_,_c_) _mm_srai_epi8( _v_,_c_)
#define mm_srai_epi16(_v_,_c_) _mm_srai_epi16(_v_,_c_)
#define mm_srai_epi32(_v_,_c_) _mm_srai_epi32(_v_,_c_)
#define mm_srai_epi64(_v_,_c_) _mm_srai_epi64(_v_,_c_)

#ifdef __SSSE3__
static ALWAYS_INLINE __m128i mm_rbit_epi8(__m128i v) {                 // reverse bits in bytes
  __m128i fv = _mm_set_epi8(15, 7,11, 3,13, 5, 9, 1,14, 6,10, 2,12, 4, 8, 0), cv0f_8 = _mm_set1_epi8(0xf);
  __m128i lv = _mm_shuffle_epi8(fv,_mm_and_si128( v,                   cv0f_8));
  __m128i hv = _mm_shuffle_epi8(fv,_mm_and_si128( mm_srli_epi64(v, 4), cv0f_8));
  return _mm_or_si128( mm_slli_epi64(lv,4), hv);
}

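/* Illustrative note (not from the original source): mm_rbit_epi8 reverses each byte by looking up
   its two 4-bit halves in a reversed-nibble table (fv) and swapping them, e.g. for one byte:

     0xC4 = 1100 0100  ->  low nibble 0x4 reversed = 0x2, high nibble 0xC reversed = 0x3
                       ->  (0x2 << 4) | 0x3 = 0x23 = 0010 0011
*/
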
static ALWAYS_INLINE __m128i mm_rev_epi16(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(14,15,12,13,10,11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } // reverse vector bytes in uint??_t
static ALWAYS_INLINE __m128i mm_rev_epi32(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8(12,13,14,15, 8, 9,10,11, 4, 5, 6, 7, 0, 1, 2, 3)); }
static ALWAYS_INLINE __m128i mm_rev_epi64(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 8, 9,10,11,12,13,14,15, 0, 1, 2, 3, 4, 5, 6, 7)); }
static ALWAYS_INLINE __m128i mm_rev_si128(__m128i v) { return _mm_shuffle_epi8(v, _mm_set_epi8( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15)); }
#endif

#endif

#endif