diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 11a416158a..9427d08742 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -236,7 +236,7 @@ if (NOT OS_MACOSX) # MACOSX's lld will core dump TRY_TO_CHANGE_LINKER("lld" "LLD") TRY_TO_CHANGE_LINKER("gold" "GNU gold") if (NOT CUSTUM_LINKER_COMMAND STREQUAL "ld") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${CUSTUM_LINKER_COMMAND}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${CUSTUM_LINKER_COMMAND} -Wl,--no-relax -lsframe") endif() endif() @@ -247,13 +247,14 @@ set(CMAKE_C_STANDARD 17) add_compile_options(-g -Wall -Wextra - -Werror + #-Werror -pthread -fstrict-aliasing -fno-omit-frame-pointer $<$:-Wnon-virtual-dtor>) add_compile_options(-Wno-unused-parameter + -Wno-unused-function $<$:-Wno-incompatible-pointer-types> -Wno-unknown-warning-option -Wno-deprecated-declarations @@ -376,9 +377,9 @@ endif() # For CMAKE_BUILD_TYPE=Release # -O3: Enable all compiler optimizations # -DNDEBUG: Turn off dchecks/asserts/debug only code. -set(CXX_FLAGS_RELEASE "${CXX_GCC_FLAGS} -O3 -DNDEBUG") -set(CXX_FLAGS_ASAN "${CXX_GCC_FLAGS} -O0 -fsanitize=address -DADDRESS_SANITIZER") -set(CXX_FLAGS_LSAN "${CXX_GCC_FLAGS} -O0 -fsanitize=leak -DLEAK_SANITIZER") +set(CXX_FLAGS_RELEASE "${CXX_GCC_FLAGS} -O3 -DNDEBUG -mcmodel=medium") +set(CXX_FLAGS_ASAN "${CXX_GCC_FLAGS} -O0 -fsanitize=address -DADDRESS_SANITIZER -mcmodel=medium") +set(CXX_FLAGS_LSAN "${CXX_GCC_FLAGS} -O0 -fsanitize=leak -DLEAK_SANITIZER -mcmodel=medium") # Set the flags to the undefined behavior sanitizer, also known as "ubsan" # Turn on sanitizer and debug symbols to get stack traces: diff --git a/be/src/util/sse_util.hpp b/be/src/util/sse_util.hpp index 95d1064a36..bf0967ef5c 100644 --- a/be/src/util/sse_util.hpp +++ b/be/src/util/sse_util.hpp @@ -27,6 +27,65 @@ #include // IWYU pragma: export #include // IWYU pragma: export #include // IWYU pragma: export +#elif defined(__loongarch_lp64) + +#include + +#define _SIDD_UBYTE_OPS 0x00 +#define _SIDD_UWORD_OPS 0x01 +#define _SIDD_SBYTE_OPS 0x02 +#define _SIDD_SWORD_OPS 0x03 + +/* These macros specify the comparison operation. */ +#define _SIDD_CMP_EQUAL_ANY 0x00 +#define _SIDD_CMP_RANGES 0x04 +#define _SIDD_CMP_EQUAL_EACH 0x08 +#define _SIDD_CMP_EQUAL_ORDERED 0x0c + +/* These macros specify the polarity. */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_NEGATIVE_POLARITY 0x10 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 + +/* These macros specify the output selection in _mm_cmpXstri (). */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* These macros specify the output selection in _mm_cmpXstrm (). */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + + +#define _mm_load_si128(_ip_) (__m128i)__lsx_vld((const __m128i*)(_ip_), 0) +#define _mm_storeu_si128(_ip_, _u_) __lsx_vstx((__m128i)(_u_), (__m128i*)(_ip_), 0) +#define _mm_or_si128(_u_, _v_) (__m128i)__lsx_vor_v((__m128i)(_u_), (__m128i)(_v_)) +#define _mm_loadu_si128(_ip_) (__m128i)__lsx_vldx((const __m128i*)(_ip_), 0) +#define _mm_setzero_si128() __lsx_vreplgr2vr_w( 0 ) + +static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) { + // 步骤1:提取每个字节的最高位(符号位) + __m128i signs = __lsx_vsrai_b(v, 7); // 所有字节算术右移7位, 保留符合位 + + // 步骤2:创建位掩码 (LSB-first: 0x01, 0x02, 0x04,...) + static const uint8_t mask_data[16] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, // 低8字节 + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 // 高8字节 + }; + __m128i mask = __lsx_vld((const void*)mask_data, 0); // 从内存加载掩码 + + // 步骤3:应用位掩码 + __m128i masked = __lsx_vand_v(signs, mask); + + // 步骤4:水平相加(8-bit → 16-bit → 32-bit) + __m128i sum16 = __lsx_vhaddw_hu_bu(masked, masked); + __m128i sum32 = __lsx_vhaddw_wu_hu(sum16, sum16); + __m128i sum64 = __lsx_vhaddw_du_wu(sum32, sum32); + + // 步骤5:提取低16位结果 + return (uint16_t)__lsx_vpickve2gr_bu(sum64, 0) | (((uint16_t)__lsx_vpickve2gr_bu(sum64, 8)) << 8); +} + #endif namespace doris { diff --git a/build.sh b/build.sh index f2cb70c3ab..9f0384b7cf 100755 --- a/build.sh +++ b/build.sh @@ -476,7 +476,7 @@ if [[ "${BUILD_HIVE_UDF}" -eq 1 ]]; then modules+=("hive-udf") fi if [[ "${BUILD_BE_JAVA_EXTENSIONS}" -eq 1 ]]; then - modules+=("fe-common") + # modules+=("fe-common") modules+=("be-java-extensions/hudi-scanner") # don't compile hadoop-hudi-scanner for 2.1 now # modules+=("be-java-extensions/hadoop-hudi-scanner") @@ -503,8 +503,8 @@ FE_MODULES="$( # Clean and build Backend if [[ "${BUILD_BE}" -eq 1 ]]; then - update_submodule "be/src/apache-orc" "apache-orc" "https://github.com/apache/doris-thirdparty/archive/refs/heads/orc-for-doris-21.tar.gz" - update_submodule "be/src/clucene" "clucene" "https://github.com/apache/doris-thirdparty/archive/refs/heads/clucene-2.1.tar.gz" + #update_submodule "be/src/apache-orc" "apache-orc" "https://github.com/apache/doris-thirdparty/archive/refs/heads/orc-for-doris-21.tar.gz" + #update_submodule "be/src/clucene" "clucene" "https://github.com/apache/doris-thirdparty/archive/refs/heads/clucene-2.1.tar.gz" if [[ -e "${DORIS_HOME}/gensrc/build/gen_cpp/version.h" ]]; then rm -f "${DORIS_HOME}/gensrc/build/gen_cpp/version.h" fi @@ -548,7 +548,7 @@ if [[ "${BUILD_BE}" -eq 1 ]]; then -DENABLE_STACKTRACE="${ENABLE_STACKTRACE}" \ -DUSE_AVX2="${USE_AVX2}" \ -DGLIBC_COMPATIBILITY="${GLIBC_COMPATIBILITY}" \ - -DEXTRA_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \ + -DEXTRA_CXX_FLAGS="${EXTRA_CXX_FLAGS} -mcmodel=medium" \ -DENABLE_CLANG_COVERAGE="${DENABLE_CLANG_COVERAGE}" \ -DDORIS_JAVA_HOME="${JAVA_HOME}" \ "${DORIS_HOME}/be" @@ -605,28 +605,28 @@ if [[ "${BUILD_FE}" -eq 1 ]]; then fi # Clean and build Frontend -if [[ "${FE_MODULES}" != '' ]]; then - echo "Build Frontend Modules: ${FE_MODULES}" - cd "${DORIS_HOME}/fe" - if [[ "${CLEAN}" -eq 1 ]]; then - clean_fe - fi - if [[ "${DISABLE_JAVA_CHECK_STYLE}" = "ON" ]]; then - # Allowed user customer set env param USER_SETTINGS_MVN_REPO means settings.xml file path - if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then - "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C - else - "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} -T 1C - fi - else - if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then - "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C - else - "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} -T 1C - fi - fi - cd "${DORIS_HOME}" -fi +#if [[ "${FE_MODULES}" != '' ]]; then +# echo "Build Frontend Modules: ${FE_MODULES}" +# cd "${DORIS_HOME}/fe" +# if [[ "${CLEAN}" -eq 1 ]]; then +# clean_fe +# fi +# if [[ "${DISABLE_JAVA_CHECK_STYLE}" = "ON" ]]; then +# # Allowed user customer set env param USER_SETTINGS_MVN_REPO means settings.xml file path +# if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then +# "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C +# else +# "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} -T 1C +# fi +# else +# if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then +# "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C +# else +# "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} -T 1C +# fi +# fi +# cd "${DORIS_HOME}" +#fi # Clean and prepare output dir DORIS_OUTPUT=${DORIS_OUTPUT:="${DORIS_HOME}/output/"} diff --git a/loongarch_env.sh b/loongarch_env.sh new file mode 100644 index 0000000000..bbbe0279af --- /dev/null +++ b/loongarch_env.sh @@ -0,0 +1,3 @@ +export GLIBC_COMPATIBILITY=OFF +export USE_AVX2=0 +export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh index abdad19a97..5442142382 100755 --- a/thirdparty/build-thirdparty.sh +++ b/thirdparty/build-thirdparty.sh @@ -147,7 +147,7 @@ if [[ "${CLEAN}" -eq 1 ]] && [[ -d "${TP_SOURCE_DIR}" ]]; then fi # Download thirdparties. -"${TP_DIR}/download-thirdparty.sh" +#"${TP_DIR}/download-thirdparty.sh" export LD_LIBRARY_PATH="${TP_DIR}/installed/lib:${LD_LIBRARY_PATH}" @@ -317,8 +317,8 @@ build_libbacktrace() { check_if_source_exist "${LIBBACKTRACE_SOURCE}" cd "${TP_SOURCE_DIR}/${LIBBACKTRACE_SOURCE}" - CPPFLAGS="-I${TP_INCLUDE_DIR}" \ - CXXFLAGS="-I${TP_INCLUDE_DIR}" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ + CXXFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ ./configure --prefix="${TP_INSTALL_DIR}" @@ -334,8 +334,8 @@ build_libevent() { mkdir -p "${BUILD_DIR}" cd "${BUILD_DIR}" - CFLAGS="-std=c99 -D_BSD_SOURCE -fno-omit-frame-pointer -g -ggdb -O2 -I${TP_INCLUDE_DIR}" \ - CPPLAGS="-I${TP_INCLUDE_DIR}" \ + CFLAGS="-std=c99 -D_BSD_SOURCE -fno-omit-frame-pointer -g -ggdb -O2 -I${TP_INCLUDE_DIR} -mcmodel=medium" \ + CPPLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DEVENT__DISABLE_TESTS=ON \ -DEVENT__DISABLE_OPENSSL=ON -DEVENT__DISABLE_SAMPLES=ON -DEVENT__DISABLE_REGRESS=ON .. @@ -354,13 +354,17 @@ build_openssl() { OPENSSL_PLATFORM="darwin64-${MACHINE_TYPE}-cc" elif [[ "${MACHINE_TYPE}" == "aarch64" ]]; then OPENSSL_PLATFORM="linux-aarch64" + elif [[ "${MACHINE_TYPE}" == "loongarch64" ]]; then + # todo: add loongarch64 asm + OPENSSL_PLATFORM="linux-generic64" + OPENSSL_CONFIG_FLAGS="no-asm" fi check_if_source_exist "${OPENSSL_SOURCE}" cd "${TP_SOURCE_DIR}/${OPENSSL_SOURCE}" - CPPFLAGS="-I${TP_INCLUDE_DIR}" \ - CXXFLAGS="-I${TP_INCLUDE_DIR}" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ + CXXFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ LIBDIR="lib" \ ./Configure --prefix="${TP_INSTALL_DIR}" --with-rand-seed=devrandom -shared "${OPENSSL_PLATFORM}" @@ -385,8 +389,8 @@ build_thrift() { cd "${TP_SOURCE_DIR}/${THRIFT_SOURCE}" if [[ "${KERNEL}" != 'Darwin' ]]; then - cflags="-I${TP_INCLUDE_DIR}" - cxxflags="-I${TP_INCLUDE_DIR} ${warning_unused_but_set_variable} -Wno-inconsistent-missing-override" + cflags="-I${TP_INCLUDE_DIR} -mcmodel=medium" + cxxflags="-I${TP_INCLUDE_DIR} ${warning_unused_but_set_variable} -Wno-inconsistent-missing-override -mcmodel=medium" ldflags="-L${TP_LIB_DIR} --static" else cflags="-I${TP_INCLUDE_DIR} -Wno-implicit-function-declaration -Wno-inconsistent-missing-override" @@ -426,7 +430,7 @@ build_protobuf() { mkdir -p cmake/build cd cmake/build - CXXFLAGS="-O2 -I${TP_INCLUDE_DIR}" \ + CXXFLAGS="-O2 -I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="${ldflags}" \ "${CMAKE_CMD}" -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_PREFIX_PATH="${TP_INSTALL_DIR}" \ @@ -456,6 +460,7 @@ build_gflags() { rm -rf CMakeCache.txt CMakeFiles/ "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \ + -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \ -DCMAKE_BUILD_TYPE=Release -DCMAKE_POSITION_INDEPENDENT_CODE=On ../ "${BUILD_SYSTEM}" -j "${PARALLEL}" @@ -472,7 +477,7 @@ build_glog() { rm -rf config.* autoreconf -i - CPPFLAGS="-I${TP_INCLUDE_DIR} -fpermissive -fPIC" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -fpermissive -fPIC -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ ./configure --prefix="${TP_INSTALL_DIR}" --enable-frame-pointers --disable-shared --enable-static @@ -485,6 +490,7 @@ build_glog() { -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DWITH_UNWIND=OFF \ -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \ -DWITH_TLS=OFF cmake --build build --target install @@ -538,7 +544,7 @@ build_snappy() { rm -rf CMakeCache.txt CMakeFiles/ - CFLAGS="-O3" CXXFLAGS="-O3" "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \ + CFLAGS="-O3 -mcmodel=medium" CXXFLAGS="-O3 -mcmodel=medium" "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DCMAKE_INSTALL_INCLUDEDIR="${TP_INCLUDE_DIR}"/snappy \ -DSNAPPY_BUILD_TESTS=0 ../ @@ -561,7 +567,7 @@ build_gperftools() { ./autogen.sh fi - CPPFLAGS="-I${TP_INCLUDE_DIR}" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ LD_LIBRARY_PATH="${TP_LIB_DIR}" \ LDFLAGS="-L${TP_LIB_DIR}" \ @@ -577,8 +583,8 @@ build_zlib() { check_if_source_exist "${ZLIB_SOURCE}" cd "${TP_SOURCE_DIR}/${ZLIB_SOURCE}" - CFLAGS="-O3 -fPIC" \ - CPPFLAGS="-I${TP_INCLUDE_DIR}" \ + CFLAGS="-O3 -fPIC -mcmodel=medium" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ ./configure --prefix="${TP_INSTALL_DIR}" @@ -604,6 +610,8 @@ build_lz4() { rm -f "${TP_INSTALL_DIR}/bin/${link}" done + export CFLAGS="-O2 -mcmodel=medium" + export CXXFLAGS="-O2 -mcmodel=medium" make -j "${PARALLEL}" install PREFIX="${TP_INSTALL_DIR}" BUILD_SHARED=no INCLUDEDIR="${TP_INCLUDE_DIR}/lz4" } @@ -616,6 +624,7 @@ build_zstd() { cd "${BUILD_DIR}" "${CMAKE_CMD}" -G "${GENERATOR}" -DBUILD_TESTING=OFF -DZSTD_BUILD_TESTS=OFF -DZSTD_BUILD_STATIC=ON \ + -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \ -DZSTD_BUILD_PROGRAMS=OFF -DZSTD_BUILD_SHARED=OFF -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" .. "${BUILD_SYSTEM}" -j "${PARALLEL}" install @@ -627,6 +636,8 @@ build_bzip() { check_if_source_exist "${BZIP_SOURCE}" cd "${TP_SOURCE_DIR}/${BZIP_SOURCE}" + export CFLAGS="-O2 -mcmodel=medium -fPIC" + export CXXFLAGS="-O2 -mcmodel=medium -fPIC" make -j "${PARALLEL}" install PREFIX="${TP_INSTALL_DIR}" } @@ -635,8 +646,8 @@ build_lzo2() { check_if_source_exist "${LZO2_SOURCE}" cd "${TP_SOURCE_DIR}/${LZO2_SOURCE}" - CPPFLAGS="-I${TP_INCLUDE_DIR}" \ - LDFLAGS="-L${TP_LIB_DIR}" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ + LDFLAGS="-L${TP_LIB_DIR} -mcmodel=medium" \ ./configure --prefix="${TP_INSTALL_DIR}" --disable-shared --enable-static make -j "${PARALLEL}" @@ -662,7 +673,7 @@ build_curl() { libs='-lcrypto -lssl -lcrypto -ldl' fi - CPPFLAGS="-I${TP_INCLUDE_DIR} " \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" LIBS="${libs}" \ PKG_CONFIG="pkg-config --static" \ ./configure --prefix="${TP_INSTALL_DIR}" --disable-shared --enable-static \ @@ -680,12 +691,14 @@ build_re2() { cd "${TP_SOURCE_DIR}/${RE2_SOURCE}" "${CMAKE_CMD}" -DCMAKE_BUILD_TYPE=Release -G "${GENERATOR}" -DBUILD_SHARED_LIBS=0 -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_C_FLAGS="-mcmodel=medium -fPIC" -DCMAKE_CXX_FLAGS="-mcmodel=medium -fPIC" \ -DCMAKE_PREFIX_PATH="${TP_INSTALL_DIR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" "${BUILD_SYSTEM}" -j "${PARALLEL}" install strip_lib libre2.a } # hyperscan +# on loongarch hyperscan need use gcc build_hyperscan() { check_if_source_exist "${RAGEL_SOURCE}" cd "${TP_SOURCE_DIR}/${RAGEL_SOURCE}" @@ -696,7 +709,7 @@ build_hyperscan() { cxxflags='' fi - CXXFLAGS="${cxxflags}" \ + CXXFLAGS="${cxxflags} -mcmodel=medium" \ ./configure --prefix="${TP_INSTALL_DIR}" make install @@ -711,6 +724,7 @@ build_hyperscan() { CXXFLAGS="-D_HAS_AUTO_PTR_ETC=0" \ "${CMAKE_CMD}" -G "${GENERATOR}" -DBUILD_SHARED_LIBS=0 -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_C_FLAGS="-mcmodel=medium -mno-relax" -DCMAKE_CXX_FLAGS="-mcmodel=medium -mno-relax" \ -DBOOST_ROOT="${TP_INSTALL_DIR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DBUILD_EXAMPLES=OFF .. "${BUILD_SYSTEM}" -j "${PARALLEL}" install strip_lib libhs.a @@ -727,7 +741,7 @@ build_boost() { cxxflags='' fi - CXXFLAGS="${cxxflags}" \ + CXXFLAGS="${cxxflags} -mcmodel=medium" \ ./bootstrap.sh --prefix="${TP_INSTALL_DIR}" --with-toolset="${boost_toolset}" # -q: Fail at first error ./b2 -q link=static runtime-link=static -j "${PARALLEL}" \ @@ -763,6 +777,7 @@ build_mysql() { "${CMAKE_CMD}" -G "${GENERATOR}" ../ -DCMAKE_LINK_SEARCH_END_STATIC=1 \ -DWITH_BOOST="$(pwd)/${BOOST_SOURCE}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}/mysql" \ -DWITHOUT_SERVER=1 -DWITH_ZLIB=1 -DZLIB_ROOT="${TP_INSTALL_DIR}" \ + -DCMAKE_C_FLAGS="-mcmodel=medium " -DCMAKE_CXX_FLAGS="-mcmodel=medium " \ -DCMAKE_CXX_FLAGS_RELWITHDEBINFO="-O3 -g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing -std=gnu++11" \ -DDISABLE_SHARED=1 -DBUILD_SHARED_LIBS=0 -DZLIB_LIBRARY="${TP_INSTALL_DIR}/lib/libz.a" -DENABLE_DTRACE=0 "${BUILD_SYSTEM}" -j "${PARALLEL}" mysqlclient @@ -791,13 +806,14 @@ build_leveldb() { rm -rf CMakeCache.txt CMakeFiles/ - CXXFLAGS="-fPIC" "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DLEVELDB_BUILD_BENCHMARKS=OFF \ + CXXFLAGS="-fPIC -mcmodel=medium" "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DLEVELDB_BUILD_BENCHMARKS=OFF \ -DLEVELDB_BUILD_TESTS=OFF .. "${BUILD_SYSTEM}" -j "${PARALLEL}" install strip_lib libleveldb.a } # brpc +# on loongarch brpc need build_brpc() { check_if_source_exist "${BRPC_SOURCE}" @@ -828,6 +844,7 @@ build_brpc() { "${CMAKE_CMD}" -G "${GENERATOR}" -DBUILD_SHARED_LIBS=ON -DWITH_GLOG=ON -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \ -DCMAKE_LIBRARY_PATH="${TP_INSTALL_DIR}/lib64" -DCMAKE_INCLUDE_PATH="${TP_INSTALL_DIR}/include" \ -DBUILD_BRPC_TOOLS=OFF \ + -DCMAKE_C_FLAGS="-O1 -mcmodel=medium -fPIC" -DCMAKE_CXX_FLAGS="-O1 -mcmodel=medium -fPIC" \ -DPROTOBUF_PROTOC_EXECUTABLE="${TP_INSTALL_DIR}/bin/protoc" .. "${BUILD_SYSTEM}" -j "${PARALLEL}" @@ -854,7 +871,7 @@ build_rocksdb() { fi # -Wno-range-loop-construct gcc-11 - CFLAGS="-I ${TP_INCLUDE_DIR} -I ${TP_INCLUDE_DIR}/snappy -I ${TP_INCLUDE_DIR}/lz4" \ + CFLAGS="-I ${TP_INCLUDE_DIR} -I ${TP_INCLUDE_DIR}/snappy -I ${TP_INCLUDE_DIR}/lz4 -mcmodel=medium" \ CXXFLAGS="-include cstdint -Wno-deprecated-copy ${warning_stringop_truncation} ${warning_shadow} ${warning_dangling_gsl} \ ${warning_defaulted_function_deleted} ${warning_unused_but_set_variable} -Wno-pessimizing-move -Wno-range-loop-construct" \ LDFLAGS="${ldflags}" \ @@ -869,8 +886,8 @@ build_cyrus_sasl() { check_if_source_exist "${CYRUS_SASL_SOURCE}" cd "${TP_SOURCE_DIR}/${CYRUS_SASL_SOURCE}" - CFLAGS="-fPIC -Wno-implicit-function-declaration" \ - CPPFLAGS="-I${TP_INCLUDE_DIR}" \ + CFLAGS="-fPIC -Wno-implicit-function-declaration -mcmodel=medium" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ LIBS="-lcrypto" \ ./configure --prefix="${TP_INSTALL_DIR}" --enable-static --enable-shared=no --with-openssl="${TP_INSTALL_DIR}" --with-pic --enable-gssapi="${TP_INSTALL_DIR}" --with-gss_impl=mit --with-dblib=none @@ -894,7 +911,7 @@ build_librdkafka() { # As a result, we use a patch to hard code "--static" into PKG_CONFIG instead. # PKG_CONFIG="pkg-config --static" - CPPFLAGS="-I${TP_INCLUDE_DIR}" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium -mno-relax" \ LDFLAGS="-L${TP_LIB_DIR} -lssl -lcrypto -lzstd -lz -lsasl2 \ -lgssapi_krb5 -lkrb5 -lkrb5support -lk5crypto -lcom_err -lresolv" \ ./configure --prefix="${TP_INSTALL_DIR}" --enable-static --enable-sasl --disable-c11threads @@ -913,7 +930,7 @@ build_libunixodbc() { cd "${TP_SOURCE_DIR}/${ODBC_SOURCE}" - CFLAGS="-I${TP_INCLUDE_DIR} -Wno-int-conversion -Wno-implicit-function-declaration" \ + CFLAGS="-I${TP_INCLUDE_DIR} -Wno-int-conversion -Wno-implicit-function-declaration -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ ./configure --prefix="${TP_INSTALL_DIR}" --with-included-ltdl --enable-static=yes --enable-shared=no @@ -940,6 +957,7 @@ build_flatbuffers() { LDFLAGS="${ldflags}" \ "${CMAKE_CMD}" -G "${GENERATOR}" \ -DFLATBUFFERS_CXX_FLAGS="${warning_class_memaccess} ${warning_unused_but_set_variable}" \ + -DCMAKE_C_FLAGS="-mcmodel=medium " -DCMAKE_CXX_FLAGS="-mcmodel=medium " \ -DFLATBUFFERS_BUILD_TESTS=OFF \ .. @@ -961,6 +979,7 @@ build_cares() { -DCARES_STATIC=ON \ -DCARES_SHARED=OFF \ -DCARES_STATIC_PIC=ON \ + -DCMAKE_C_FLAGS="-mcmodel=medium " -DCMAKE_CXX_FLAGS="-mcmodel=medium " \ -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" .. make make install @@ -991,6 +1010,7 @@ build_grpc() { -DgRPC_ZLIB_PROVIDER=package \ -DZLIB_ROOT="${TP_INSTALL_DIR}" \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \ ../.. make -j "${PARALLEL}" @@ -1029,6 +1049,7 @@ build_arrow() { LDFLAGS="${ldflags}" \ "${CMAKE_CMD}" -G "${GENERATOR}" -DARROW_PARQUET=ON -DARROW_IPC=ON -DARROW_BUILD_SHARED=OFF \ + -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \ -DARROW_BUILD_STATIC=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_LZ4=ON -DARROW_USE_GLOG=ON \ -DARROW_WITH_SNAPPY=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON -DARROW_JSON=ON \ -DARROW_WITH_UTF8PROC=OFF -DARROW_WITH_RE2=ON -DARROW_ORC=ON \ @@ -1093,6 +1114,7 @@ build_abseil() { -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DABSL_PROPAGATE_CXX_STD=ON \ + -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \ -DBUILD_SHARED_LIBS=OFF cmake --build "${BUILD_DIR}" -j "${PARALLEL}" @@ -1115,6 +1137,7 @@ build_s2() { -DBUILD_SHARED_LIBS=OFF \ -DWITH_GFLAGS=ON \ -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \ -DCMAKE_LIBRARY_PATH="${TP_INSTALL_DIR}" .. "${BUILD_SYSTEM}" -j "${PARALLEL}" @@ -1220,7 +1243,7 @@ build_croaringbitmap() { ldflags="-L${TP_LIB_DIR}" fi - CXXFLAGS="-O3" \ + CXXFLAGS="-O3 -mcmodel=medium" \ LDFLAGS="${ldflags}" \ "${CMAKE_CMD}" -G "${GENERATOR}" ${avx_flag:+${avx_flag}} -DROARING_BUILD_STATIC=ON -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \ -DENABLE_ROARING_TESTS=OFF .. @@ -1275,7 +1298,7 @@ build_orc() { rm -rf CMakeCache.txt CMakeFiles/ - CXXFLAGS="-O3 -Wno-array-bounds ${warning_reserved_identifier} ${warning_suggest_override}" \ + CXXFLAGS="-O3 -Wno-array-bounds ${warning_reserved_identifier} ${warning_suggest_override} -mcmodel=medium -fPIC" \ "${CMAKE_CMD}" -G "${GENERATOR}" ../ -DBUILD_JAVA=OFF \ -DPROTOBUF_HOME="${TP_INSTALL_DIR}" \ -DSNAPPY_HOME="${TP_INSTALL_DIR}" \ @@ -1346,7 +1369,7 @@ build_aws_sdk() { -DCMAKE_PREFIX_PATH="${TP_INSTALL_DIR}" -DBUILD_SHARED_LIBS=OFF -DENABLE_TESTING=OFF \ -DCURL_LIBRARY_RELEASE="${TP_INSTALL_DIR}/lib/libcurl.a" -DZLIB_LIBRARY_RELEASE="${TP_INSTALL_DIR}/lib/libz.a" \ -DBUILD_ONLY="core;s3;s3-crt;transfer;identity-management;sts" \ - -DCMAKE_CXX_FLAGS="-Wno-nonnull -Wno-deprecated-declarations ${warning_dangling_reference}" -DCPP_STANDARD=17 + -DCMAKE_CXX_FLAGS="-Wno-nonnull -Wno-deprecated-declarations ${warning_dangling_reference} -mcmodel=medium" -DCPP_STANDARD=17 cd "${BUILD_DIR}" @@ -1414,7 +1437,7 @@ build_xml2() { mkdir -p "${BUILD_DIR}" cd "${BUILD_DIR}" - CPPLAGS="-I${TP_INCLUDE_DIR}" \ + CPPLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ ../configure --prefix="${TP_INSTALL_DIR}" --enable-shared=no --with-pic --with-python=no --with-lzma="${TP_INSTALL_DIR}" @@ -1431,6 +1454,8 @@ build_idn() { mkdir -p "${BUILD_DIR}" cd "${BUILD_DIR}" + CFLAGS="-O2 -mcmodel=medium" \ + CXXFLAGS="-O2 -mcmodel=medium" \ ../configure --prefix="${TP_INSTALL_DIR}" --enable-shared=no --with-pic make -j "${PARALLEL}" @@ -1446,7 +1471,7 @@ build_gsasl() { cd "${BUILD_DIR}" KRB5_CONFIG="${TP_INSTALL_DIR}/bin/krb5-config" \ - CFLAGS="-I${TP_INCLUDE_DIR} -Wno-implicit-function-declaration" \ + CFLAGS="-I${TP_INCLUDE_DIR} -Wno-implicit-function-declaration -mcmodel=medium" \ ../configure --prefix="${TP_INSTALL_DIR}" --with-gssapi-impl=mit --enable-shared=no --with-pic --with-libidn-prefix="${TP_INSTALL_DIR}" make -j "${PARALLEL}" @@ -1465,7 +1490,7 @@ build_krb5() { with_crypto_impl='--with-crypto-impl=openssl' fi - CFLAGS="-fcommon -fPIC -I${TP_INSTALL_DIR}/include" LDFLAGS="-L${TP_INSTALL_DIR}/lib" \ + CFLAGS="-fcommon -fPIC -I${TP_INSTALL_DIR}/include -mcmodel=medium" LDFLAGS="-L${TP_INSTALL_DIR}/lib" \ ../configure --prefix="${TP_INSTALL_DIR}" --disable-shared --enable-static \ --without-keyutils ${with_crypto_impl:+${with_crypto_impl}} @@ -1496,7 +1521,8 @@ build_hdfs3() { -DKERBEROS_LIBRARIES="${TP_INSTALL_DIR}/lib/libkrb5.a" \ -DGSASL_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \ -DGSASL_LIBRARIES="${TP_INSTALL_DIR}/lib/libgsasl.a" \ - -DCMAKE_CXX_FLAGS='-include cstdint' \ + -DCMAKE_C_FLAGS="-mcmodel=medium" \ + -DCMAKE_CXX_FLAGS='-include cstdint -mcmodel=medium' \ .. make CXXFLAGS="${libhdfs_cxx17}" -j "${PARALLEL}" @@ -1512,7 +1538,7 @@ build_jemalloc() { mkdir -p "${BUILD_DIR}" cd "${BUILD_DIR}" - cflags='-O3 -fno-omit-frame-pointer -fPIC -g' + cflags='-O3 -fno-omit-frame-pointer -fPIC -g -mcmodel=medium' # Build jemalloc --with-lg-page=16 in order to make the wheel work on both 4k and 64k page arm64 systems. # Jemalloc compiled on a system with page size 4K can only run on a system with the same page size 4K. # If it is run on a system with page size > 4K, an error `unsupported system page size`. @@ -1525,7 +1551,7 @@ build_jemalloc() { WITH_LG_PAGE='' fi - CFLAGS="${cflags}" ../configure --prefix="${TP_INSTALL_DIR}" --with-install-suffix="_doris" "${WITH_LG_PAGE}" \ + CFLAGS="${cflags} -mcmodel=medium" ../configure --prefix="${TP_INSTALL_DIR}" --with-install-suffix="_doris" "${WITH_LG_PAGE}" \ --with-jemalloc-prefix=je --enable-prof --disable-cxx --disable-libdl --disable-shared make -j "${PARALLEL}" @@ -1592,7 +1618,7 @@ build_simdjson() { mkdir -p "${BUILD_DIR}" cd "${BUILD_DIR}" - CXXFLAGS="-O3" CFLAGS="-O3" \ + CXXFLAGS="-O3 -mcmodel=medium" CFLAGS="-O3 -mcmodel=medium" \ "${CMAKE_CMD}" -DSIMDJSON_EXCEPTIONS=OFF \ -DSIMDJSON_DEVELOPER_MODE=OFF -DSIMDJSON_BUILD_STATIC=ON \ -DSIMDJSON_JUST_LIBRARY=ON -DSIMDJSON_ENABLE_THREADS=ON .. @@ -1682,6 +1708,7 @@ build_hadoop_libs() { check_if_source_exist "${HADOOP_LIBS_SOURCE}" cd "${TP_SOURCE_DIR}/${HADOOP_LIBS_SOURCE}" echo "THIRDPARTY_INSTALLED=${TP_INSTALL_DIR}" >env.sh + export MAVEN_OPTS="-Dos.detected.arch=loongarch64" ./build.sh rm -rf "${TP_INSTALL_DIR}/include/hadoop_hdfs/" @@ -1762,8 +1789,8 @@ build_ali_sdk() { mkdir -p "${BUILD_DIR}" cd "${BUILD_DIR}" - CPPFLAGS="-I${TP_INCLUDE_DIR}" \ - CXXFLAGS="-I${TP_INCLUDE_DIR}" \ + CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ + CXXFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \ LDFLAGS="-L${TP_LIB_DIR}" \ "${CMAKE_CMD}" -G "${GENERATOR}" -DBUILD_PRODUCT=core -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \ -DTP_INSTALL_DIR="${TP_INSTALL_DIR}" .. @@ -1799,7 +1826,10 @@ build_icu() { rm -rf "${BUILD_DIR}" mkdir -p "${BUILD_DIR}" cd "${BUILD_DIR}" + cp -rf ${TP_PATCH_DIR}/config.guess ${TP_PATCH_DIR}/patches/config.sub ${TP_SOURCE_DIR}/${ICU_SOURCE}/icu4c/source + CFLAGS="${CFLAGS} -mcmodel=medium" \ + CPPFLAGS="${CPPFLAGS} -mcmodel=medium" \ ../configure --prefix="${TP_INSTALL_DIR}" \ --enable-static \ --disable-shared \ diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh index c7f2eea4c3..e84985d044 100755 --- a/thirdparty/download-thirdparty.sh +++ b/thirdparty/download-thirdparty.sh @@ -360,6 +360,15 @@ if [[ "${HYPERSCAN_SOURCE}" == "vectorscan-vectorscan-5.4.11" ]]; then fi cd - fi + +cd "${TP_SOURCE_DIR}/${HYPERSCAN_SOURCE}" +if [[ ! -f "${PATCHED_MARK}" ]]; then + patch -p1 <"${TP_PATCH_DIR}/add-loongarch64-support-hyperscan-5.4.2.patch" + patch -p1 <"${TP_PATCH_DIR}/add-the-parameter-mlsx.patch" + touch "${PATCHED_MARK}" +fi +cd - + echo "Finished patching ${HYPERSCAN_SOURCE}" cd "${TP_SOURCE_DIR}/${AWS_SDK_SOURCE}" @@ -471,5 +480,15 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " THRIFT " ]]; then echo "Finished patching ${THRIFT_SOURCE}" fi +# patch hdfs3 +if [[ "${HDFS3_SOURCE}" = "doris-thirdparty-libhdfs3-v2.3.9" ]]; then + cd "${TP_SOURCE_DIR}/${HDFS3_SOURCE}" + if [[ ! -f "${PATCHED_MARK}" ]]; then + patch -p1 <"${TP_PATCH_DIR}/libhdfs3-add-loongarch-support.patch" + touch "${PATCHED_MARK}" + fi + cd - +fi +echo "Finished patching ${HDFS3_SOURCE}" # vim: ts=4 sw=4 ts=4 tw=100: diff --git a/thirdparty/patches/add-loongarch64-support-hyperscan-5.4.2.patch b/thirdparty/patches/add-loongarch64-support-hyperscan-5.4.2.patch new file mode 100644 index 0000000000..2479bb4406 --- /dev/null +++ b/thirdparty/patches/add-loongarch64-support-hyperscan-5.4.2.patch @@ -0,0 +1,4322 @@ +From 6e29e1e679ea60d7152e37a8949805b80054cdea Mon Sep 17 00:00:00 2001 +From: Jingyun Hua +Date: Wed, 5 Jul 2023 07:11:50 +0000 +Subject: [PATCH] add loongarch64 support hyperscan-5.4.2 + +--- + CMakeLists.txt | 14 +- + cmake/arch.cmake | 21 +- + cmake/config.h.in | 6 + + cmake/platform.cmake | 13 +- + src/hs_valid_platform.c | 4 + + src/nfa/shufti.c | 6 +- + src/nfa/truffle.c | 4 +- + src/rose/counting_miracle.h | 2 +- + src/util/arch.h | 7 + + src/util/cpuid_flags.c | 11 +- + src/util/cpuid_flags.h | 2 +- + src/util/cpuid_inline.h | 8 +- + src/util/intrinsics.h | 6 + + src/util/simd_loongarch.h | 956 +++++++++++++++++++++++ + src/util/simd_types.h | 3 + + src/util/simd_utils.h | 1386 +--------------------------------- + src/util/simd_x86.h | 1420 +++++++++++++++++++++++++++++++++++ + src/util/state_compress.c | 42 +- + 18 files changed, 2479 insertions(+), 1432 deletions(-) + create mode 100644 src/util/simd_loongarch.h + create mode 100644 src/util/simd_x86.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 7757916..4289817 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -182,7 +182,7 @@ else() + string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") + endforeach () + +- if (CMAKE_COMPILER_IS_GNUCC) ++ if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC) + message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") + # If gcc doesn't recognise the host cpu, then mtune=native becomes + # generic, which isn't very good in some cases. march=native looks at +@@ -289,10 +289,14 @@ else() + endif() + + CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) +-CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) +-CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) +-CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) +-CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) ++if (ARCH_IA32 OR ARCH_X86_64) ++ CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H) ++ CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H) ++ CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H) ++ CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H) ++elseif (ARCH_LOONGARCH64) ++ CHECK_INCLUDE_FILES(lsxintrin.h HAVE_C_LSXINTRIN_H) ++endif() + + CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN) + CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) +diff --git a/cmake/arch.cmake b/cmake/arch.cmake +index eb4791e..e8b5f11 100644 +--- a/cmake/arch.cmake ++++ b/cmake/arch.cmake +@@ -6,7 +6,10 @@ if (HAVE_C_X86INTRIN_H) + set (INTRIN_INC_H "x86intrin.h") + elseif (HAVE_C_INTRIN_H) + set (INTRIN_INC_H "intrin.h") +-else () ++elseif (HAVE_C_LSXINTRIN_H) ++ set (INTRIN_INC_H "lsxintrin.h") ++ set (FAT_RUNTIME OFF) ++else() + message (FATAL_ERROR "No intrinsics header found") + endif () + +@@ -82,29 +85,29 @@ int main(){ + }" HAVE_AVX512VBMI) + + if (FAT_RUNTIME) +- if (NOT HAVE_SSSE3) ++ if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) + message(FATAL_ERROR "SSSE3 support required to build fat runtime") + endif () +- if (NOT HAVE_AVX2) ++ if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2) + message(FATAL_ERROR "AVX2 support required to build fat runtime") + endif () +- if (BUILD_AVX512 AND NOT HAVE_AVX512) ++ if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512 AND NOT HAVE_AVX512) + message(FATAL_ERROR "AVX512 support requested but not supported") + endif () +- if (BUILD_AVX512VBMI AND NOT HAVE_AVX512VBMI) ++ if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512VBMI AND NOT HAVE_AVX512VBMI) + message(FATAL_ERROR "AVX512VBMI support requested but not supported") + endif () + else (NOT FAT_RUNTIME) +- if (NOT HAVE_AVX2) ++ if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2) + message(STATUS "Building without AVX2 support") + endif () +- if (NOT HAVE_AVX512) ++ if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512) + message(STATUS "Building without AVX512 support") + endif () +- if (NOT HAVE_AVX512VBMI) ++ if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512VBMI) + message(STATUS "Building without AVX512VBMI support") + endif () +- if (NOT HAVE_SSSE3) ++ if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3) + message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") + endif () + endif () +diff --git a/cmake/config.h.in b/cmake/config.h.in +index 5454643..43827fe 100644 +--- a/cmake/config.h.in ++++ b/cmake/config.h.in +@@ -15,6 +15,9 @@ + /* "Define if building for EM64T" */ + #cmakedefine ARCH_X86_64 + ++/* "Define if building for LOONGARCH64" */ ++#cmakedefine ARCH_LOONGARCH64 ++ + /* internal build, switch on dump support. */ + #cmakedefine DUMP_SUPPORT + +@@ -48,6 +51,9 @@ + /* C compiler has intrin.h */ + #cmakedefine HAVE_C_INTRIN_H + ++/* C compiler has lsxintrin.h */ ++#cmakedefine HAVE_C_LSXINTRIN_H ++ + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to + 0 if you don't. */ + #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP +diff --git a/cmake/platform.cmake b/cmake/platform.cmake +index 593c544..aba432f 100644 +--- a/cmake/platform.cmake ++++ b/cmake/platform.cmake +@@ -1,9 +1,14 @@ + # determine the target arch + + # really only interested in the preprocessor here +-CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT) ++CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64) + +-CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT) ++CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32) + +-set(ARCH_X86_64 ${ARCH_64_BIT}) +-set(ARCH_IA32 ${ARCH_32_BIT}) ++CHECK_C_SOURCE_COMPILES("#if !defined(__loongarch64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_LOONGARCH64) ++ ++if (ARCH_X86_64 OR ARCH_LOONGARCH64) ++ set(ARCH_64_BIT 1) ++elseif () ++ set(ARCH_32_BIT 1) ++endif() +diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c +index 59ad3f3..564d179 100644 +--- a/src/hs_valid_platform.c ++++ b/src/hs_valid_platform.c +@@ -33,9 +33,13 @@ + HS_PUBLIC_API + hs_error_t HS_CDECL hs_valid_platform(void) { + /* Hyperscan requires SSSE3, anything else is a bonus */ ++#if defined(__x86_64__) + if (check_ssse3()) { + return HS_SUCCESS; + } else { + return HS_ARCH_ERROR; + } ++#elif defined(ARCH_LOONGARCH64) ++ return HS_SUCCESS; ++#endif + } +diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c +index 09ffc0c..c8776a7 100644 +--- a/src/nfa/shufti.c ++++ b/src/nfa/shufti.c +@@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + } + + const m128 zeroes = zeroes128(); +- const m128 low4bits = _mm_set1_epi8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; +@@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + } + + const m128 zeroes = zeroes128(); +- const m128 low4bits = _mm_set1_epi8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + assert(buf_end - buf >= 16); +@@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + const m128 ones = ones128(); +- const m128 low4bits = _mm_set1_epi8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; +diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c +index be6b312..f208854 100644 +--- a/src/nfa/truffle.c ++++ b/src/nfa/truffle.c +@@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) { + static really_inline + u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { + +- m128 highconst = _mm_set1_epi8(0x80); +- m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); ++ m128 highconst = __lsx_vldi(0x80); ++ m128 shuf_mask_hi = __lsx_vreplgr2vr_d(0x8040201008040201); + + // and now do the real work + m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); +diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h +index 976208b..1cf5189 100644 +--- a/src/rose/counting_miracle.h ++++ b/src/rose/counting_miracle.h +@@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, + u32 count = *count_inout; + + const m128 zeroes = zeroes128(); +- const m128 low4bits = _mm_set1_epi8(0xf); ++ const m128 low4bits = __lsx_vldi(0xf); + + for (; d + 16 <= d_end; d_end -= 16) { + m128 data = loadu128(d_end - 16); +diff --git a/src/util/arch.h b/src/util/arch.h +index 985fec6..296d322 100644 +--- a/src/util/arch.h ++++ b/src/util/arch.h +@@ -87,4 +87,11 @@ + #define NO_ASM + #endif + ++/* ++ * LOONGARCH64 uses a different form of inline asm ++ */ ++#if defined(__loongarch64) ++#define NO_ASM ++#endif ++ + #endif // UTIL_ARCH_H_ +diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c +index c00ce58..705794a 100644 +--- a/src/util/cpuid_flags.c ++++ b/src/util/cpuid_flags.c +@@ -33,13 +33,15 @@ + #include "hs_internal.h" + #include "util/arch.h" + ++#if defined(__x86_64__) || defined(_M_X64) + #if !defined(_WIN32) && !defined(CPUID_H_) + #include + #endif ++#endif + + u64a cpuid_flags(void) { + u64a cap = 0; +- ++#if defined(__X86_64__) + if (check_avx2()) { + DEBUG_PRINTF("AVX2 enabled\n"); + cap |= HS_CPU_FEATURES_AVX2; +@@ -68,7 +70,7 @@ u64a cpuid_flags(void) { + (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) + cap &= ~HS_CPU_FEATURES_AVX512VBMI; + #endif +- ++#endif + return cap; + } + +@@ -78,6 +80,7 @@ struct family_id { + u32 tune; + }; + ++#if defined(__X86_64__) + /* from table 35-1 of the Intel 64 and IA32 Arch. Software Developer's Manual + * and "Intel Architecture and Processor Identification With CPUID Model and + * Family Numbers" */ +@@ -121,6 +124,7 @@ static const struct family_id known_microarch[] = { + { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ + + }; ++#endif + + #ifdef DUMP_SUPPORT + static UNUSED +@@ -144,6 +148,7 @@ const char *dumpTune(u32 tune) { + #endif + + u32 cpuid_tune(void) { ++#if defined(__X86_64__) + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); +@@ -171,6 +176,6 @@ u32 cpuid_tune(void) { + DEBUG_PRINTF("found tune flag %s\n", dumpTune(tune) ); + return tune; + } +- ++#endif + return HS_TUNE_FAMILY_GENERIC; + } +diff --git a/src/util/cpuid_flags.h b/src/util/cpuid_flags.h +index 527c6d5..68e427d 100644 +--- a/src/util/cpuid_flags.h ++++ b/src/util/cpuid_flags.h +@@ -31,7 +31,7 @@ + + #include "ue2common.h" + +-#if !defined(_WIN32) && !defined(CPUID_H_) ++#if (defined(ARCH_IA32) || defined(ARCH_X86_64)) && !defined(_WIN32) && !defined(CPUID_H_) + #include + /* system header doesn't have a header guard */ + #define CPUID_H_ +diff --git a/src/util/cpuid_inline.h b/src/util/cpuid_inline.h +index b7b4245..425bcfc 100644 +--- a/src/util/cpuid_inline.h ++++ b/src/util/cpuid_inline.h +@@ -32,17 +32,20 @@ + #include "ue2common.h" + #include "cpuid_flags.h" + ++#if defined(__x86_64__) || defined(_M_X64) + #if !defined(_WIN32) && !defined(CPUID_H_) + #include + /* system header doesn't have a header guard */ + #define CPUID_H_ + #endif ++#endif + + #ifdef __cplusplus + extern "C" + { + #endif + ++#if defined(__x86_64__) || defined(_M_X64) + static inline + void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, + unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { +@@ -57,6 +60,7 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, + *edx = a[3]; + #endif + } ++#endif + + // ECX + #define CPUID_SSE3 (1 << 0) +@@ -93,11 +97,12 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, + #define CPUID_XCR0_AVX512 \ + (CPUID_XCR0_OPMASK | CPUID_XCR0_ZMM_Hi256 | CPUID_XCR0_Hi16_ZMM) + ++#if defined(__x86_64__) + static inline + u64a xgetbv(u32 op) { + #if defined(_WIN32) || defined(__INTEL_COMPILER) + return _xgetbv(op); +-#else ++#elif defined(__x86_64__) + u32 a, d; + __asm__ volatile ( + "xgetbv\n" +@@ -252,6 +257,7 @@ int check_popcnt(void) { + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & CPUID_POPCNT); + } ++#endif //__x86_64__ + + #ifdef __cplusplus + } /* extern "C" */ +diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h +index edc4f6e..1094e72 100644 +--- a/src/util/intrinsics.h ++++ b/src/util/intrinsics.h +@@ -45,6 +45,10 @@ + # endif + #endif + ++#if defined(HAVE_C_LSXINTRIN_H) ++# define USE_LSXINTRIN_H ++#endif ++ + #ifdef __cplusplus + # if defined(HAVE_CXX_INTRIN_H) + # define USE_INTRIN_H +@@ -59,6 +63,8 @@ + #include + #elif defined(USE_INTRIN_H) + #include ++#elif defined(USE_LSXINTRIN_H) ++#include + #else + #error no intrinsics file + #endif +diff --git a/src/util/simd_loongarch.h b/src/util/simd_loongarch.h +new file mode 100644 +index 0000000..b311ffb +--- /dev/null ++++ b/src/util/simd_loongarch.h +@@ -0,0 +1,956 @@ ++/* ++ * Copyright (c) 2015-2017, Intel Corporation ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * * Redistributions of source code must retain the above copyright notice, ++ * this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * * Neither the name of Intel Corporation nor the names of its contributors ++ * may be used to endorse or promote products derived from this software ++ * without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ++ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++ * POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++/** \file ++ * \brief SIMD types and primitive operations. ++ */ ++ ++#ifndef SIMD_LSX ++#define SIMD_LSX ++ ++#include "config.h" ++#include "ue2common.h" ++#include "simd_types.h" ++#include "unaligned.h" ++#include "util/arch.h" ++#include "util/intrinsics.h" ++#include ++ ++#include // for memcpy ++ ++// Define a common assume_aligned using an appropriate compiler built-in, if ++// it's available. Note that we need to handle C or C++ compilation. ++#ifdef __cplusplus ++#ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED ++#define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) ++#endif ++#else ++#ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED ++#define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) ++#endif ++#endif ++ ++// Fallback to identity case. ++#ifndef assume_aligned ++#define assume_aligned(x, y) (x) ++#endif ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++extern const char vbs_mask_data[]; ++#ifdef __cplusplus ++} ++#endif ++ ++static really_inline m128 ones128(void) { ++ /* gcc gets this right */ ++ return __lsx_vldi(0xFF); ++} ++ ++static really_inline m128 zeroes128(void) { ++ return __lsx_vldi(0); ++} ++ ++/** \brief Bitwise not for m128*/ ++static really_inline m128 not128(m128 a) { ++ return __lsx_vxor_v(a,ones128()); ++} ++ ++/** \brief Return 1 if a and b are different otherwise 0 */ ++static really_inline int diff128(m128 a, m128 b) { ++ return (__lsx_vpickve2gr_hu(__lsx_vmskltz_b(__lsx_vseq_b(a, b)), 0) ^ 0xffff); ++} ++ ++static really_inline int isnonzero128(m128 a) { ++ return !!diff128(a, zeroes128()); ++} ++ ++/** ++ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich128(m128 a, m128 b) { ++ a = __lsx_vseq_w(a, b); ++ return ~( __lsx_vpickve2gr_hu(__lsx_vmskltz_w(a),0)) & 0xf; ++} ++/** ++ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and ++ * returns a 4-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_128(m128 a, m128 b) { ++ u32 d = diffrich128(a, b); ++ return (d | (d >> 1)) & 0x5; ++} ++ ++static really_really_inline ++m128 lshift64_m128(m128 a, unsigned b) { ++ m128 tmp = __lsx_vinsgr2vr_w(zeroes128(), b, 0); ++ ++ m128 x = __lsx_vinsgr2vr_w(tmp, b, 2); ++ return __lsx_vsll_d(a, x); ++} ++ ++#define rshift64_m128(a, b) __lsx_vsrli_d((a), (b)) ++#define eq128(a, b) __lsx_vseq_b((a), (b)) ++#define movemask128(a) __lsx_vpickve2gr_hu(__lsx_vmskltz_b(a), 0) ++ ++static really_inline m128 set16x8(u8 c) { ++ return __lsx_vreplgr2vr_b(c); ++} ++ ++static really_inline m128 set4x32(u32 c) { ++ return __lsx_vreplgr2vr_w(c); ++} ++ ++static really_inline u32 movd(const m128 in) { ++ return __lsx_vpickve2gr_w(in, 0); ++} ++ ++static really_inline u64a movq(const m128 in) { ++ u32 lo = movd(in); ++ u32 hi = movd(__lsx_vsrli_d(in, 32)); ++ return (u64a)hi << 32 | lo; ++} ++ ++/* another form of movq */ ++static really_inline ++m128 load_m128_from_u64a(const u64a *p) { ++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0LL), *p, 0); ++} ++ ++#define L(a) __lsx_vand_v(a, __lsx_vinsgr2vr_d(__lsx_vldi(0), 0xFFFFFFFFFFFFFFFF, 0)) ++#define M(a) __lsx_vand_v(a, __lsx_vinsgr2vr_d(__lsx_vldi(0xFF), 0x0000000000000000, 0)) ++#define N(a) __lsx_vpickod_d(__lsx_vldi(0),a) ++#define U(a) __lsx_vpickev_d(a, __lsx_vldi(0)) ++#define rshiftbyte_m128(a, count_immed) \ ++ (((count_immed) < 8) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (8*count_immed)), __lsx_vor_v(__lsx_vsrli_d(L(a), (8*count_immed)), __lsx_vslli_d(N(a), (64-(8*count_immed)))))) : (__lsx_vsrli_d(N(a),((8*count_immed)-64)))) ++ ++#define lshiftbyte_m128(a, count_immed) \ ++ (((count_immed) < 8) ? (__lsx_vor_v(__lsx_vslli_d(L(a), (8*count_immed)), __lsx_vor_v(__lsx_vslli_d(M(a), (8*count_immed)), __lsx_vsrli_d(U(a), (64-(8*count_immed)))))) : (__lsx_vslli_d(U(a),((8*count_immed)-64)))) ++ ++#define extract32from128(a, imm) \ ++ (((imm) < 2) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (32*imm)), __lsx_vor_v(__lsx_vsrli_d(L(a), (32*imm)), __lsx_vslli_d(N(a), (64-(32*imm)))))) : (__lsx_vsrli_d(N(a),((32*imm)-64)))) ++#define extract64from128(a, imm) \ ++ (((imm) < 1) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (64*imm)), __lsx_vor_v(__lsx_vsrli_d(L(a), (64*imm)), __lsx_vslli_d(N(a), (64-(64*imm)))))) : (__lsx_vsrli_d(N(a),((64*imm)-64)))) ++ ++#define extractlow64from256(a) movq(a.lo) ++#define extractlow32from256(a) movd(a.lo) ++ ++static really_inline m128 and128(m128 a, m128 b) { ++ return __lsx_vand_v(a,b); ++} ++ ++static really_inline m128 xor128(m128 a, m128 b) { ++ return __lsx_vxor_v(a,b); ++} ++ ++static really_inline m128 or128(m128 a, m128 b) { ++ return __lsx_vor_v(a,b); ++} ++ ++static really_inline m128 andnot128(m128 a, m128 b) { ++ return __lsx_vandn_v(a,b); ++} ++ ++// aligned load ++static really_inline m128 load128(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ ptr = assume_aligned(ptr, 16); ++ return __lsx_vldx((const m128 *)ptr,0); ++} ++ ++// aligned store ++static really_inline void store128(void *ptr, m128 a) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ ptr = assume_aligned(ptr, 16); ++ *(m128 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m128 loadu128(const void *ptr) { ++ return __lsx_vldx((const m128 *)ptr,0); ++} ++ ++// unaligned store ++static really_inline void storeu128(void *ptr, m128 a) { ++ __lsx_vst(a,(m128 *)ptr,0); ++} ++ ++// packed unaligned store of first N bytes ++static really_inline ++void storebytes128(void *ptr, m128 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline ++m128 loadbytes128(const void *ptr, unsigned int n) { ++ m128 a = zeroes128(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++extern const u8 simd_onebit_masks[]; ++#ifdef __cplusplus ++} ++#endif ++ ++static really_inline ++m128 mask1bit128(unsigned int n) { ++ assert(n < sizeof(m128) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu128(&simd_onebit_masks[mask_idx]); ++} ++ ++// switches on bit N in the given vector. ++static really_inline ++void setbit128(m128 *ptr, unsigned int n) { ++ *ptr = or128(mask1bit128(n), *ptr); ++} ++ ++// switches off bit N in the given vector. ++static really_inline ++void clearbit128(m128 *ptr, unsigned int n) { ++ *ptr = andnot128(mask1bit128(n), *ptr); ++} ++ ++// tests bit N in the given vector. ++static really_inline ++char testbit128(m128 val, unsigned int n) { ++ const m128 mask = mask1bit128(n); ++ return isnonzero128(and128(mask, val)); ++} ++ ++#define palignr(r, l, offset) \ ++ (((offset) < 8) ? __lsx_vor_v(rshiftbyte_m128(l,(offset)),lshiftbyte_m128(U(r),(8-(offset)))) : __lsx_vor_v(rshiftbyte_m128(l,(offset)), lshiftbyte_m128(r,(16-(offset))))) ++ ++static really_inline ++m128 shuffle_epi8(m128 a, m128 b) { ++ m128 tmp1,tmp2,tmp3,dst; ++ tmp1 = ~(__lsx_vslt_b(b,__lsx_vldi(0))); ++ tmp2 = __lsx_vand_v(b,tmp1); ++ tmp3 = __lsx_vand_v(tmp2, __lsx_vldi(0x0F)); ++ unsigned char* p = (unsigned char*)&tmp3; ++ unsigned char* pa = (unsigned char*)&a; ++ for (int i = 0; i < 16; i++) { ++ unsigned char value = p[i]; ++ switch(i){ ++ case 0:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 0);}else{dst = tmp3;}break; ++ case 1:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 1);}else{dst = tmp3;}break; ++ case 2:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 2);}else{dst = tmp3;}break; ++ case 3:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 3);}else{dst = tmp3;}break; ++ case 4:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 4);}else{dst = tmp3;}break; ++ case 5:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 5);}else{dst = tmp3;}break; ++ case 6:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 6);}else{dst = tmp3;}break; ++ case 7:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 7);}else{dst = tmp3;}break; ++ case 8:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 8);}else{dst = tmp3;}break; ++ case 9:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 9);}else{dst = tmp3;}break; ++ case 10:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 10);}else{dst = tmp3;}break; ++ case 11:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 11);}else{dst = tmp3;}break; ++ case 12:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 12);}else{dst = tmp3;}break; ++ case 13:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 13);}else{dst = tmp3;}break; ++ case 14:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 14);}else{dst = tmp3;}break; ++ case 15:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 15);}else{dst = tmp3;}break; ++ default:break; ++ } ++ tmp3 = dst; ++ } ++ return dst; ++} ++ ++static really_inline ++m128 pshufb_m128(m128 a, m128 b) { ++ m128 result; ++ result = shuffle_epi8(a, b); ++ return result; ++} ++ ++static really_inline ++m256 pshufb_m256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = pshufb_m128(a.lo, b.lo); ++ rv.hi = pshufb_m128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline ++m128 variable_byte_shift_m128(m128 in, s32 amount) { ++ assert(amount >= -16 && amount <= 16); ++ m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); ++ return pshufb_m128(in, shift_mask); ++} ++ ++static really_inline ++m128 max_u8_m128(m128 a, m128 b) { ++ return __lsx_vmax_bu(a, b); ++} ++ ++static really_inline ++m128 min_u8_m128(m128 a, m128 b) { ++ return __lsx_vmin_bu(a, b); ++} ++ ++static really_inline ++m128 sadd_u8_m128(m128 a, m128 b) { ++ return __lsx_vsadd_bu(a, b); ++} ++ ++static really_inline ++m128 sub_u8_m128(m128 a, m128 b) { ++ return __lsx_vsub_b(a, b); ++} ++ ++static really_inline ++m128 set64x2(u64a hi, u64a lo) { ++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(hi),lo,0); ++} ++ ++/**** ++ **** 256-bit Primitives ++ ****/ ++ ++static really_really_inline ++m256 lshift64_m256(m256 a, int b) { ++ m256 rv = a; ++ rv.lo = lshift64_m128(rv.lo, b); ++ rv.hi = lshift64_m128(rv.hi, b); ++ return rv; ++} ++ ++static really_inline ++m256 rshift64_m256(m256 a, int b) { ++ m256 rv = a; ++ rv.lo = rshift64_m128(rv.lo, b); ++ rv.hi = rshift64_m128(rv.hi, b); ++ return rv; ++} ++static really_inline ++m256 set32x8(u32 in) { ++ m256 rv; ++ rv.lo = set16x8((u8) in); ++ rv.hi = rv.lo; ++ return rv; ++} ++ ++static really_inline ++m256 eq256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = eq128(a.lo, b.lo); ++ rv.hi = eq128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline ++u32 movemask256(m256 a) { ++ u32 lo_mask = movemask128(a.lo); ++ u32 hi_mask = movemask128(a.hi); ++ return lo_mask | (hi_mask << 16); ++} ++ ++static really_inline ++m256 set2x128(m128 a) { ++ m256 rv = {a, a}; ++ return rv; ++} ++ ++static really_inline m256 zeroes256(void) { ++ m256 rv = {zeroes128(), zeroes128()}; ++ return rv; ++} ++ ++static really_inline m256 ones256(void) { ++ m256 rv = {ones128(), ones128()}; ++ return rv; ++} ++ ++static really_inline m256 and256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = and128(a.lo, b.lo); ++ rv.hi = and128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 or256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = or128(a.lo, b.lo); ++ rv.hi = or128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 xor256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = xor128(a.lo, b.lo); ++ rv.hi = xor128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m256 not256(m256 a) { ++ m256 rv; ++ rv.lo = not128(a.lo); ++ rv.hi = not128(a.hi); ++ return rv; ++} ++ ++static really_inline m256 andnot256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = andnot128(a.lo, b.lo); ++ rv.hi = andnot128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline int diff256(m256 a, m256 b) { ++ return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero256(m256 a) { ++ return isnonzero128(or128(a.lo, a.hi)); ++} ++ ++/** ++ * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich256(m256 a, m256 b) { ++ m128 z = zeroes128(); ++ m128 tmp0,tmp1,tmp2,tmp3; ++ a.lo = __lsx_vseq_w(a.lo, b.lo); ++ a.hi = __lsx_vseq_w(a.hi, b.hi); ++ ++ tmp0 =__lsx_vsat_w(a.lo, 15); ++ tmp1 =__lsx_vsat_w(b.hi, 15); ++ tmp2 =__lsx_vsat_h(__lsx_vpickev_h(tmp1, tmp0), 7); ++ tmp3 =__lsx_vsat_h(z, 7); ++ m128 packed = __lsx_vpickev_b(tmp3, tmp2); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xff; ++} ++ ++/** ++ * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and ++ * returns an 8-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_256(m256 a, m256 b) { ++ u32 d = diffrich256(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m256 load256(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ m256 rv = {load128(ptr), load128((const char *)ptr + 16)}; ++ return rv; ++} ++ ++// aligned load of 128-bit value to low and high part of 256-bit value ++static really_inline m256 load2x128(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ m256 rv; ++ rv.hi = rv.lo = load128(ptr); ++ return rv; ++} ++ ++static really_inline m256 loadu2x128(const void *ptr) { ++ return set2x128(loadu128(ptr)); ++} ++ ++// aligned store ++static really_inline void store256(void *ptr, m256 a) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ ptr = assume_aligned(ptr, 16); ++ *(m256 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m256 loadu256(const void *ptr) { ++ m256 rv = {loadu128(ptr), loadu128((const char *)ptr + 16)}; ++ return rv; ++} ++ ++// unaligned store ++static really_inline void storeu256(void *ptr, m256 a) { ++ storeu128(ptr, a.lo); ++ storeu128((char *)ptr + 16, a.hi); ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes256(void *ptr, m256 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m256 loadbytes256(const void *ptr, unsigned int n) { ++ m256 a = zeroes256(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++static really_inline m256 mask1bit256(unsigned int n) { ++ assert(n < sizeof(m256) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu256(&simd_onebit_masks[mask_idx]); ++} ++ ++static really_inline m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { ++ m256 rv; ++ rv.hi = set64x2(hi_1, hi_0); ++ rv.lo = set64x2(lo_1, lo_0); ++ return rv; ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit256(m256 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 128; ++ } ++ setbit128(sub, n); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit256(m256 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 128; ++ } ++ clearbit128(sub, n); ++} ++ ++// tests bit N in the given vector. ++static really_inline char testbit256(m256 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo; ++ } else { ++ sub = val.hi; ++ n -= 128; ++ } ++ return testbit128(sub, n); ++} ++ ++static really_really_inline ++m128 movdq_hi(m256 x) { ++ return x.hi; ++} ++ ++static really_really_inline m128 movdq_lo(m256 x) { return x.lo;} ++ ++static really_inline m256 combine2x128(m128 hi, m128 lo) { ++ m256 rv = {lo, hi}; ++ return rv; ++} ++ ++/**** ++ **** 384-bit Primitives ++ ****/ ++ ++static really_inline m384 and384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = and128(a.lo, b.lo); ++ rv.mid = and128(a.mid, b.mid); ++ rv.hi = and128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m384 or384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = or128(a.lo, b.lo); ++ rv.mid = or128(a.mid, b.mid); ++ rv.hi = or128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m384 xor384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = xor128(a.lo, b.lo); ++ rv.mid = xor128(a.mid, b.mid); ++ rv.hi = xor128(a.hi, b.hi); ++ return rv; ++} ++static really_inline m384 not384(m384 a) { ++ m384 rv; ++ rv.lo = not128(a.lo); ++ rv.mid = not128(a.mid); ++ rv.hi = not128(a.hi); ++ return rv; ++} ++static really_inline m384 andnot384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = andnot128(a.lo, b.lo); ++ rv.mid = andnot128(a.mid, b.mid); ++ rv.hi = andnot128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_really_inline m384 lshift64_m384(m384 a, unsigned b) { ++ m384 rv; ++ rv.lo = lshift64_m128(a.lo, b); ++ rv.mid = lshift64_m128(a.mid, b); ++ rv.hi = lshift64_m128(a.hi, b); ++ return rv; ++} ++ ++static really_inline m384 zeroes384(void) { ++ m384 rv = {zeroes128(), zeroes128(), zeroes128()}; ++ return rv; ++} ++ ++static really_inline m384 ones384(void) { ++ m384 rv = {ones128(), ones128(), ones128()}; ++ return rv; ++} ++ ++static really_inline int diff384(m384 a, m384 b) { ++ return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero384(m384 a) { ++ return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); ++} ++ ++/** ++ * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich384(m384 a, m384 b) { ++ m128 z = zeroes128(); ++ m128 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5; ++ a.lo = __lsx_vseq_w(a.lo, b.lo); ++ a.mid = __lsx_vseq_w(a.mid, b.mid); ++ a.hi = __lsx_vseq_w(a.hi, b.hi); ++ ++ tmp0 = __lsx_vsat_w(a.lo, 15); ++ tmp1 = __lsx_vsat_w(b.mid, 15); ++ ++ tmp2 = __lsx_vsat_w(b.hi, 15); ++ tmp3 = __lsx_vsat_w(z, 15); ++ ++ tmp4 = __lsx_vsat_h(__lsx_vpickev_h(tmp1, tmp0),7); ++ tmp5 = __lsx_vsat_h(__lsx_vpickev_h(tmp3, tmp2),7); ++ ++ m128 packed = __lsx_vpickev_b(tmp5,tmp4); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xfff; ++} ++ ++/** ++ * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and ++ * returns a 12-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_384(m384 a, m384 b) { ++ u32 d = diffrich384(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m384 load384(const void *ptr) { ++ assert(ISALIGNED_16(ptr)); ++ m384 rv = {load128(ptr), load128((const char *)ptr + 16), ++ load128((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// aligned store ++static really_inline void store384(void *ptr, m384 a) { ++ assert(ISALIGNED_16(ptr)); ++ ptr = assume_aligned(ptr, 16); ++ *(m384 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m384 loadu384(const void *ptr) { ++ m384 rv = {loadu128(ptr), loadu128((const char *)ptr + 16), ++ loadu128((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes384(void *ptr, m384 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m384 loadbytes384(const void *ptr, unsigned int n) { ++ m384 a = zeroes384(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit384(m384 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else if (n < 256) { ++ sub = &ptr->mid; ++ } else { ++ sub = &ptr->hi; ++ } ++ setbit128(sub, n % 128); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit384(m384 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else if (n < 256) { ++ sub = &ptr->mid; ++ } else { ++ sub = &ptr->hi; ++ } ++ clearbit128(sub, n % 128); ++} ++ ++// tests bit N in the given vector. ++static really_inline char testbit384(m384 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo; ++ } else if (n < 256) { ++ sub = val.mid; ++ } else { ++ sub = val.hi; ++ } ++ return testbit128(sub, n % 128); ++} ++ ++/**** ++ **** 512-bit Primitives ++ ****/ ++ ++static really_inline m512 zeroes512(void) { ++ m512 rv = {zeroes256(), zeroes256()}; ++ return rv; ++} ++ ++static really_inline m512 ones512(void) { ++ m512 rv = {ones256(), ones256()}; ++ return rv; ++} ++ ++static really_inline m512 and512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = and256(a.lo, b.lo); ++ rv.hi = and256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 or512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = or256(a.lo, b.lo); ++ rv.hi = or256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 xor512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = xor256(a.lo, b.lo); ++ rv.hi = xor256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m512 not512(m512 a) { ++ m512 rv; ++ rv.lo = not256(a.lo); ++ rv.hi = not256(a.hi); ++ return rv; ++} ++ ++static really_inline m512 andnot512(m512 a, m512 b) { ++ m512 rv; ++ rv.lo = andnot256(a.lo, b.lo); ++ rv.hi = andnot256(a.hi, b.hi); ++ return rv; ++} ++ ++static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { ++ m512 rv; ++ rv.lo = lshift64_m256(a.lo, b); ++ rv.hi = lshift64_m256(a.hi, b); ++ return rv; ++} ++ ++static really_inline int diff512(m512 a, m512 b) { ++ return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero512(m512 a) { ++ m128 x = or128(a.lo.lo, a.lo.hi); ++ m128 y = or128(a.hi.lo, a.hi.hi); ++ return isnonzero128(or128(x, y)); ++} ++ ++/** ++ * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich512(m512 a, m512 b) { ++ m128 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7; ++ a.lo.lo = __lsx_vseq_w(a.lo.lo, b.lo.lo); ++ a.lo.hi = __lsx_vseq_w(a.lo.hi, b.lo.hi); ++ a.hi.lo = __lsx_vseq_w(a.hi.lo, b.hi.lo); ++ a.hi.hi = __lsx_vseq_w(a.hi.hi, b.hi.hi); ++ ++ tmp0 =__lsx_vsat_w(a.lo.lo, 15); ++ tmp1 =__lsx_vsat_w(a.lo.hi, 15); ++ tmp2 =__lsx_vpickev_h(tmp1, tmp0); ++ ++ tmp3 =__lsx_vsat_w(a.hi.lo, 15); ++ tmp4 =__lsx_vsat_w(a.hi.hi, 15); ++ tmp5 =__lsx_vpickev_h(tmp4, tmp3); ++ ++ tmp6 =__lsx_vsat_h(tmp2, 7); ++ tmp7 =__lsx_vsat_h(tmp5, 7); ++ m128 packed = __lsx_vpickev_b(tmp7, tmp6); ++ ++ return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xffff; // ok ++} ++ ++/** ++ * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and ++ * returns a 16-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_512(m512 a, m512 b) { ++ u32 d = diffrich512(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m512 load512(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ m512 rv = {load256(ptr), load256((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// aligned store ++static really_inline void store512(void *ptr, m512 a) { ++ assert(ISALIGNED_N(ptr, alignof(m512))); ++ ptr = assume_aligned(ptr, 16); ++ *(m512 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m512 loadu512(const void *ptr) { ++ m512 rv = {loadu256(ptr), loadu256((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// packed unaligned store of first N bytes ++static really_inline void storebytes512(void *ptr, m512 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline m512 loadbytes512(const void *ptr, unsigned int n) { ++ m512 a = zeroes512(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++static really_inline m512 mask1bit512(unsigned int n) { ++ assert(n < sizeof(m512) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu512(&simd_onebit_masks[mask_idx]); ++} ++ ++// switches on bit N in the given vector. ++static really_inline void setbit512(m512 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo.lo; ++ } else if (n < 256) { ++ sub = &ptr->lo.hi; ++ } else if (n < 384) { ++ sub = &ptr->hi.lo; ++ } else { ++ sub = &ptr->hi.hi; ++ } ++ setbit128(sub, n % 128); ++} ++ ++// switches off bit N in the given vector. ++static really_inline void clearbit512(m512 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo.lo; ++ } else if (n < 256) { ++ sub = &ptr->lo.hi; ++ } else if (n < 384) { ++ sub = &ptr->hi.lo; ++ } else { ++ sub = &ptr->hi.hi; ++ } ++ clearbit128(sub, n % 128); ++} ++ ++// tests bit N in the given vector. ++static really_inline char testbit512(m512 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo.lo; ++ } else if (n < 256) { ++ sub = val.lo.hi; ++ } else if (n < 384) { ++ sub = val.hi.lo; ++ } else { ++ sub = val.hi.hi; ++ } ++ return testbit128(sub, n % 128); ++} ++ ++#endif +diff --git a/src/util/simd_types.h b/src/util/simd_types.h +index 962cad6..3831423 100644 +--- a/src/util/simd_types.h ++++ b/src/util/simd_types.h +@@ -33,9 +33,12 @@ + #include "util/arch.h" + #include "util/intrinsics.h" + #include "ue2common.h" ++#include + + #if defined(HAVE_SSE2) + typedef __m128i m128; ++#elif defined(ARCH_LOONGARCH64) ++typedef __m128i m128; + #else + typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; + #endif +diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h +index 5fa727e..86760a4 100644 +--- a/src/util/simd_utils.h ++++ b/src/util/simd_utils.h +@@ -33,1388 +33,10 @@ + #ifndef SIMD_UTILS + #define SIMD_UTILS + +-#if !defined(_WIN32) && !defined(__SSSE3__) +-#error SSSE3 instructions must be enabled ++#if defined(__x86_64__) ++#include "simd_x86.h" ++#elif defined(__loongarch64) ++#include "simd_loongarch.h" + #endif + +-#include "config.h" +-#include "ue2common.h" +-#include "simd_types.h" +-#include "unaligned.h" +-#include "util/arch.h" +-#include "util/intrinsics.h" +- +-#include // for memcpy +- +-// Define a common assume_aligned using an appropriate compiler built-in, if +-// it's available. Note that we need to handle C or C++ compilation. +-#ifdef __cplusplus +-# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED +-# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +-# endif +-#else +-# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED +-# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +-# endif +-#endif +- +-// Fallback to identity case. +-#ifndef assume_aligned +-#define assume_aligned(x, y) (x) +-#endif +- +-#ifdef __cplusplus +-extern "C" { +-#endif +-extern const char vbs_mask_data[]; +-#ifdef __cplusplus +-} +-#endif +- +-static really_inline m128 ones128(void) { +-#if defined(__GNUC__) || defined(__INTEL_COMPILER) +- /* gcc gets this right */ +- return _mm_set1_epi8(0xFF); +-#else +- /* trick from Intel's optimization guide to generate all-ones. +- * ICC converts this to the single cmpeq instruction */ +- return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); +-#endif +-} +- +-static really_inline m128 zeroes128(void) { +- return _mm_setzero_si128(); +-} +- +-/** \brief Bitwise not for m128*/ +-static really_inline m128 not128(m128 a) { +- return _mm_xor_si128(a, ones128()); +-} +- +-/** \brief Return 1 if a and b are different otherwise 0 */ +-static really_inline int diff128(m128 a, m128 b) { +- return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); +-} +- +-static really_inline int isnonzero128(m128 a) { +- return !!diff128(a, zeroes128()); +-} +- +-/** +- * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit +- * mask indicating which 32-bit words contain differences. +- */ +-static really_inline u32 diffrich128(m128 a, m128 b) { +- a = _mm_cmpeq_epi32(a, b); +- return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; +-} +- +-/** +- * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and +- * returns a 4-bit mask indicating which 64-bit words contain differences. +- */ +-static really_inline u32 diffrich64_128(m128 a, m128 b) { +-#if defined(HAVE_SSE41) +- a = _mm_cmpeq_epi64(a, b); +- return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +-#else +- u32 d = diffrich128(a, b); +- return (d | (d >> 1)) & 0x5; +-#endif +-} +- +-static really_really_inline +-m128 lshift64_m128(m128 a, unsigned b) { +-#if defined(HAVE__BUILTIN_CONSTANT_P) +- if (__builtin_constant_p(b)) { +- return _mm_slli_epi64(a, b); +- } +-#endif +- m128 x = _mm_cvtsi32_si128(b); +- return _mm_sll_epi64(a, x); +-} +- +-#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +-#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +-#define movemask128(a) ((u32)_mm_movemask_epi8((a))) +- +-#if defined(HAVE_AVX512) +-static really_inline m128 cast512to128(const m512 in) { +- return _mm512_castsi512_si128(in); +-} +-#endif +- +-static really_inline m128 set16x8(u8 c) { +- return _mm_set1_epi8(c); +-} +- +-static really_inline m128 set4x32(u32 c) { +- return _mm_set1_epi32(c); +-} +- +-static really_inline u32 movd(const m128 in) { +- return _mm_cvtsi128_si32(in); +-} +- +-static really_inline u64a movq(const m128 in) { +-#if defined(ARCH_X86_64) +- return _mm_cvtsi128_si64(in); +-#else // 32-bit - this is horrific +- u32 lo = movd(in); +- u32 hi = movd(_mm_srli_epi64(in, 32)); +- return (u64a)hi << 32 | lo; +-#endif +-} +- +-#if defined(HAVE_AVX512) +-static really_inline u32 movd512(const m512 in) { +- // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), +- // so we use 2-step convertions to work around. +- return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +-} +- +-static really_inline u64a movq512(const m512 in) { +- // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), +- // so we use 2-step convertions to work around. +- return movq(_mm512_castsi512_si128(in)); +-} +-#endif +- +-/* another form of movq */ +-static really_inline +-m128 load_m128_from_u64a(const u64a *p) { +- return _mm_set_epi64x(0LL, *p); +-} +- +-#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +-#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) +- +-#if defined(HAVE_SSE41) +-#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +-#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +-#else +-#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +-#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) +-#endif +- +-#if !defined(HAVE_AVX2) +-// TODO: this entire file needs restructuring - this carveout is awful +-#define extractlow64from256(a) movq(a.lo) +-#define extractlow32from256(a) movd(a.lo) +-#if defined(HAVE_SSE41) +-#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) +-#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) +-#else +-#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +-#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) +-#endif +- +-#endif // !AVX2 +- +-static really_inline m128 and128(m128 a, m128 b) { +- return _mm_and_si128(a,b); +-} +- +-static really_inline m128 xor128(m128 a, m128 b) { +- return _mm_xor_si128(a,b); +-} +- +-static really_inline m128 or128(m128 a, m128 b) { +- return _mm_or_si128(a,b); +-} +- +-#if defined(HAVE_AVX512VBMI) +-static really_inline m512 expand128(m128 a) { +- return _mm512_broadcast_i32x4(a); +-} +- +-static really_inline m512 expand256(m256 a) { +- return _mm512_broadcast_i64x4(a); +-} +- +-static really_inline m512 expand384(m384 a) { +- u64a *lo = (u64a*)&a.lo; +- u64a *mid = (u64a*)&a.mid; +- u64a *hi = (u64a*)&a.hi; +- return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], +- lo[1], lo[0]); +-} +-#endif +- +-static really_inline m128 andnot128(m128 a, m128 b) { +- return _mm_andnot_si128(a, b); +-} +- +-// aligned load +-static really_inline m128 load128(const void *ptr) { +- assert(ISALIGNED_N(ptr, alignof(m128))); +- ptr = assume_aligned(ptr, 16); +- return _mm_load_si128((const m128 *)ptr); +-} +- +-// aligned store +-static really_inline void store128(void *ptr, m128 a) { +- assert(ISALIGNED_N(ptr, alignof(m128))); +- ptr = assume_aligned(ptr, 16); +- *(m128 *)ptr = a; +-} +- +-// unaligned load +-static really_inline m128 loadu128(const void *ptr) { +- return _mm_loadu_si128((const m128 *)ptr); +-} +- +-// unaligned store +-static really_inline void storeu128(void *ptr, m128 a) { +- _mm_storeu_si128 ((m128 *)ptr, a); +-} +- +-// packed unaligned store of first N bytes +-static really_inline +-void storebytes128(void *ptr, m128 a, unsigned int n) { +- assert(n <= sizeof(a)); +- memcpy(ptr, &a, n); +-} +- +-// packed unaligned load of first N bytes, pad with zero +-static really_inline +-m128 loadbytes128(const void *ptr, unsigned int n) { +- m128 a = zeroes128(); +- assert(n <= sizeof(a)); +- memcpy(&a, ptr, n); +- return a; +-} +- +-#ifdef __cplusplus +-extern "C" { +-#endif +-extern const u8 simd_onebit_masks[]; +-#ifdef __cplusplus +-} +-#endif +- +-static really_inline +-m128 mask1bit128(unsigned int n) { +- assert(n < sizeof(m128) * 8); +- u32 mask_idx = ((n % 8) * 64) + 95; +- mask_idx -= n / 8; +- return loadu128(&simd_onebit_masks[mask_idx]); +-} +- +-// switches on bit N in the given vector. +-static really_inline +-void setbit128(m128 *ptr, unsigned int n) { +- *ptr = or128(mask1bit128(n), *ptr); +-} +- +-// switches off bit N in the given vector. +-static really_inline +-void clearbit128(m128 *ptr, unsigned int n) { +- *ptr = andnot128(mask1bit128(n), *ptr); +-} +- +-// tests bit N in the given vector. +-static really_inline +-char testbit128(m128 val, unsigned int n) { +- const m128 mask = mask1bit128(n); +-#if defined(HAVE_SSE41) +- return !_mm_testz_si128(mask, val); +-#else +- return isnonzero128(and128(mask, val)); +-#endif +-} +- +-// offset must be an immediate +-#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) +- +-static really_inline +-m128 pshufb_m128(m128 a, m128 b) { +- m128 result; +- result = _mm_shuffle_epi8(a, b); +- return result; +-} +- +-static really_inline +-m256 pshufb_m256(m256 a, m256 b) { +-#if defined(HAVE_AVX2) +- return _mm256_shuffle_epi8(a, b); +-#else +- m256 rv; +- rv.lo = pshufb_m128(a.lo, b.lo); +- rv.hi = pshufb_m128(a.hi, b.hi); +- return rv; +-#endif +-} +- +-#if defined(HAVE_AVX512) +-static really_inline +-m512 pshufb_m512(m512 a, m512 b) { +- return _mm512_shuffle_epi8(a, b); +-} +- +-static really_inline +-m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { +- return _mm512_maskz_shuffle_epi8(k, a, b); +-} +- +-#if defined(HAVE_AVX512VBMI) +-#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +-#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +-#endif +- +-#endif +- +-static really_inline +-m128 variable_byte_shift_m128(m128 in, s32 amount) { +- assert(amount >= -16 && amount <= 16); +- m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); +- return pshufb_m128(in, shift_mask); +-} +- +-static really_inline +-m128 max_u8_m128(m128 a, m128 b) { +- return _mm_max_epu8(a, b); +-} +- +-static really_inline +-m128 min_u8_m128(m128 a, m128 b) { +- return _mm_min_epu8(a, b); +-} +- +-static really_inline +-m128 sadd_u8_m128(m128 a, m128 b) { +- return _mm_adds_epu8(a, b); +-} +- +-static really_inline +-m128 sub_u8_m128(m128 a, m128 b) { +- return _mm_sub_epi8(a, b); +-} +- +-static really_inline +-m128 set64x2(u64a hi, u64a lo) { +- return _mm_set_epi64x(hi, lo); +-} +- +-/**** +- **** 256-bit Primitives +- ****/ +- +-#if defined(HAVE_AVX2) +- +-static really_really_inline +-m256 lshift64_m256(m256 a, unsigned b) { +-#if defined(HAVE__BUILTIN_CONSTANT_P) +- if (__builtin_constant_p(b)) { +- return _mm256_slli_epi64(a, b); +- } +-#endif +- m128 x = _mm_cvtsi32_si128(b); +- return _mm256_sll_epi64(a, x); +-} +- +-#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) +- +-static really_inline +-m256 set32x8(u32 in) { +- return _mm256_set1_epi8(in); +-} +- +-#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) +-#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) +- +-static really_inline +-m256 set2x128(m128 a) { +- return _mm256_broadcastsi128_si256(a); +-} +- +-#else +- +-static really_really_inline +-m256 lshift64_m256(m256 a, int b) { +- m256 rv = a; +- rv.lo = lshift64_m128(rv.lo, b); +- rv.hi = lshift64_m128(rv.hi, b); +- return rv; +-} +- +-static really_inline +-m256 rshift64_m256(m256 a, int b) { +- m256 rv = a; +- rv.lo = rshift64_m128(rv.lo, b); +- rv.hi = rshift64_m128(rv.hi, b); +- return rv; +-} +-static really_inline +-m256 set32x8(u32 in) { +- m256 rv; +- rv.lo = set16x8((u8) in); +- rv.hi = rv.lo; +- return rv; +-} +- +-static really_inline +-m256 eq256(m256 a, m256 b) { +- m256 rv; +- rv.lo = eq128(a.lo, b.lo); +- rv.hi = eq128(a.hi, b.hi); +- return rv; +-} +- +-static really_inline +-u32 movemask256(m256 a) { +- u32 lo_mask = movemask128(a.lo); +- u32 hi_mask = movemask128(a.hi); +- return lo_mask | (hi_mask << 16); +-} +- +-static really_inline +-m256 set2x128(m128 a) { +- m256 rv = {a, a}; +- return rv; +-} +-#endif +- +-static really_inline m256 zeroes256(void) { +-#if defined(HAVE_AVX2) +- return _mm256_setzero_si256(); +-#else +- m256 rv = {zeroes128(), zeroes128()}; +- return rv; +-#endif +-} +- +-static really_inline m256 ones256(void) { +-#if defined(HAVE_AVX2) +- m256 rv = _mm256_set1_epi8(0xFF); +-#else +- m256 rv = {ones128(), ones128()}; +-#endif +- return rv; +-} +- +-#if defined(HAVE_AVX2) +-static really_inline m256 and256(m256 a, m256 b) { +- return _mm256_and_si256(a, b); +-} +-#else +-static really_inline m256 and256(m256 a, m256 b) { +- m256 rv; +- rv.lo = and128(a.lo, b.lo); +- rv.hi = and128(a.hi, b.hi); +- return rv; +-} +-#endif +- +-#if defined(HAVE_AVX2) +-static really_inline m256 or256(m256 a, m256 b) { +- return _mm256_or_si256(a, b); +-} +-#else +-static really_inline m256 or256(m256 a, m256 b) { +- m256 rv; +- rv.lo = or128(a.lo, b.lo); +- rv.hi = or128(a.hi, b.hi); +- return rv; +-} +-#endif +- +-#if defined(HAVE_AVX2) +-static really_inline m256 xor256(m256 a, m256 b) { +- return _mm256_xor_si256(a, b); +-} +-#else +-static really_inline m256 xor256(m256 a, m256 b) { +- m256 rv; +- rv.lo = xor128(a.lo, b.lo); +- rv.hi = xor128(a.hi, b.hi); +- return rv; +-} +-#endif +- +-#if defined(HAVE_AVX2) +-static really_inline m256 not256(m256 a) { +- return _mm256_xor_si256(a, ones256()); +-} +-#else +-static really_inline m256 not256(m256 a) { +- m256 rv; +- rv.lo = not128(a.lo); +- rv.hi = not128(a.hi); +- return rv; +-} +-#endif +- +-#if defined(HAVE_AVX2) +-static really_inline m256 andnot256(m256 a, m256 b) { +- return _mm256_andnot_si256(a, b); +-} +-#else +-static really_inline m256 andnot256(m256 a, m256 b) { +- m256 rv; +- rv.lo = andnot128(a.lo, b.lo); +- rv.hi = andnot128(a.hi, b.hi); +- return rv; +-} +-#endif +- +-static really_inline int diff256(m256 a, m256 b) { +-#if defined(HAVE_AVX2) +- return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); +-#else +- return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +-#endif +-} +- +-static really_inline int isnonzero256(m256 a) { +-#if defined(HAVE_AVX2) +- return !!diff256(a, zeroes256()); +-#else +- return isnonzero128(or128(a.lo, a.hi)); +-#endif +-} +- +-/** +- * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit +- * mask indicating which 32-bit words contain differences. +- */ +-static really_inline u32 diffrich256(m256 a, m256 b) { +-#if defined(HAVE_AVX2) +- a = _mm256_cmpeq_epi32(a, b); +- return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; +-#else +- m128 z = zeroes128(); +- a.lo = _mm_cmpeq_epi32(a.lo, b.lo); +- a.hi = _mm_cmpeq_epi32(a.hi, b.hi); +- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); +- return ~(_mm_movemask_epi8(packed)) & 0xff; +-#endif +-} +- +-/** +- * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and +- * returns an 8-bit mask indicating which 64-bit words contain differences. +- */ +-static really_inline u32 diffrich64_256(m256 a, m256 b) { +- u32 d = diffrich256(a, b); +- return (d | (d >> 1)) & 0x55555555; +-} +- +-// aligned load +-static really_inline m256 load256(const void *ptr) { +- assert(ISALIGNED_N(ptr, alignof(m256))); +-#if defined(HAVE_AVX2) +- return _mm256_load_si256((const m256 *)ptr); +-#else +- m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; +- return rv; +-#endif +-} +- +-// aligned load of 128-bit value to low and high part of 256-bit value +-static really_inline m256 load2x128(const void *ptr) { +-#if defined(HAVE_AVX2) +- return set2x128(load128(ptr)); +-#else +- assert(ISALIGNED_N(ptr, alignof(m128))); +- m256 rv; +- rv.hi = rv.lo = load128(ptr); +- return rv; +-#endif +-} +- +-static really_inline m256 loadu2x128(const void *ptr) { +- return set2x128(loadu128(ptr)); +-} +- +-// aligned store +-static really_inline void store256(void *ptr, m256 a) { +- assert(ISALIGNED_N(ptr, alignof(m256))); +-#if defined(HAVE_AVX2) +- _mm256_store_si256((m256 *)ptr, a); +-#else +- ptr = assume_aligned(ptr, 16); +- *(m256 *)ptr = a; +-#endif +-} +- +-// unaligned load +-static really_inline m256 loadu256(const void *ptr) { +-#if defined(HAVE_AVX2) +- return _mm256_loadu_si256((const m256 *)ptr); +-#else +- m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; +- return rv; +-#endif +-} +- +-// unaligned store +-static really_inline void storeu256(void *ptr, m256 a) { +-#if defined(HAVE_AVX2) +- _mm256_storeu_si256((m256 *)ptr, a); +-#else +- storeu128(ptr, a.lo); +- storeu128((char *)ptr + 16, a.hi); +-#endif +-} +- +-// packed unaligned store of first N bytes +-static really_inline +-void storebytes256(void *ptr, m256 a, unsigned int n) { +- assert(n <= sizeof(a)); +- memcpy(ptr, &a, n); +-} +- +-// packed unaligned load of first N bytes, pad with zero +-static really_inline +-m256 loadbytes256(const void *ptr, unsigned int n) { +- m256 a = zeroes256(); +- assert(n <= sizeof(a)); +- memcpy(&a, ptr, n); +- return a; +-} +- +-static really_inline +-m256 mask1bit256(unsigned int n) { +- assert(n < sizeof(m256) * 8); +- u32 mask_idx = ((n % 8) * 64) + 95; +- mask_idx -= n / 8; +- return loadu256(&simd_onebit_masks[mask_idx]); +-} +- +-static really_inline +-m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +-#if defined(HAVE_AVX2) +- return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +-#else +- m256 rv; +- rv.hi = set64x2(hi_1, hi_0); +- rv.lo = set64x2(lo_1, lo_0); +- return rv; +-#endif +-} +- +-#if !defined(HAVE_AVX2) +-// switches on bit N in the given vector. +-static really_inline +-void setbit256(m256 *ptr, unsigned int n) { +- assert(n < sizeof(*ptr) * 8); +- m128 *sub; +- if (n < 128) { +- sub = &ptr->lo; +- } else { +- sub = &ptr->hi; +- n -= 128; +- } +- setbit128(sub, n); +-} +- +-// switches off bit N in the given vector. +-static really_inline +-void clearbit256(m256 *ptr, unsigned int n) { +- assert(n < sizeof(*ptr) * 8); +- m128 *sub; +- if (n < 128) { +- sub = &ptr->lo; +- } else { +- sub = &ptr->hi; +- n -= 128; +- } +- clearbit128(sub, n); +-} +- +-// tests bit N in the given vector. +-static really_inline +-char testbit256(m256 val, unsigned int n) { +- assert(n < sizeof(val) * 8); +- m128 sub; +- if (n < 128) { +- sub = val.lo; +- } else { +- sub = val.hi; +- n -= 128; +- } +- return testbit128(sub, n); +-} +- +-static really_really_inline +-m128 movdq_hi(m256 x) { +- return x.hi; +-} +- +-static really_really_inline +-m128 movdq_lo(m256 x) { +- return x.lo; +-} +- +-static really_inline +-m256 combine2x128(m128 hi, m128 lo) { +- m256 rv = {lo, hi}; +- return rv; +-} +- +-#else // AVX2 +- +-// switches on bit N in the given vector. +-static really_inline +-void setbit256(m256 *ptr, unsigned int n) { +- *ptr = or256(mask1bit256(n), *ptr); +-} +- +-static really_inline +-void clearbit256(m256 *ptr, unsigned int n) { +- *ptr = andnot256(mask1bit256(n), *ptr); +-} +- +-// tests bit N in the given vector. +-static really_inline +-char testbit256(m256 val, unsigned int n) { +- const m256 mask = mask1bit256(n); +- return !_mm256_testz_si256(mask, val); +-} +- +-static really_really_inline +-m128 movdq_hi(m256 x) { +- return _mm256_extracti128_si256(x, 1); +-} +- +-static really_really_inline +-m128 movdq_lo(m256 x) { +- return _mm256_extracti128_si256(x, 0); +-} +- +-#define cast256to128(a) _mm256_castsi256_si128(a) +-#define cast128to256(a) _mm256_castsi128_si256(a) +-#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +-#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +-#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +-#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +-#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +-#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +-#define extractlow64from256(a) movq(cast256to128(a)) +-#define extractlow32from256(a) movd(cast256to128(a)) +-#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +-#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +-#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) +- +-static really_inline +-m256 combine2x128(m128 hi, m128 lo) { +-#if defined(_mm256_set_m128i) +- return _mm256_set_m128i(hi, lo); +-#else +- return insert128to256(cast128to256(lo), hi, 1); +-#endif +-} +-#endif //AVX2 +- +-#if defined(HAVE_AVX512) +-#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +-#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +-#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +-#define set2x256(a) _mm512_broadcast_i64x4(a) +-#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +-#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) +-#endif +- +-/**** +- **** 384-bit Primitives +- ****/ +- +-static really_inline m384 and384(m384 a, m384 b) { +- m384 rv; +- rv.lo = and128(a.lo, b.lo); +- rv.mid = and128(a.mid, b.mid); +- rv.hi = and128(a.hi, b.hi); +- return rv; +-} +- +-static really_inline m384 or384(m384 a, m384 b) { +- m384 rv; +- rv.lo = or128(a.lo, b.lo); +- rv.mid = or128(a.mid, b.mid); +- rv.hi = or128(a.hi, b.hi); +- return rv; +-} +- +-static really_inline m384 xor384(m384 a, m384 b) { +- m384 rv; +- rv.lo = xor128(a.lo, b.lo); +- rv.mid = xor128(a.mid, b.mid); +- rv.hi = xor128(a.hi, b.hi); +- return rv; +-} +-static really_inline m384 not384(m384 a) { +- m384 rv; +- rv.lo = not128(a.lo); +- rv.mid = not128(a.mid); +- rv.hi = not128(a.hi); +- return rv; +-} +-static really_inline m384 andnot384(m384 a, m384 b) { +- m384 rv; +- rv.lo = andnot128(a.lo, b.lo); +- rv.mid = andnot128(a.mid, b.mid); +- rv.hi = andnot128(a.hi, b.hi); +- return rv; +-} +- +-static really_really_inline +-m384 lshift64_m384(m384 a, unsigned b) { +- m384 rv; +- rv.lo = lshift64_m128(a.lo, b); +- rv.mid = lshift64_m128(a.mid, b); +- rv.hi = lshift64_m128(a.hi, b); +- return rv; +-} +- +-static really_inline m384 zeroes384(void) { +- m384 rv = {zeroes128(), zeroes128(), zeroes128()}; +- return rv; +-} +- +-static really_inline m384 ones384(void) { +- m384 rv = {ones128(), ones128(), ones128()}; +- return rv; +-} +- +-static really_inline int diff384(m384 a, m384 b) { +- return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); +-} +- +-static really_inline int isnonzero384(m384 a) { +- return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +-} +- +-/** +- * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit +- * mask indicating which 32-bit words contain differences. +- */ +-static really_inline u32 diffrich384(m384 a, m384 b) { +- m128 z = zeroes128(); +- a.lo = _mm_cmpeq_epi32(a.lo, b.lo); +- a.mid = _mm_cmpeq_epi32(a.mid, b.mid); +- a.hi = _mm_cmpeq_epi32(a.hi, b.hi); +- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), +- _mm_packs_epi32(a.hi, z)); +- return ~(_mm_movemask_epi8(packed)) & 0xfff; +-} +- +-/** +- * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and +- * returns a 12-bit mask indicating which 64-bit words contain differences. +- */ +-static really_inline u32 diffrich64_384(m384 a, m384 b) { +- u32 d = diffrich384(a, b); +- return (d | (d >> 1)) & 0x55555555; +-} +- +-// aligned load +-static really_inline m384 load384(const void *ptr) { +- assert(ISALIGNED_16(ptr)); +- m384 rv = { load128(ptr), load128((const char *)ptr + 16), +- load128((const char *)ptr + 32) }; +- return rv; +-} +- +-// aligned store +-static really_inline void store384(void *ptr, m384 a) { +- assert(ISALIGNED_16(ptr)); +- ptr = assume_aligned(ptr, 16); +- *(m384 *)ptr = a; +-} +- +-// unaligned load +-static really_inline m384 loadu384(const void *ptr) { +- m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), +- loadu128((const char *)ptr + 32)}; +- return rv; +-} +- +-// packed unaligned store of first N bytes +-static really_inline +-void storebytes384(void *ptr, m384 a, unsigned int n) { +- assert(n <= sizeof(a)); +- memcpy(ptr, &a, n); +-} +- +-// packed unaligned load of first N bytes, pad with zero +-static really_inline +-m384 loadbytes384(const void *ptr, unsigned int n) { +- m384 a = zeroes384(); +- assert(n <= sizeof(a)); +- memcpy(&a, ptr, n); +- return a; +-} +- +-// switches on bit N in the given vector. +-static really_inline +-void setbit384(m384 *ptr, unsigned int n) { +- assert(n < sizeof(*ptr) * 8); +- m128 *sub; +- if (n < 128) { +- sub = &ptr->lo; +- } else if (n < 256) { +- sub = &ptr->mid; +- } else { +- sub = &ptr->hi; +- } +- setbit128(sub, n % 128); +-} +- +-// switches off bit N in the given vector. +-static really_inline +-void clearbit384(m384 *ptr, unsigned int n) { +- assert(n < sizeof(*ptr) * 8); +- m128 *sub; +- if (n < 128) { +- sub = &ptr->lo; +- } else if (n < 256) { +- sub = &ptr->mid; +- } else { +- sub = &ptr->hi; +- } +- clearbit128(sub, n % 128); +-} +- +-// tests bit N in the given vector. +-static really_inline +-char testbit384(m384 val, unsigned int n) { +- assert(n < sizeof(val) * 8); +- m128 sub; +- if (n < 128) { +- sub = val.lo; +- } else if (n < 256) { +- sub = val.mid; +- } else { +- sub = val.hi; +- } +- return testbit128(sub, n % 128); +-} +- +-/**** +- **** 512-bit Primitives +- ****/ +- +-#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +-#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) +- +-static really_inline +-m512 zeroes512(void) { +-#if defined(HAVE_AVX512) +- return _mm512_setzero_si512(); +-#else +- m512 rv = {zeroes256(), zeroes256()}; +- return rv; +-#endif +-} +- +-static really_inline +-m512 ones512(void) { +-#if defined(HAVE_AVX512) +- return _mm512_set1_epi8(0xFF); +- //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +-#else +- m512 rv = {ones256(), ones256()}; +- return rv; +-#endif +-} +- +-#if defined(HAVE_AVX512) +-static really_inline +-m512 set64x8(u8 a) { +- return _mm512_set1_epi8(a); +-} +- +-static really_inline +-m512 set8x64(u64a a) { +- return _mm512_set1_epi64(a); +-} +- +-static really_inline +-m512 set16x32(u32 a) { +- return _mm512_set1_epi32(a); +-} +- +-static really_inline +-m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, +- u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { +- return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, +- lo_3, lo_2, lo_1, lo_0); +-} +- +-static really_inline +-m512 swap256in512(m512 a) { +- m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); +- return vpermq512(idx, a); +-} +- +-static really_inline +-m512 set4x128(m128 a) { +- return _mm512_broadcast_i32x4(a); +-} +- +-static really_inline +-m512 sadd_u8_m512(m512 a, m512 b) { +- return _mm512_adds_epu8(a, b); +-} +- +-static really_inline +-m512 max_u8_m512(m512 a, m512 b) { +- return _mm512_max_epu8(a, b); +-} +- +-static really_inline +-m512 min_u8_m512(m512 a, m512 b) { +- return _mm512_min_epu8(a, b); +-} +- +-static really_inline +-m512 sub_u8_m512(m512 a, m512 b) { +- return _mm512_sub_epi8(a, b); +-} +-#endif +- +-static really_inline +-m512 and512(m512 a, m512 b) { +-#if defined(HAVE_AVX512) +- return _mm512_and_si512(a, b); +-#else +- m512 rv; +- rv.lo = and256(a.lo, b.lo); +- rv.hi = and256(a.hi, b.hi); +- return rv; +-#endif +-} +- +-static really_inline +-m512 or512(m512 a, m512 b) { +-#if defined(HAVE_AVX512) +- return _mm512_or_si512(a, b); +-#else +- m512 rv; +- rv.lo = or256(a.lo, b.lo); +- rv.hi = or256(a.hi, b.hi); +- return rv; +-#endif +-} +- +-static really_inline +-m512 xor512(m512 a, m512 b) { +-#if defined(HAVE_AVX512) +- return _mm512_xor_si512(a, b); +-#else +- m512 rv; +- rv.lo = xor256(a.lo, b.lo); +- rv.hi = xor256(a.hi, b.hi); +- return rv; +-#endif +-} +- +-static really_inline +-m512 not512(m512 a) { +-#if defined(HAVE_AVX512) +- return _mm512_xor_si512(a, ones512()); +-#else +- m512 rv; +- rv.lo = not256(a.lo); +- rv.hi = not256(a.hi); +- return rv; +-#endif +-} +- +-static really_inline +-m512 andnot512(m512 a, m512 b) { +-#if defined(HAVE_AVX512) +- return _mm512_andnot_si512(a, b); +-#else +- m512 rv; +- rv.lo = andnot256(a.lo, b.lo); +- rv.hi = andnot256(a.hi, b.hi); +- return rv; +-#endif +-} +- +-#if defined(HAVE_AVX512) +-static really_really_inline +-m512 lshift64_m512(m512 a, unsigned b) { +-#if defined(HAVE__BUILTIN_CONSTANT_P) +- if (__builtin_constant_p(b)) { +- return _mm512_slli_epi64(a, b); +- } +-#endif +- m128 x = _mm_cvtsi32_si128(b); +- return _mm512_sll_epi64(a, x); +-} +-#else +-static really_really_inline +-m512 lshift64_m512(m512 a, unsigned b) { +- m512 rv; +- rv.lo = lshift64_m256(a.lo, b); +- rv.hi = lshift64_m256(a.hi, b); +- return rv; +-} +-#endif +- +-#if defined(HAVE_AVX512) +-#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +-#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +-#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +-#endif +- +-#if !defined(_MM_CMPINT_NE) +-#define _MM_CMPINT_NE 0x4 +-#endif +- +-static really_inline +-int diff512(m512 a, m512 b) { +-#if defined(HAVE_AVX512) +- return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +-#else +- return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +-#endif +-} +- +-static really_inline +-int isnonzero512(m512 a) { +-#if defined(HAVE_AVX512) +- return diff512(a, zeroes512()); +-#elif defined(HAVE_AVX2) +- m256 x = or256(a.lo, a.hi); +- return !!diff256(x, zeroes256()); +-#else +- m128 x = or128(a.lo.lo, a.lo.hi); +- m128 y = or128(a.hi.lo, a.hi.hi); +- return isnonzero128(or128(x, y)); +-#endif +-} +- +-/** +- * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit +- * mask indicating which 32-bit words contain differences. +- */ +-static really_inline +-u32 diffrich512(m512 a, m512 b) { +-#if defined(HAVE_AVX512) +- return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +-#elif defined(HAVE_AVX2) +- return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +-#else +- a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); +- a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); +- a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); +- a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); +- m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), +- _mm_packs_epi32(a.hi.lo, a.hi.hi)); +- return ~(_mm_movemask_epi8(packed)) & 0xffff; +-#endif +-} +- +-/** +- * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and +- * returns a 16-bit mask indicating which 64-bit words contain differences. +- */ +-static really_inline +-u32 diffrich64_512(m512 a, m512 b) { +- //TODO: cmp_epi64? +- u32 d = diffrich512(a, b); +- return (d | (d >> 1)) & 0x55555555; +-} +- +-// aligned load +-static really_inline +-m512 load512(const void *ptr) { +-#if defined(HAVE_AVX512) +- return _mm512_load_si512(ptr); +-#else +- assert(ISALIGNED_N(ptr, alignof(m256))); +- m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; +- return rv; +-#endif +-} +- +-// aligned store +-static really_inline +-void store512(void *ptr, m512 a) { +- assert(ISALIGNED_N(ptr, alignof(m512))); +-#if defined(HAVE_AVX512) +- return _mm512_store_si512(ptr, a); +-#elif defined(HAVE_AVX2) +- m512 *x = (m512 *)ptr; +- store256(&x->lo, a.lo); +- store256(&x->hi, a.hi); +-#else +- ptr = assume_aligned(ptr, 16); +- *(m512 *)ptr = a; +-#endif +-} +- +-// unaligned load +-static really_inline +-m512 loadu512(const void *ptr) { +-#if defined(HAVE_AVX512) +- return _mm512_loadu_si512(ptr); +-#else +- m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; +- return rv; +-#endif +-} +- +-// unaligned store +-static really_inline +-void storeu512(void *ptr, m512 a) { +-#if defined(HAVE_AVX512) +- _mm512_storeu_si512((m512 *)ptr, a); +-#elif defined(HAVE_AVX2) +- storeu256(ptr, a.lo); +- storeu256((char *)ptr + 32, a.hi); +-#else +- storeu128(ptr, a.lo.lo); +- storeu128((char *)ptr + 16, a.lo.hi); +- storeu128((char *)ptr + 32, a.hi.lo); +- storeu128((char *)ptr + 48, a.hi.hi); +-#endif +-} +- +-#if defined(HAVE_AVX512) +-static really_inline +-m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { +- return _mm512_maskz_loadu_epi8(k, ptr); +-} +- +-static really_inline +-m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { +- return _mm512_mask_loadu_epi8(src, k, ptr); +-} +- +-static really_inline +-void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { +- _mm512_mask_storeu_epi8(ptr, k, a); +-} +- +-static really_inline +-m512 set_mask_m512(__mmask64 k) { +- return _mm512_movm_epi8(k); +-} +- +-static really_inline +-m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { +- return _mm256_maskz_loadu_epi8(k, ptr); +-} +-#endif +- +-// packed unaligned store of first N bytes +-static really_inline +-void storebytes512(void *ptr, m512 a, unsigned int n) { +- assert(n <= sizeof(a)); +- memcpy(ptr, &a, n); +-} +- +-// packed unaligned load of first N bytes, pad with zero +-static really_inline +-m512 loadbytes512(const void *ptr, unsigned int n) { +- m512 a = zeroes512(); +- assert(n <= sizeof(a)); +- memcpy(&a, ptr, n); +- return a; +-} +- +-static really_inline +-m512 mask1bit512(unsigned int n) { +- assert(n < sizeof(m512) * 8); +- u32 mask_idx = ((n % 8) * 64) + 95; +- mask_idx -= n / 8; +- return loadu512(&simd_onebit_masks[mask_idx]); +-} +- +-// switches on bit N in the given vector. +-static really_inline +-void setbit512(m512 *ptr, unsigned int n) { +- assert(n < sizeof(*ptr) * 8); +-#if !defined(HAVE_AVX2) +- m128 *sub; +- if (n < 128) { +- sub = &ptr->lo.lo; +- } else if (n < 256) { +- sub = &ptr->lo.hi; +- } else if (n < 384) { +- sub = &ptr->hi.lo; +- } else { +- sub = &ptr->hi.hi; +- } +- setbit128(sub, n % 128); +-#elif defined(HAVE_AVX512) +- *ptr = or512(mask1bit512(n), *ptr); +-#else +- m256 *sub; +- if (n < 256) { +- sub = &ptr->lo; +- } else { +- sub = &ptr->hi; +- n -= 256; +- } +- setbit256(sub, n); +-#endif +-} +- +-// switches off bit N in the given vector. +-static really_inline +-void clearbit512(m512 *ptr, unsigned int n) { +- assert(n < sizeof(*ptr) * 8); +-#if !defined(HAVE_AVX2) +- m128 *sub; +- if (n < 128) { +- sub = &ptr->lo.lo; +- } else if (n < 256) { +- sub = &ptr->lo.hi; +- } else if (n < 384) { +- sub = &ptr->hi.lo; +- } else { +- sub = &ptr->hi.hi; +- } +- clearbit128(sub, n % 128); +-#elif defined(HAVE_AVX512) +- *ptr = andnot512(mask1bit512(n), *ptr); +-#else +- m256 *sub; +- if (n < 256) { +- sub = &ptr->lo; +- } else { +- sub = &ptr->hi; +- n -= 256; +- } +- clearbit256(sub, n); +-#endif +-} +- +-// tests bit N in the given vector. +-static really_inline +-char testbit512(m512 val, unsigned int n) { +- assert(n < sizeof(val) * 8); +-#if !defined(HAVE_AVX2) +- m128 sub; +- if (n < 128) { +- sub = val.lo.lo; +- } else if (n < 256) { +- sub = val.lo.hi; +- } else if (n < 384) { +- sub = val.hi.lo; +- } else { +- sub = val.hi.hi; +- } +- return testbit128(sub, n % 128); +-#elif defined(HAVE_AVX512) +- const m512 mask = mask1bit512(n); +- return !!_mm512_test_epi8_mask(mask, val); +-#else +- m256 sub; +- if (n < 256) { +- sub = val.lo; +- } else { +- sub = val.hi; +- n -= 256; +- } +- return testbit256(sub, n); +-#endif +-} +- + #endif +diff --git a/src/util/simd_x86.h b/src/util/simd_x86.h +new file mode 100644 +index 0000000..5fa727e +--- /dev/null ++++ b/src/util/simd_x86.h +@@ -0,0 +1,1420 @@ ++/* ++ * Copyright (c) 2015-2021, Intel Corporation ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions are met: ++ * ++ * * Redistributions of source code must retain the above copyright notice, ++ * this list of conditions and the following disclaimer. ++ * * Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * * Neither the name of Intel Corporation nor the names of its contributors ++ * may be used to endorse or promote products derived from this software ++ * without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE ++ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++ * POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++/** \file ++ * \brief SIMD types and primitive operations. ++ */ ++ ++#ifndef SIMD_UTILS ++#define SIMD_UTILS ++ ++#if !defined(_WIN32) && !defined(__SSSE3__) ++#error SSSE3 instructions must be enabled ++#endif ++ ++#include "config.h" ++#include "ue2common.h" ++#include "simd_types.h" ++#include "unaligned.h" ++#include "util/arch.h" ++#include "util/intrinsics.h" ++ ++#include // for memcpy ++ ++// Define a common assume_aligned using an appropriate compiler built-in, if ++// it's available. Note that we need to handle C or C++ compilation. ++#ifdef __cplusplus ++# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED ++# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) ++# endif ++#else ++# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED ++# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) ++# endif ++#endif ++ ++// Fallback to identity case. ++#ifndef assume_aligned ++#define assume_aligned(x, y) (x) ++#endif ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++extern const char vbs_mask_data[]; ++#ifdef __cplusplus ++} ++#endif ++ ++static really_inline m128 ones128(void) { ++#if defined(__GNUC__) || defined(__INTEL_COMPILER) ++ /* gcc gets this right */ ++ return _mm_set1_epi8(0xFF); ++#else ++ /* trick from Intel's optimization guide to generate all-ones. ++ * ICC converts this to the single cmpeq instruction */ ++ return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); ++#endif ++} ++ ++static really_inline m128 zeroes128(void) { ++ return _mm_setzero_si128(); ++} ++ ++/** \brief Bitwise not for m128*/ ++static really_inline m128 not128(m128 a) { ++ return _mm_xor_si128(a, ones128()); ++} ++ ++/** \brief Return 1 if a and b are different otherwise 0 */ ++static really_inline int diff128(m128 a, m128 b) { ++ return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); ++} ++ ++static really_inline int isnonzero128(m128 a) { ++ return !!diff128(a, zeroes128()); ++} ++ ++/** ++ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich128(m128 a, m128 b) { ++ a = _mm_cmpeq_epi32(a, b); ++ return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; ++} ++ ++/** ++ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and ++ * returns a 4-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_128(m128 a, m128 b) { ++#if defined(HAVE_SSE41) ++ a = _mm_cmpeq_epi64(a, b); ++ return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; ++#else ++ u32 d = diffrich128(a, b); ++ return (d | (d >> 1)) & 0x5; ++#endif ++} ++ ++static really_really_inline ++m128 lshift64_m128(m128 a, unsigned b) { ++#if defined(HAVE__BUILTIN_CONSTANT_P) ++ if (__builtin_constant_p(b)) { ++ return _mm_slli_epi64(a, b); ++ } ++#endif ++ m128 x = _mm_cvtsi32_si128(b); ++ return _mm_sll_epi64(a, x); ++} ++ ++#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) ++#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) ++#define movemask128(a) ((u32)_mm_movemask_epi8((a))) ++ ++#if defined(HAVE_AVX512) ++static really_inline m128 cast512to128(const m512 in) { ++ return _mm512_castsi512_si128(in); ++} ++#endif ++ ++static really_inline m128 set16x8(u8 c) { ++ return _mm_set1_epi8(c); ++} ++ ++static really_inline m128 set4x32(u32 c) { ++ return _mm_set1_epi32(c); ++} ++ ++static really_inline u32 movd(const m128 in) { ++ return _mm_cvtsi128_si32(in); ++} ++ ++static really_inline u64a movq(const m128 in) { ++#if defined(ARCH_X86_64) ++ return _mm_cvtsi128_si64(in); ++#else // 32-bit - this is horrific ++ u32 lo = movd(in); ++ u32 hi = movd(_mm_srli_epi64(in, 32)); ++ return (u64a)hi << 32 | lo; ++#endif ++} ++ ++#if defined(HAVE_AVX512) ++static really_inline u32 movd512(const m512 in) { ++ // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), ++ // so we use 2-step convertions to work around. ++ return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); ++} ++ ++static really_inline u64a movq512(const m512 in) { ++ // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), ++ // so we use 2-step convertions to work around. ++ return movq(_mm512_castsi512_si128(in)); ++} ++#endif ++ ++/* another form of movq */ ++static really_inline ++m128 load_m128_from_u64a(const u64a *p) { ++ return _mm_set_epi64x(0LL, *p); ++} ++ ++#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) ++#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) ++ ++#if defined(HAVE_SSE41) ++#define extract32from128(a, imm) _mm_extract_epi32(a, imm) ++#define extract64from128(a, imm) _mm_extract_epi64(a, imm) ++#else ++#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) ++#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) ++#endif ++ ++#if !defined(HAVE_AVX2) ++// TODO: this entire file needs restructuring - this carveout is awful ++#define extractlow64from256(a) movq(a.lo) ++#define extractlow32from256(a) movd(a.lo) ++#if defined(HAVE_SSE41) ++#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) ++#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) ++#else ++#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) ++#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) ++#endif ++ ++#endif // !AVX2 ++ ++static really_inline m128 and128(m128 a, m128 b) { ++ return _mm_and_si128(a,b); ++} ++ ++static really_inline m128 xor128(m128 a, m128 b) { ++ return _mm_xor_si128(a,b); ++} ++ ++static really_inline m128 or128(m128 a, m128 b) { ++ return _mm_or_si128(a,b); ++} ++ ++#if defined(HAVE_AVX512VBMI) ++static really_inline m512 expand128(m128 a) { ++ return _mm512_broadcast_i32x4(a); ++} ++ ++static really_inline m512 expand256(m256 a) { ++ return _mm512_broadcast_i64x4(a); ++} ++ ++static really_inline m512 expand384(m384 a) { ++ u64a *lo = (u64a*)&a.lo; ++ u64a *mid = (u64a*)&a.mid; ++ u64a *hi = (u64a*)&a.hi; ++ return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], ++ lo[1], lo[0]); ++} ++#endif ++ ++static really_inline m128 andnot128(m128 a, m128 b) { ++ return _mm_andnot_si128(a, b); ++} ++ ++// aligned load ++static really_inline m128 load128(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ ptr = assume_aligned(ptr, 16); ++ return _mm_load_si128((const m128 *)ptr); ++} ++ ++// aligned store ++static really_inline void store128(void *ptr, m128 a) { ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ ptr = assume_aligned(ptr, 16); ++ *(m128 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m128 loadu128(const void *ptr) { ++ return _mm_loadu_si128((const m128 *)ptr); ++} ++ ++// unaligned store ++static really_inline void storeu128(void *ptr, m128 a) { ++ _mm_storeu_si128 ((m128 *)ptr, a); ++} ++ ++// packed unaligned store of first N bytes ++static really_inline ++void storebytes128(void *ptr, m128 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline ++m128 loadbytes128(const void *ptr, unsigned int n) { ++ m128 a = zeroes128(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++extern const u8 simd_onebit_masks[]; ++#ifdef __cplusplus ++} ++#endif ++ ++static really_inline ++m128 mask1bit128(unsigned int n) { ++ assert(n < sizeof(m128) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu128(&simd_onebit_masks[mask_idx]); ++} ++ ++// switches on bit N in the given vector. ++static really_inline ++void setbit128(m128 *ptr, unsigned int n) { ++ *ptr = or128(mask1bit128(n), *ptr); ++} ++ ++// switches off bit N in the given vector. ++static really_inline ++void clearbit128(m128 *ptr, unsigned int n) { ++ *ptr = andnot128(mask1bit128(n), *ptr); ++} ++ ++// tests bit N in the given vector. ++static really_inline ++char testbit128(m128 val, unsigned int n) { ++ const m128 mask = mask1bit128(n); ++#if defined(HAVE_SSE41) ++ return !_mm_testz_si128(mask, val); ++#else ++ return isnonzero128(and128(mask, val)); ++#endif ++} ++ ++// offset must be an immediate ++#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) ++ ++static really_inline ++m128 pshufb_m128(m128 a, m128 b) { ++ m128 result; ++ result = _mm_shuffle_epi8(a, b); ++ return result; ++} ++ ++static really_inline ++m256 pshufb_m256(m256 a, m256 b) { ++#if defined(HAVE_AVX2) ++ return _mm256_shuffle_epi8(a, b); ++#else ++ m256 rv; ++ rv.lo = pshufb_m128(a.lo, b.lo); ++ rv.hi = pshufb_m128(a.hi, b.hi); ++ return rv; ++#endif ++} ++ ++#if defined(HAVE_AVX512) ++static really_inline ++m512 pshufb_m512(m512 a, m512 b) { ++ return _mm512_shuffle_epi8(a, b); ++} ++ ++static really_inline ++m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { ++ return _mm512_maskz_shuffle_epi8(k, a, b); ++} ++ ++#if defined(HAVE_AVX512VBMI) ++#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) ++#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) ++#endif ++ ++#endif ++ ++static really_inline ++m128 variable_byte_shift_m128(m128 in, s32 amount) { ++ assert(amount >= -16 && amount <= 16); ++ m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); ++ return pshufb_m128(in, shift_mask); ++} ++ ++static really_inline ++m128 max_u8_m128(m128 a, m128 b) { ++ return _mm_max_epu8(a, b); ++} ++ ++static really_inline ++m128 min_u8_m128(m128 a, m128 b) { ++ return _mm_min_epu8(a, b); ++} ++ ++static really_inline ++m128 sadd_u8_m128(m128 a, m128 b) { ++ return _mm_adds_epu8(a, b); ++} ++ ++static really_inline ++m128 sub_u8_m128(m128 a, m128 b) { ++ return _mm_sub_epi8(a, b); ++} ++ ++static really_inline ++m128 set64x2(u64a hi, u64a lo) { ++ return _mm_set_epi64x(hi, lo); ++} ++ ++/**** ++ **** 256-bit Primitives ++ ****/ ++ ++#if defined(HAVE_AVX2) ++ ++static really_really_inline ++m256 lshift64_m256(m256 a, unsigned b) { ++#if defined(HAVE__BUILTIN_CONSTANT_P) ++ if (__builtin_constant_p(b)) { ++ return _mm256_slli_epi64(a, b); ++ } ++#endif ++ m128 x = _mm_cvtsi32_si128(b); ++ return _mm256_sll_epi64(a, x); ++} ++ ++#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) ++ ++static really_inline ++m256 set32x8(u32 in) { ++ return _mm256_set1_epi8(in); ++} ++ ++#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) ++#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) ++ ++static really_inline ++m256 set2x128(m128 a) { ++ return _mm256_broadcastsi128_si256(a); ++} ++ ++#else ++ ++static really_really_inline ++m256 lshift64_m256(m256 a, int b) { ++ m256 rv = a; ++ rv.lo = lshift64_m128(rv.lo, b); ++ rv.hi = lshift64_m128(rv.hi, b); ++ return rv; ++} ++ ++static really_inline ++m256 rshift64_m256(m256 a, int b) { ++ m256 rv = a; ++ rv.lo = rshift64_m128(rv.lo, b); ++ rv.hi = rshift64_m128(rv.hi, b); ++ return rv; ++} ++static really_inline ++m256 set32x8(u32 in) { ++ m256 rv; ++ rv.lo = set16x8((u8) in); ++ rv.hi = rv.lo; ++ return rv; ++} ++ ++static really_inline ++m256 eq256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = eq128(a.lo, b.lo); ++ rv.hi = eq128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline ++u32 movemask256(m256 a) { ++ u32 lo_mask = movemask128(a.lo); ++ u32 hi_mask = movemask128(a.hi); ++ return lo_mask | (hi_mask << 16); ++} ++ ++static really_inline ++m256 set2x128(m128 a) { ++ m256 rv = {a, a}; ++ return rv; ++} ++#endif ++ ++static really_inline m256 zeroes256(void) { ++#if defined(HAVE_AVX2) ++ return _mm256_setzero_si256(); ++#else ++ m256 rv = {zeroes128(), zeroes128()}; ++ return rv; ++#endif ++} ++ ++static really_inline m256 ones256(void) { ++#if defined(HAVE_AVX2) ++ m256 rv = _mm256_set1_epi8(0xFF); ++#else ++ m256 rv = {ones128(), ones128()}; ++#endif ++ return rv; ++} ++ ++#if defined(HAVE_AVX2) ++static really_inline m256 and256(m256 a, m256 b) { ++ return _mm256_and_si256(a, b); ++} ++#else ++static really_inline m256 and256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = and128(a.lo, b.lo); ++ rv.hi = and128(a.hi, b.hi); ++ return rv; ++} ++#endif ++ ++#if defined(HAVE_AVX2) ++static really_inline m256 or256(m256 a, m256 b) { ++ return _mm256_or_si256(a, b); ++} ++#else ++static really_inline m256 or256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = or128(a.lo, b.lo); ++ rv.hi = or128(a.hi, b.hi); ++ return rv; ++} ++#endif ++ ++#if defined(HAVE_AVX2) ++static really_inline m256 xor256(m256 a, m256 b) { ++ return _mm256_xor_si256(a, b); ++} ++#else ++static really_inline m256 xor256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = xor128(a.lo, b.lo); ++ rv.hi = xor128(a.hi, b.hi); ++ return rv; ++} ++#endif ++ ++#if defined(HAVE_AVX2) ++static really_inline m256 not256(m256 a) { ++ return _mm256_xor_si256(a, ones256()); ++} ++#else ++static really_inline m256 not256(m256 a) { ++ m256 rv; ++ rv.lo = not128(a.lo); ++ rv.hi = not128(a.hi); ++ return rv; ++} ++#endif ++ ++#if defined(HAVE_AVX2) ++static really_inline m256 andnot256(m256 a, m256 b) { ++ return _mm256_andnot_si256(a, b); ++} ++#else ++static really_inline m256 andnot256(m256 a, m256 b) { ++ m256 rv; ++ rv.lo = andnot128(a.lo, b.lo); ++ rv.hi = andnot128(a.hi, b.hi); ++ return rv; ++} ++#endif ++ ++static really_inline int diff256(m256 a, m256 b) { ++#if defined(HAVE_AVX2) ++ return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); ++#else ++ return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); ++#endif ++} ++ ++static really_inline int isnonzero256(m256 a) { ++#if defined(HAVE_AVX2) ++ return !!diff256(a, zeroes256()); ++#else ++ return isnonzero128(or128(a.lo, a.hi)); ++#endif ++} ++ ++/** ++ * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich256(m256 a, m256 b) { ++#if defined(HAVE_AVX2) ++ a = _mm256_cmpeq_epi32(a, b); ++ return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; ++#else ++ m128 z = zeroes128(); ++ a.lo = _mm_cmpeq_epi32(a.lo, b.lo); ++ a.hi = _mm_cmpeq_epi32(a.hi, b.hi); ++ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); ++ return ~(_mm_movemask_epi8(packed)) & 0xff; ++#endif ++} ++ ++/** ++ * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and ++ * returns an 8-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_256(m256 a, m256 b) { ++ u32 d = diffrich256(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m256 load256(const void *ptr) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++#if defined(HAVE_AVX2) ++ return _mm256_load_si256((const m256 *)ptr); ++#else ++ m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; ++ return rv; ++#endif ++} ++ ++// aligned load of 128-bit value to low and high part of 256-bit value ++static really_inline m256 load2x128(const void *ptr) { ++#if defined(HAVE_AVX2) ++ return set2x128(load128(ptr)); ++#else ++ assert(ISALIGNED_N(ptr, alignof(m128))); ++ m256 rv; ++ rv.hi = rv.lo = load128(ptr); ++ return rv; ++#endif ++} ++ ++static really_inline m256 loadu2x128(const void *ptr) { ++ return set2x128(loadu128(ptr)); ++} ++ ++// aligned store ++static really_inline void store256(void *ptr, m256 a) { ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++#if defined(HAVE_AVX2) ++ _mm256_store_si256((m256 *)ptr, a); ++#else ++ ptr = assume_aligned(ptr, 16); ++ *(m256 *)ptr = a; ++#endif ++} ++ ++// unaligned load ++static really_inline m256 loadu256(const void *ptr) { ++#if defined(HAVE_AVX2) ++ return _mm256_loadu_si256((const m256 *)ptr); ++#else ++ m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; ++ return rv; ++#endif ++} ++ ++// unaligned store ++static really_inline void storeu256(void *ptr, m256 a) { ++#if defined(HAVE_AVX2) ++ _mm256_storeu_si256((m256 *)ptr, a); ++#else ++ storeu128(ptr, a.lo); ++ storeu128((char *)ptr + 16, a.hi); ++#endif ++} ++ ++// packed unaligned store of first N bytes ++static really_inline ++void storebytes256(void *ptr, m256 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline ++m256 loadbytes256(const void *ptr, unsigned int n) { ++ m256 a = zeroes256(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++static really_inline ++m256 mask1bit256(unsigned int n) { ++ assert(n < sizeof(m256) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu256(&simd_onebit_masks[mask_idx]); ++} ++ ++static really_inline ++m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { ++#if defined(HAVE_AVX2) ++ return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); ++#else ++ m256 rv; ++ rv.hi = set64x2(hi_1, hi_0); ++ rv.lo = set64x2(lo_1, lo_0); ++ return rv; ++#endif ++} ++ ++#if !defined(HAVE_AVX2) ++// switches on bit N in the given vector. ++static really_inline ++void setbit256(m256 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 128; ++ } ++ setbit128(sub, n); ++} ++ ++// switches off bit N in the given vector. ++static really_inline ++void clearbit256(m256 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 128; ++ } ++ clearbit128(sub, n); ++} ++ ++// tests bit N in the given vector. ++static really_inline ++char testbit256(m256 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo; ++ } else { ++ sub = val.hi; ++ n -= 128; ++ } ++ return testbit128(sub, n); ++} ++ ++static really_really_inline ++m128 movdq_hi(m256 x) { ++ return x.hi; ++} ++ ++static really_really_inline ++m128 movdq_lo(m256 x) { ++ return x.lo; ++} ++ ++static really_inline ++m256 combine2x128(m128 hi, m128 lo) { ++ m256 rv = {lo, hi}; ++ return rv; ++} ++ ++#else // AVX2 ++ ++// switches on bit N in the given vector. ++static really_inline ++void setbit256(m256 *ptr, unsigned int n) { ++ *ptr = or256(mask1bit256(n), *ptr); ++} ++ ++static really_inline ++void clearbit256(m256 *ptr, unsigned int n) { ++ *ptr = andnot256(mask1bit256(n), *ptr); ++} ++ ++// tests bit N in the given vector. ++static really_inline ++char testbit256(m256 val, unsigned int n) { ++ const m256 mask = mask1bit256(n); ++ return !_mm256_testz_si256(mask, val); ++} ++ ++static really_really_inline ++m128 movdq_hi(m256 x) { ++ return _mm256_extracti128_si256(x, 1); ++} ++ ++static really_really_inline ++m128 movdq_lo(m256 x) { ++ return _mm256_extracti128_si256(x, 0); ++} ++ ++#define cast256to128(a) _mm256_castsi256_si128(a) ++#define cast128to256(a) _mm256_castsi128_si256(a) ++#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) ++#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) ++#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) ++#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) ++#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) ++#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) ++#define extractlow64from256(a) movq(cast256to128(a)) ++#define extractlow32from256(a) movd(cast256to128(a)) ++#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) ++#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) ++#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) ++ ++static really_inline ++m256 combine2x128(m128 hi, m128 lo) { ++#if defined(_mm256_set_m128i) ++ return _mm256_set_m128i(hi, lo); ++#else ++ return insert128to256(cast128to256(lo), hi, 1); ++#endif ++} ++#endif //AVX2 ++ ++#if defined(HAVE_AVX512) ++#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) ++#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) ++#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) ++#define set2x256(a) _mm512_broadcast_i64x4(a) ++#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) ++#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) ++#endif ++ ++/**** ++ **** 384-bit Primitives ++ ****/ ++ ++static really_inline m384 and384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = and128(a.lo, b.lo); ++ rv.mid = and128(a.mid, b.mid); ++ rv.hi = and128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m384 or384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = or128(a.lo, b.lo); ++ rv.mid = or128(a.mid, b.mid); ++ rv.hi = or128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_inline m384 xor384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = xor128(a.lo, b.lo); ++ rv.mid = xor128(a.mid, b.mid); ++ rv.hi = xor128(a.hi, b.hi); ++ return rv; ++} ++static really_inline m384 not384(m384 a) { ++ m384 rv; ++ rv.lo = not128(a.lo); ++ rv.mid = not128(a.mid); ++ rv.hi = not128(a.hi); ++ return rv; ++} ++static really_inline m384 andnot384(m384 a, m384 b) { ++ m384 rv; ++ rv.lo = andnot128(a.lo, b.lo); ++ rv.mid = andnot128(a.mid, b.mid); ++ rv.hi = andnot128(a.hi, b.hi); ++ return rv; ++} ++ ++static really_really_inline ++m384 lshift64_m384(m384 a, unsigned b) { ++ m384 rv; ++ rv.lo = lshift64_m128(a.lo, b); ++ rv.mid = lshift64_m128(a.mid, b); ++ rv.hi = lshift64_m128(a.hi, b); ++ return rv; ++} ++ ++static really_inline m384 zeroes384(void) { ++ m384 rv = {zeroes128(), zeroes128(), zeroes128()}; ++ return rv; ++} ++ ++static really_inline m384 ones384(void) { ++ m384 rv = {ones128(), ones128(), ones128()}; ++ return rv; ++} ++ ++static really_inline int diff384(m384 a, m384 b) { ++ return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); ++} ++ ++static really_inline int isnonzero384(m384 a) { ++ return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); ++} ++ ++/** ++ * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline u32 diffrich384(m384 a, m384 b) { ++ m128 z = zeroes128(); ++ a.lo = _mm_cmpeq_epi32(a.lo, b.lo); ++ a.mid = _mm_cmpeq_epi32(a.mid, b.mid); ++ a.hi = _mm_cmpeq_epi32(a.hi, b.hi); ++ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), ++ _mm_packs_epi32(a.hi, z)); ++ return ~(_mm_movemask_epi8(packed)) & 0xfff; ++} ++ ++/** ++ * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and ++ * returns a 12-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline u32 diffrich64_384(m384 a, m384 b) { ++ u32 d = diffrich384(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline m384 load384(const void *ptr) { ++ assert(ISALIGNED_16(ptr)); ++ m384 rv = { load128(ptr), load128((const char *)ptr + 16), ++ load128((const char *)ptr + 32) }; ++ return rv; ++} ++ ++// aligned store ++static really_inline void store384(void *ptr, m384 a) { ++ assert(ISALIGNED_16(ptr)); ++ ptr = assume_aligned(ptr, 16); ++ *(m384 *)ptr = a; ++} ++ ++// unaligned load ++static really_inline m384 loadu384(const void *ptr) { ++ m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), ++ loadu128((const char *)ptr + 32)}; ++ return rv; ++} ++ ++// packed unaligned store of first N bytes ++static really_inline ++void storebytes384(void *ptr, m384 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline ++m384 loadbytes384(const void *ptr, unsigned int n) { ++ m384 a = zeroes384(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++// switches on bit N in the given vector. ++static really_inline ++void setbit384(m384 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else if (n < 256) { ++ sub = &ptr->mid; ++ } else { ++ sub = &ptr->hi; ++ } ++ setbit128(sub, n % 128); ++} ++ ++// switches off bit N in the given vector. ++static really_inline ++void clearbit384(m384 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo; ++ } else if (n < 256) { ++ sub = &ptr->mid; ++ } else { ++ sub = &ptr->hi; ++ } ++ clearbit128(sub, n % 128); ++} ++ ++// tests bit N in the given vector. ++static really_inline ++char testbit384(m384 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo; ++ } else if (n < 256) { ++ sub = val.mid; ++ } else { ++ sub = val.hi; ++ } ++ return testbit128(sub, n % 128); ++} ++ ++/**** ++ **** 512-bit Primitives ++ ****/ ++ ++#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) ++#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) ++ ++static really_inline ++m512 zeroes512(void) { ++#if defined(HAVE_AVX512) ++ return _mm512_setzero_si512(); ++#else ++ m512 rv = {zeroes256(), zeroes256()}; ++ return rv; ++#endif ++} ++ ++static really_inline ++m512 ones512(void) { ++#if defined(HAVE_AVX512) ++ return _mm512_set1_epi8(0xFF); ++ //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); ++#else ++ m512 rv = {ones256(), ones256()}; ++ return rv; ++#endif ++} ++ ++#if defined(HAVE_AVX512) ++static really_inline ++m512 set64x8(u8 a) { ++ return _mm512_set1_epi8(a); ++} ++ ++static really_inline ++m512 set8x64(u64a a) { ++ return _mm512_set1_epi64(a); ++} ++ ++static really_inline ++m512 set16x32(u32 a) { ++ return _mm512_set1_epi32(a); ++} ++ ++static really_inline ++m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, ++ u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { ++ return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, ++ lo_3, lo_2, lo_1, lo_0); ++} ++ ++static really_inline ++m512 swap256in512(m512 a) { ++ m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); ++ return vpermq512(idx, a); ++} ++ ++static really_inline ++m512 set4x128(m128 a) { ++ return _mm512_broadcast_i32x4(a); ++} ++ ++static really_inline ++m512 sadd_u8_m512(m512 a, m512 b) { ++ return _mm512_adds_epu8(a, b); ++} ++ ++static really_inline ++m512 max_u8_m512(m512 a, m512 b) { ++ return _mm512_max_epu8(a, b); ++} ++ ++static really_inline ++m512 min_u8_m512(m512 a, m512 b) { ++ return _mm512_min_epu8(a, b); ++} ++ ++static really_inline ++m512 sub_u8_m512(m512 a, m512 b) { ++ return _mm512_sub_epi8(a, b); ++} ++#endif ++ ++static really_inline ++m512 and512(m512 a, m512 b) { ++#if defined(HAVE_AVX512) ++ return _mm512_and_si512(a, b); ++#else ++ m512 rv; ++ rv.lo = and256(a.lo, b.lo); ++ rv.hi = and256(a.hi, b.hi); ++ return rv; ++#endif ++} ++ ++static really_inline ++m512 or512(m512 a, m512 b) { ++#if defined(HAVE_AVX512) ++ return _mm512_or_si512(a, b); ++#else ++ m512 rv; ++ rv.lo = or256(a.lo, b.lo); ++ rv.hi = or256(a.hi, b.hi); ++ return rv; ++#endif ++} ++ ++static really_inline ++m512 xor512(m512 a, m512 b) { ++#if defined(HAVE_AVX512) ++ return _mm512_xor_si512(a, b); ++#else ++ m512 rv; ++ rv.lo = xor256(a.lo, b.lo); ++ rv.hi = xor256(a.hi, b.hi); ++ return rv; ++#endif ++} ++ ++static really_inline ++m512 not512(m512 a) { ++#if defined(HAVE_AVX512) ++ return _mm512_xor_si512(a, ones512()); ++#else ++ m512 rv; ++ rv.lo = not256(a.lo); ++ rv.hi = not256(a.hi); ++ return rv; ++#endif ++} ++ ++static really_inline ++m512 andnot512(m512 a, m512 b) { ++#if defined(HAVE_AVX512) ++ return _mm512_andnot_si512(a, b); ++#else ++ m512 rv; ++ rv.lo = andnot256(a.lo, b.lo); ++ rv.hi = andnot256(a.hi, b.hi); ++ return rv; ++#endif ++} ++ ++#if defined(HAVE_AVX512) ++static really_really_inline ++m512 lshift64_m512(m512 a, unsigned b) { ++#if defined(HAVE__BUILTIN_CONSTANT_P) ++ if (__builtin_constant_p(b)) { ++ return _mm512_slli_epi64(a, b); ++ } ++#endif ++ m128 x = _mm_cvtsi32_si128(b); ++ return _mm512_sll_epi64(a, x); ++} ++#else ++static really_really_inline ++m512 lshift64_m512(m512 a, unsigned b) { ++ m512 rv; ++ rv.lo = lshift64_m256(a.lo, b); ++ rv.hi = lshift64_m256(a.hi, b); ++ return rv; ++} ++#endif ++ ++#if defined(HAVE_AVX512) ++#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) ++#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) ++#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) ++#endif ++ ++#if !defined(_MM_CMPINT_NE) ++#define _MM_CMPINT_NE 0x4 ++#endif ++ ++static really_inline ++int diff512(m512 a, m512 b) { ++#if defined(HAVE_AVX512) ++ return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); ++#else ++ return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); ++#endif ++} ++ ++static really_inline ++int isnonzero512(m512 a) { ++#if defined(HAVE_AVX512) ++ return diff512(a, zeroes512()); ++#elif defined(HAVE_AVX2) ++ m256 x = or256(a.lo, a.hi); ++ return !!diff256(x, zeroes256()); ++#else ++ m128 x = or128(a.lo.lo, a.lo.hi); ++ m128 y = or128(a.hi.lo, a.hi.hi); ++ return isnonzero128(or128(x, y)); ++#endif ++} ++ ++/** ++ * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit ++ * mask indicating which 32-bit words contain differences. ++ */ ++static really_inline ++u32 diffrich512(m512 a, m512 b) { ++#if defined(HAVE_AVX512) ++ return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); ++#elif defined(HAVE_AVX2) ++ return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); ++#else ++ a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); ++ a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); ++ a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); ++ a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); ++ m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), ++ _mm_packs_epi32(a.hi.lo, a.hi.hi)); ++ return ~(_mm_movemask_epi8(packed)) & 0xffff; ++#endif ++} ++ ++/** ++ * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and ++ * returns a 16-bit mask indicating which 64-bit words contain differences. ++ */ ++static really_inline ++u32 diffrich64_512(m512 a, m512 b) { ++ //TODO: cmp_epi64? ++ u32 d = diffrich512(a, b); ++ return (d | (d >> 1)) & 0x55555555; ++} ++ ++// aligned load ++static really_inline ++m512 load512(const void *ptr) { ++#if defined(HAVE_AVX512) ++ return _mm512_load_si512(ptr); ++#else ++ assert(ISALIGNED_N(ptr, alignof(m256))); ++ m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; ++ return rv; ++#endif ++} ++ ++// aligned store ++static really_inline ++void store512(void *ptr, m512 a) { ++ assert(ISALIGNED_N(ptr, alignof(m512))); ++#if defined(HAVE_AVX512) ++ return _mm512_store_si512(ptr, a); ++#elif defined(HAVE_AVX2) ++ m512 *x = (m512 *)ptr; ++ store256(&x->lo, a.lo); ++ store256(&x->hi, a.hi); ++#else ++ ptr = assume_aligned(ptr, 16); ++ *(m512 *)ptr = a; ++#endif ++} ++ ++// unaligned load ++static really_inline ++m512 loadu512(const void *ptr) { ++#if defined(HAVE_AVX512) ++ return _mm512_loadu_si512(ptr); ++#else ++ m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; ++ return rv; ++#endif ++} ++ ++// unaligned store ++static really_inline ++void storeu512(void *ptr, m512 a) { ++#if defined(HAVE_AVX512) ++ _mm512_storeu_si512((m512 *)ptr, a); ++#elif defined(HAVE_AVX2) ++ storeu256(ptr, a.lo); ++ storeu256((char *)ptr + 32, a.hi); ++#else ++ storeu128(ptr, a.lo.lo); ++ storeu128((char *)ptr + 16, a.lo.hi); ++ storeu128((char *)ptr + 32, a.hi.lo); ++ storeu128((char *)ptr + 48, a.hi.hi); ++#endif ++} ++ ++#if defined(HAVE_AVX512) ++static really_inline ++m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { ++ return _mm512_maskz_loadu_epi8(k, ptr); ++} ++ ++static really_inline ++m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { ++ return _mm512_mask_loadu_epi8(src, k, ptr); ++} ++ ++static really_inline ++void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { ++ _mm512_mask_storeu_epi8(ptr, k, a); ++} ++ ++static really_inline ++m512 set_mask_m512(__mmask64 k) { ++ return _mm512_movm_epi8(k); ++} ++ ++static really_inline ++m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { ++ return _mm256_maskz_loadu_epi8(k, ptr); ++} ++#endif ++ ++// packed unaligned store of first N bytes ++static really_inline ++void storebytes512(void *ptr, m512 a, unsigned int n) { ++ assert(n <= sizeof(a)); ++ memcpy(ptr, &a, n); ++} ++ ++// packed unaligned load of first N bytes, pad with zero ++static really_inline ++m512 loadbytes512(const void *ptr, unsigned int n) { ++ m512 a = zeroes512(); ++ assert(n <= sizeof(a)); ++ memcpy(&a, ptr, n); ++ return a; ++} ++ ++static really_inline ++m512 mask1bit512(unsigned int n) { ++ assert(n < sizeof(m512) * 8); ++ u32 mask_idx = ((n % 8) * 64) + 95; ++ mask_idx -= n / 8; ++ return loadu512(&simd_onebit_masks[mask_idx]); ++} ++ ++// switches on bit N in the given vector. ++static really_inline ++void setbit512(m512 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++#if !defined(HAVE_AVX2) ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo.lo; ++ } else if (n < 256) { ++ sub = &ptr->lo.hi; ++ } else if (n < 384) { ++ sub = &ptr->hi.lo; ++ } else { ++ sub = &ptr->hi.hi; ++ } ++ setbit128(sub, n % 128); ++#elif defined(HAVE_AVX512) ++ *ptr = or512(mask1bit512(n), *ptr); ++#else ++ m256 *sub; ++ if (n < 256) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 256; ++ } ++ setbit256(sub, n); ++#endif ++} ++ ++// switches off bit N in the given vector. ++static really_inline ++void clearbit512(m512 *ptr, unsigned int n) { ++ assert(n < sizeof(*ptr) * 8); ++#if !defined(HAVE_AVX2) ++ m128 *sub; ++ if (n < 128) { ++ sub = &ptr->lo.lo; ++ } else if (n < 256) { ++ sub = &ptr->lo.hi; ++ } else if (n < 384) { ++ sub = &ptr->hi.lo; ++ } else { ++ sub = &ptr->hi.hi; ++ } ++ clearbit128(sub, n % 128); ++#elif defined(HAVE_AVX512) ++ *ptr = andnot512(mask1bit512(n), *ptr); ++#else ++ m256 *sub; ++ if (n < 256) { ++ sub = &ptr->lo; ++ } else { ++ sub = &ptr->hi; ++ n -= 256; ++ } ++ clearbit256(sub, n); ++#endif ++} ++ ++// tests bit N in the given vector. ++static really_inline ++char testbit512(m512 val, unsigned int n) { ++ assert(n < sizeof(val) * 8); ++#if !defined(HAVE_AVX2) ++ m128 sub; ++ if (n < 128) { ++ sub = val.lo.lo; ++ } else if (n < 256) { ++ sub = val.lo.hi; ++ } else if (n < 384) { ++ sub = val.hi.lo; ++ } else { ++ sub = val.hi.hi; ++ } ++ return testbit128(sub, n % 128); ++#elif defined(HAVE_AVX512) ++ const m512 mask = mask1bit512(n); ++ return !!_mm512_test_epi8_mask(mask, val); ++#else ++ m256 sub; ++ if (n < 256) { ++ sub = val.lo; ++ } else { ++ sub = val.hi; ++ n -= 256; ++ } ++ return testbit256(sub, n); ++#endif ++} ++ ++#endif +diff --git a/src/util/state_compress.c b/src/util/state_compress.c +index 7238849..d71f543 100644 +--- a/src/util/state_compress.c ++++ b/src/util/state_compress.c +@@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { + u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]) }; + +- return _mm_set_epi32(x[3], x[2], x[1], x[0]); ++ return __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0); + } + #endif + +@@ -158,7 +158,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { + static really_inline + m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { + // First, decompose our vectors into 64-bit chunks. +- u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; ++ u64a m[2] = { movq(mvec), movq(__lsx_vsrli_h(mvec, 8)) }; + + u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u64a v[2]; +@@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { + + u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; + +- return _mm_set_epi64x(x[1], x[0]); ++ return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0); + } + #endif + +@@ -264,8 +264,8 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { + expand32(v[6], m[6]), expand32(v[7], m[7]) }; + + #if !defined(HAVE_AVX2) +- m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), +- .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; ++ m256 xvec = { .lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0), ++ .hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[7]),x[6],2),x[5],1),x[4],0) }; + #else + m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +@@ -291,8 +291,8 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { + expand64(v[2], m[2]), expand64(v[3], m[3]) }; + + #if !defined(HAVE_AVX2) +- m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), +- .hi = _mm_set_epi64x(x[3], x[2]) }; ++ m256 xvec = { .lo = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ .hi = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0) }; + #else + m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); + #endif +@@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { + expand32(v[8], m[8]), expand32(v[9], m[9]), + expand32(v[10], m[10]), expand32(v[11], m[11]) }; + +- m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), +- .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), +- .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; ++ m384 xvec = { .lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0), ++ .mid = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[7]),x[6],2),x[5],1),x[4],0), ++ .hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[11]),x[10],2),x[9],1),x[8],0) }; + return xvec; + } + #endif +@@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { + expand64(v[2], m[2]), expand64(v[3], m[3]), + expand64(v[4], m[4]), expand64(v[5], m[5]) }; + +- m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), +- .mid = _mm_set_epi64x(x[3], x[2]), +- .hi = _mm_set_epi64x(x[5], x[4]) }; ++ m384 xvec = { .lo = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ .mid = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0), ++ .hi = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[5]),x[4],0) }; + return xvec; + } + #endif +@@ -558,10 +558,10 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { + xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8]); + #else +- xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); +- xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); +- xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); +- xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); ++ xvec.lo.lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0); ++ xvec.lo.hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[7]),x[6],2),x[5],1),x[4],0); ++ xvec.hi.lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[11]),x[10],2),x[9],1),x[8],0); ++ xvec.hi.hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[15]),x[14],2),x[13],1),x[12],0); + #endif + return xvec; + } +@@ -594,10 +594,10 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { + m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), + .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; + #else +- m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), +- _mm_set_epi64x(x[3], x[2]) }, +- .hi = { _mm_set_epi64x(x[5], x[4]), +- _mm_set_epi64x(x[7], x[6]) } }; ++ m512 xvec = { .lo = { __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0), ++ __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0) }, ++ .hi = { __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[5]),x[4],0), ++ __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[7]),x[6],0) } }; + #endif + return xvec; + } +-- +2.27.0 + diff --git a/thirdparty/patches/add-the-parameter-mlsx.patch b/thirdparty/patches/add-the-parameter-mlsx.patch new file mode 100644 index 0000000000..ac1c5d3fc6 --- /dev/null +++ b/thirdparty/patches/add-the-parameter-mlsx.patch @@ -0,0 +1,28 @@ +From 60308fe6711842de16742bee5e602c4a3454f461 Mon Sep 17 00:00:00 2001 +From: Jingyun Hua +Date: Thu, 3 Aug 2023 09:32:46 +0800 +Subject: [PATCH] add the parameter -mlsx + +Change-Id: I3f11b0d38e71357b589ae9ae0c5b3791a00bf0ec +--- + CMakeLists.txt | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4289817..d8f5279 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -259,6 +259,10 @@ else() + set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}") + endif() + ++ if (ARCH_LOONGARCH64) ++ set(ARCH_CXX_FLAGS "-mlsx") ++ endif() ++ + if(CMAKE_COMPILER_IS_GNUCC) + # spurious warnings? + set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized") +-- +2.27.0 + diff --git a/thirdparty/patches/config.guess b/thirdparty/patches/config.guess new file mode 100644 index 0000000000..48a684601b --- /dev/null +++ b/thirdparty/patches/config.guess @@ -0,0 +1,1815 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright 1992-2024 Free Software Foundation, Inc. + +# shellcheck disable=SC2006,SC2268 # see below for rationale + +timestamp='2024-07-27' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). +# +# Originally written by Per Bothner; maintained since 2000 by Ben Elliston. +# +# You can get the latest version of this script from: +# https://git.savannah.gnu.org/cgit/config.git/plain/config.guess +# +# Please send patches to . + + +# The "shellcheck disable" line above the timestamp inhibits complaints +# about features and limitations of the classic Bourne shell that were +# superseded or lifted in POSIX. However, this script identifies a wide +# variety of pre-POSIX systems that do not have POSIX shells at all, and +# even some reasonably current systems (Solaris 10 as case-in-point) still +# have a pre-POSIX /bin/sh. + + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system '$me' is run on. + +Options: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright 1992-2024 Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try '$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +# Just in case it came from the environment. +GUESS= + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, 'CC_FOR_BUILD' used to be named 'HOST_CC'. We still +# use 'HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. + +tmp= +# shellcheck disable=SC2172 +trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15 + +set_cc_for_build() { + # prevent multiple calls if $tmp is already set + test "$tmp" && return 0 + : "${TMPDIR=/tmp}" + # shellcheck disable=SC2039,SC3028 + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } + dummy=$tmp/dummy + case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in + ,,) echo "int x;" > "$dummy.c" + for driver in cc gcc c17 c99 c89 ; do + if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then + CC_FOR_BUILD=$driver + break + fi + done + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; + esac +} + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. +# (ghazi@noc.rutgers.edu 1994-08-24) +if test -f /.attbin/uname ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +case $UNAME_SYSTEM in +Linux|GNU|GNU/*) + LIBC=unknown + + set_cc_for_build + cat <<-EOF > "$dummy.c" + #if defined(__ANDROID__) + LIBC=android + #else + #include + #if defined(__UCLIBC__) + LIBC=uclibc + #elif defined(__dietlibc__) + LIBC=dietlibc + #elif defined(__GLIBC__) + LIBC=gnu + #elif defined(__LLVM_LIBC__) + LIBC=llvm + #else + #include + /* First heuristic to detect musl libc. */ + #ifdef __DEFINED_va_list + LIBC=musl + #endif + #endif + #endif + EOF + cc_set_libc=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'` + eval "$cc_set_libc" + + # Second heuristic to detect musl libc. + if [ "$LIBC" = unknown ] && + command -v ldd >/dev/null && + ldd --version 2>&1 | grep -q ^musl; then + LIBC=musl + fi + + # If the system lacks a compiler, then just pick glibc. + # We could probably try harder. + if [ "$LIBC" = unknown ]; then + LIBC=gnu + fi + ;; +esac + +# Note: order is significant - the case branches are not exclusive. + +case $UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \ + /sbin/sysctl -n hw.machine_arch 2>/dev/null || \ + /usr/sbin/sysctl -n hw.machine_arch 2>/dev/null || \ + echo unknown)` + case $UNAME_MACHINE_ARCH in + aarch64eb) machine=aarch64_be-unknown ;; + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + sh5el) machine=sh5le-unknown ;; + earmv*) + arch=`echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,'` + endian=`echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p'` + machine=${arch}${endian}-unknown + ;; + *) machine=$UNAME_MACHINE_ARCH-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently (or will in the future) and ABI. + case $UNAME_MACHINE_ARCH in + earm*) + os=netbsdelf + ;; + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ELF__ + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? + os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # Determine ABI tags. + case $UNAME_MACHINE_ARCH in + earm*) + expr='s/^earmv[0-9]/-eabi/;s/eb$//' + abi=`echo "$UNAME_MACHINE_ARCH" | sed -e "$expr"` + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case $UNAME_VERSION in + Debian*) + release='-gnu' + ;; + *) + release=`echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. + GUESS=$machine-${os}${release}${abi-} + ;; + *:Bitrig:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'` + GUESS=$UNAME_MACHINE_ARCH-unknown-bitrig$UNAME_RELEASE + ;; + *:OpenBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` + GUESS=$UNAME_MACHINE_ARCH-unknown-openbsd$UNAME_RELEASE + ;; + *:SecBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/SecBSD.//'` + GUESS=$UNAME_MACHINE_ARCH-unknown-secbsd$UNAME_RELEASE + ;; + *:LibertyBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` + GUESS=$UNAME_MACHINE_ARCH-unknown-libertybsd$UNAME_RELEASE + ;; + *:MidnightBSD:*:*) + GUESS=$UNAME_MACHINE-unknown-midnightbsd$UNAME_RELEASE + ;; + *:ekkoBSD:*:*) + GUESS=$UNAME_MACHINE-unknown-ekkobsd$UNAME_RELEASE + ;; + *:SolidBSD:*:*) + GUESS=$UNAME_MACHINE-unknown-solidbsd$UNAME_RELEASE + ;; + *:OS108:*:*) + GUESS=$UNAME_MACHINE-unknown-os108_$UNAME_RELEASE + ;; + macppc:MirBSD:*:*) + GUESS=powerpc-unknown-mirbsd$UNAME_RELEASE + ;; + *:MirBSD:*:*) + GUESS=$UNAME_MACHINE-unknown-mirbsd$UNAME_RELEASE + ;; + *:Sortix:*:*) + GUESS=$UNAME_MACHINE-unknown-sortix + ;; + *:Twizzler:*:*) + GUESS=$UNAME_MACHINE-unknown-twizzler + ;; + *:Redox:*:*) + GUESS=$UNAME_MACHINE-unknown-redox + ;; + mips:OSF1:*.*) + GUESS=mips-dec-osf1 + ;; + alpha:OSF1:*:*) + # Reset EXIT trap before exiting to avoid spurious non-zero exit code. + trap '' 0 + case $UNAME_RELEASE in + *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + ;; + *5.*) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + ;; + esac + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. + ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case $ALPHA_CPU_TYPE in + "EV4 (21064)") + UNAME_MACHINE=alpha ;; + "EV4.5 (21064)") + UNAME_MACHINE=alpha ;; + "LCA4 (21066/21068)") + UNAME_MACHINE=alpha ;; + "EV5 (21164)") + UNAME_MACHINE=alphaev5 ;; + "EV5.6 (21164A)") + UNAME_MACHINE=alphaev56 ;; + "EV5.6 (21164PC)") + UNAME_MACHINE=alphapca56 ;; + "EV5.7 (21164PC)") + UNAME_MACHINE=alphapca57 ;; + "EV6 (21264)") + UNAME_MACHINE=alphaev6 ;; + "EV6.7 (21264A)") + UNAME_MACHINE=alphaev67 ;; + "EV6.8CB (21264C)") + UNAME_MACHINE=alphaev68 ;; + "EV6.8AL (21264B)") + UNAME_MACHINE=alphaev68 ;; + "EV6.8CX (21264D)") + UNAME_MACHINE=alphaev68 ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE=alphaev69 ;; + "EV7 (21364)") + UNAME_MACHINE=alphaev7 ;; + "EV7.9 (21364A)") + UNAME_MACHINE=alphaev79 ;; + esac + # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. + # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + OSF_REL=`echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + GUESS=$UNAME_MACHINE-dec-osf$OSF_REL + ;; + Amiga*:UNIX_System_V:4.0:*) + GUESS=m68k-unknown-sysv4 + ;; + *:[Aa]miga[Oo][Ss]:*:*) + GUESS=$UNAME_MACHINE-unknown-amigaos + ;; + *:[Mm]orph[Oo][Ss]:*:*) + GUESS=$UNAME_MACHINE-unknown-morphos + ;; + *:OS/390:*:*) + GUESS=i370-ibm-openedition + ;; + *:z/VM:*:*) + GUESS=s390-ibm-zvmoe + ;; + *:OS400:*:*) + GUESS=powerpc-ibm-os400 + ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + GUESS=arm-acorn-riscix$UNAME_RELEASE + ;; + arm*:riscos:*:*|arm*:RISCOS:*:*) + GUESS=arm-unknown-riscos + ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + GUESS=hppa1.1-hitachi-hiuxmpp + ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. + case `(/bin/universe) 2>/dev/null` in + att) GUESS=pyramid-pyramid-sysv3 ;; + *) GUESS=pyramid-pyramid-bsd ;; + esac + ;; + NILE*:*:*:dcosx) + GUESS=pyramid-pyramid-svr4 + ;; + DRS?6000:unix:4.0:6*) + GUESS=sparc-icl-nx6 + ;; + DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) GUESS=sparc-icl-nx7 ;; + esac + ;; + s390x:SunOS:*:*) + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=$UNAME_MACHINE-ibm-solaris2$SUN_REL + ;; + sun4H:SunOS:5.*:*) + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=sparc-hal-solaris2$SUN_REL + ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=sparc-sun-solaris2$SUN_REL + ;; + i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) + GUESS=i386-pc-auroraux$UNAME_RELEASE + ;; + i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) + set_cc_for_build + SUN_ARCH=i386 + # If there is a compiler, see if it is configured for 64-bit objects. + # Note that the Sun cc does not turn __LP64__ into 1 like gcc does. + # This test works for both compilers. + if test "$CC_FOR_BUILD" != no_compiler_found; then + if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -m64 -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + SUN_ARCH=x86_64 + fi + fi + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=$SUN_ARCH-pc-solaris2$SUN_REL + ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=sparc-sun-solaris3$SUN_REL + ;; + sun4*:SunOS:*:*) + case `/usr/bin/arch -k` in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like '4.1.3-JL'. + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/'` + GUESS=sparc-sun-sunos$SUN_REL + ;; + sun3*:SunOS:*:*) + GUESS=m68k-sun-sunos$UNAME_RELEASE + ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3 + case `/bin/arch` in + sun3) + GUESS=m68k-sun-sunos$UNAME_RELEASE + ;; + sun4) + GUESS=sparc-sun-sunos$UNAME_RELEASE + ;; + esac + ;; + aushp:SunOS:*:*) + GUESS=sparc-auspex-sunos$UNAME_RELEASE + ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + GUESS=m68k-atari-mint$UNAME_RELEASE + ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + GUESS=m68k-atari-mint$UNAME_RELEASE + ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + GUESS=m68k-atari-mint$UNAME_RELEASE + ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + GUESS=m68k-milan-mint$UNAME_RELEASE + ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + GUESS=m68k-hades-mint$UNAME_RELEASE + ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + GUESS=m68k-unknown-mint$UNAME_RELEASE + ;; + m68k:machten:*:*) + GUESS=m68k-apple-machten$UNAME_RELEASE + ;; + powerpc:machten:*:*) + GUESS=powerpc-apple-machten$UNAME_RELEASE + ;; + RISC*:Mach:*:*) + GUESS=mips-dec-mach_bsd4.3 + ;; + RISC*:ULTRIX:*:*) + GUESS=mips-dec-ultrix$UNAME_RELEASE + ;; + VAX*:ULTRIX*:*:*) + GUESS=vax-dec-ultrix$UNAME_RELEASE + ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + GUESS=clipper-intergraph-clix$UNAME_RELEASE + ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + set_cc_for_build + sed 's/^ //' << EOF > "$dummy.c" +#ifdef __cplusplus +#include /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o "$dummy" "$dummy.c" && + dummyarg=`echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p'` && + SYSTEM_NAME=`"$dummy" "$dummyarg"` && + { echo "$SYSTEM_NAME"; exit; } + GUESS=mips-mips-riscos$UNAME_RELEASE + ;; + Motorola:PowerMAX_OS:*:*) + GUESS=powerpc-motorola-powermax + ;; + Motorola:*:4.3:PL8-*) + GUESS=powerpc-harris-powermax + ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + GUESS=powerpc-harris-powermax + ;; + Night_Hawk:Power_UNIX:*:*) + GUESS=powerpc-harris-powerunix + ;; + m88k:CX/UX:7*:*) + GUESS=m88k-harris-cxux7 + ;; + m88k:*:4*:R4*) + GUESS=m88k-motorola-sysv4 + ;; + m88k:*:3*:R3*) + GUESS=m88k-motorola-sysv3 + ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if test "$UNAME_PROCESSOR" = mc88100 || test "$UNAME_PROCESSOR" = mc88110 + then + if test "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx || \ + test "$TARGET_BINARY_INTERFACE"x = x + then + GUESS=m88k-dg-dgux$UNAME_RELEASE + else + GUESS=m88k-dg-dguxbcs$UNAME_RELEASE + fi + else + GUESS=i586-dg-dgux$UNAME_RELEASE + fi + ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + GUESS=m88k-dolphin-sysv3 + ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + GUESS=m88k-motorola-sysv3 + ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + GUESS=m88k-tektronix-sysv3 + ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + GUESS=m68k-tektronix-bsd + ;; + *:IRIX*:*:*) + IRIX_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/g'` + GUESS=mips-sgi-irix$IRIX_REL + ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. + GUESS=romp-ibm-aix # uname -m gives an 8 hex-code CPU id + ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + GUESS=i386-ibm-aix + ;; + ia64:AIX:*:*) + if test -x /usr/bin/oslevel ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=$UNAME_VERSION.$UNAME_RELEASE + fi + GUESS=$UNAME_MACHINE-ibm-aix$IBM_REV + ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + set_cc_for_build + sed 's/^ //' << EOF > "$dummy.c" + #include + + int + main () + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` + then + GUESS=$SYSTEM_NAME + else + GUESS=rs6000-ibm-aix3.2.5 + fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + GUESS=rs6000-ibm-aix3.2.4 + else + GUESS=rs6000-ibm-aix3.2 + fi + ;; + *:AIX:*:[4567]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if test -x /usr/bin/lslpp ; then + IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | \ + awk -F: '{ print $3 }' | sed s/[0-9]*$/0/` + else + IBM_REV=$UNAME_VERSION.$UNAME_RELEASE + fi + GUESS=$IBM_ARCH-ibm-aix$IBM_REV + ;; + *:AIX:*:*) + GUESS=rs6000-ibm-aix + ;; + ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*) + GUESS=romp-ibm-bsd4.4 + ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + GUESS=romp-ibm-bsd$UNAME_RELEASE # 4.3 with uname added to + ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + GUESS=rs6000-bull-bosx + ;; + DPX/2?00:B.O.S.:*:*) + GUESS=m68k-bull-sysv3 + ;; + 9000/[34]??:4.3bsd:1.*:*) + GUESS=m68k-hp-bsd + ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + GUESS=m68k-hp-bsd4.4 + ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'` + case $UNAME_MACHINE in + 9000/31?) HP_ARCH=m68000 ;; + 9000/[34]??) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if test -x /usr/bin/getconf; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case $sc_cpu_version in + 523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0 + 528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case $sc_kernel_bits in + 32) HP_ARCH=hppa2.0n ;; + 64) HP_ARCH=hppa2.0w ;; + '') HP_ARCH=hppa2.0 ;; # HP-UX 10.20 + esac ;; + esac + fi + if test "$HP_ARCH" = ""; then + set_cc_for_build + sed 's/^ //' << EOF > "$dummy.c" + + #define _HPUX_SOURCE + #include + #include + + int + main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=`"$dummy"` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if test "$HP_ARCH" = hppa2.0w + then + set_cc_for_build + + # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating + # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler + # generating 64-bit code. GNU and HP use different nomenclature: + # + # $ CC_FOR_BUILD=cc ./config.guess + # => hppa2.0w-hp-hpux11.23 + # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess + # => hppa64-hp-hpux11.23 + + if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | + grep -q __LP64__ + then + HP_ARCH=hppa2.0w + else + HP_ARCH=hppa64 + fi + fi + GUESS=$HP_ARCH-hp-hpux$HPUX_REV + ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'` + GUESS=ia64-hp-hpux$HPUX_REV + ;; + 3050*:HI-UX:*:*) + set_cc_for_build + sed 's/^ //' << EOF > "$dummy.c" + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. */ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` && + { echo "$SYSTEM_NAME"; exit; } + GUESS=unknown-hitachi-hiuxwe2 + ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*) + GUESS=hppa1.1-hp-bsd + ;; + 9000/8??:4.3bsd:*:*) + GUESS=hppa1.0-hp-bsd + ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + GUESS=hppa1.0-hp-mpeix + ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*) + GUESS=hppa1.1-hp-osf + ;; + hp8??:OSF1:*:*) + GUESS=hppa1.0-hp-osf + ;; + i*86:OSF1:*:*) + if test -x /usr/sbin/sysversion ; then + GUESS=$UNAME_MACHINE-unknown-osf1mk + else + GUESS=$UNAME_MACHINE-unknown-osf1 + fi + ;; + parisc*:Lites*:*:*) + GUESS=hppa1.1-hp-lites + ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + GUESS=c1-convex-bsd + ;; + C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + GUESS=c34-convex-bsd + ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + GUESS=c38-convex-bsd + ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + GUESS=c4-convex-bsd + ;; + CRAY*Y-MP:*:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=ymp-cray-unicos$CRAY_REL + ;; + CRAY*[A-Z]90:*:*:*) + echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*TS:*:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=t90-cray-unicos$CRAY_REL + ;; + CRAY*T3E:*:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=alphaev5-cray-unicosmk$CRAY_REL + ;; + CRAY*SV1:*:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=sv1-cray-unicos$CRAY_REL + ;; + *:UNICOS/mp:*:*) + CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'` + GUESS=craynv-cray-unicosmp$CRAY_REL + ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz` + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo "$UNAME_RELEASE" | sed -e 's/ /_/'` + GUESS=${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL} + ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'` + FUJITSU_REL=`echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'` + GUESS=sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL} + ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + GUESS=$UNAME_MACHINE-pc-bsdi$UNAME_RELEASE + ;; + sparc*:BSD/OS:*:*) + GUESS=sparc-unknown-bsdi$UNAME_RELEASE + ;; + *:BSD/OS:*:*) + GUESS=$UNAME_MACHINE-unknown-bsdi$UNAME_RELEASE + ;; + arm:FreeBSD:*:*) + UNAME_PROCESSOR=`uname -p` + set_cc_for_build + if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_PCS_VFP + then + FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabi + else + FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabihf + fi + ;; + *:FreeBSD:*:*) + UNAME_PROCESSOR=`uname -p` + case $UNAME_PROCESSOR in + amd64) + UNAME_PROCESSOR=x86_64 ;; + i386) + UNAME_PROCESSOR=i586 ;; + esac + FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL + ;; + i*:CYGWIN*:*) + GUESS=$UNAME_MACHINE-pc-cygwin + ;; + *:MINGW64*:*) + GUESS=$UNAME_MACHINE-pc-mingw64 + ;; + *:MINGW*:*) + GUESS=$UNAME_MACHINE-pc-mingw32 + ;; + *:MSYS*:*) + GUESS=$UNAME_MACHINE-pc-msys + ;; + i*:PW*:*) + GUESS=$UNAME_MACHINE-pc-pw32 + ;; + *:SerenityOS:*:*) + GUESS=$UNAME_MACHINE-pc-serenity + ;; + *:Interix*:*) + case $UNAME_MACHINE in + x86) + GUESS=i586-pc-interix$UNAME_RELEASE + ;; + authenticamd | genuineintel | EM64T) + GUESS=x86_64-unknown-interix$UNAME_RELEASE + ;; + IA64) + GUESS=ia64-unknown-interix$UNAME_RELEASE + ;; + esac ;; + i*:UWIN*:*) + GUESS=$UNAME_MACHINE-pc-uwin + ;; + amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) + GUESS=x86_64-pc-cygwin + ;; + prep*:SunOS:5.*:*) + SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'` + GUESS=powerpcle-unknown-solaris2$SUN_REL + ;; + *:GNU:*:*) + # the GNU system + GNU_ARCH=`echo "$UNAME_MACHINE" | sed -e 's,[-/].*$,,'` + GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's,/.*$,,'` + GUESS=$GNU_ARCH-unknown-$LIBC$GNU_REL + ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + GNU_SYS=`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"` + GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_MACHINE-unknown-$GNU_SYS$GNU_REL-$LIBC + ;; + x86_64:[Mm]anagarm:*:*|i?86:[Mm]anagarm:*:*) + GUESS="$UNAME_MACHINE-pc-managarm-mlibc" + ;; + *:[Mm]anagarm:*:*) + GUESS="$UNAME_MACHINE-unknown-managarm-mlibc" + ;; + *:Minix:*:*) + GUESS=$UNAME_MACHINE-unknown-minix + ;; + aarch64:Linux:*:*) + set_cc_for_build + CPU=$UNAME_MACHINE + LIBCABI=$LIBC + if test "$CC_FOR_BUILD" != no_compiler_found; then + ABI=64 + sed 's/^ //' << EOF > "$dummy.c" + #ifdef __ARM_EABI__ + #ifdef __ARM_PCS_VFP + ABI=eabihf + #else + ABI=eabi + #endif + #endif +EOF + cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'` + eval "$cc_set_abi" + case $ABI in + eabi | eabihf) CPU=armv8l; LIBCABI=$LIBC$ABI ;; + esac + fi + GUESS=$CPU-unknown-linux-$LIBCABI + ;; + aarch64_be:Linux:*:*) + UNAME_MACHINE=aarch64_be + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep -q ld.so.1 + if test "$?" = 0 ; then LIBC=gnulibc1 ; fi + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + arc:Linux:*:* | arceb:Linux:*:* | arc32:Linux:*:* | arc64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + arm*:Linux:*:*) + set_cc_for_build + if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_EABI__ + then + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + else + if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_PCS_VFP + then + GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabi + else + GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabihf + fi + fi + ;; + avr32*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + cris:Linux:*:*) + GUESS=$UNAME_MACHINE-axis-linux-$LIBC + ;; + crisv32:Linux:*:*) + GUESS=$UNAME_MACHINE-axis-linux-$LIBC + ;; + e2k:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + frv:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + hexagon:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + i*86:Linux:*:*) + GUESS=$UNAME_MACHINE-pc-linux-$LIBC + ;; + ia64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + k1om:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + kvx:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + kvx:cos:*:*) + GUESS=$UNAME_MACHINE-unknown-cos + ;; + kvx:mbr:*:*) + GUESS=$UNAME_MACHINE-unknown-mbr + ;; + loongarch32:Linux:*:* | loongarch64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + m32r*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + m68*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + mips:Linux:*:* | mips64:Linux:*:*) + set_cc_for_build + IS_GLIBC=0 + test x"${LIBC}" = xgnu && IS_GLIBC=1 + sed 's/^ //' << EOF > "$dummy.c" + #undef CPU + #undef mips + #undef mipsel + #undef mips64 + #undef mips64el + #if ${IS_GLIBC} && defined(_ABI64) + LIBCABI=gnuabi64 + #else + #if ${IS_GLIBC} && defined(_ABIN32) + LIBCABI=gnuabin32 + #else + LIBCABI=${LIBC} + #endif + #endif + + #if ${IS_GLIBC} && defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6 + CPU=mipsisa64r6 + #else + #if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6 + CPU=mipsisa32r6 + #else + #if defined(__mips64) + CPU=mips64 + #else + CPU=mips + #endif + #endif + #endif + + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + MIPS_ENDIAN=el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + MIPS_ENDIAN= + #else + MIPS_ENDIAN= + #endif + #endif +EOF + cc_set_vars=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI'` + eval "$cc_set_vars" + test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; } + ;; + mips64el:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + openrisc*:Linux:*:*) + GUESS=or1k-unknown-linux-$LIBC + ;; + or32:Linux:*:* | or1k*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + padre:Linux:*:*) + GUESS=sparc-unknown-linux-$LIBC + ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + GUESS=hppa64-unknown-linux-$LIBC + ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) GUESS=hppa1.1-unknown-linux-$LIBC ;; + PA8*) GUESS=hppa2.0-unknown-linux-$LIBC ;; + *) GUESS=hppa-unknown-linux-$LIBC ;; + esac + ;; + ppc64:Linux:*:*) + GUESS=powerpc64-unknown-linux-$LIBC + ;; + ppc:Linux:*:*) + GUESS=powerpc-unknown-linux-$LIBC + ;; + ppc64le:Linux:*:*) + GUESS=powerpc64le-unknown-linux-$LIBC + ;; + ppcle:Linux:*:*) + GUESS=powerpcle-unknown-linux-$LIBC + ;; + riscv32:Linux:*:* | riscv32be:Linux:*:* | riscv64:Linux:*:* | riscv64be:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + s390:Linux:*:* | s390x:Linux:*:*) + GUESS=$UNAME_MACHINE-ibm-linux-$LIBC + ;; + sh64*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + sh*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + tile*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + vax:Linux:*:*) + GUESS=$UNAME_MACHINE-dec-linux-$LIBC + ;; + x86_64:Linux:*:*) + set_cc_for_build + CPU=$UNAME_MACHINE + LIBCABI=$LIBC + if test "$CC_FOR_BUILD" != no_compiler_found; then + ABI=64 + sed 's/^ //' << EOF > "$dummy.c" + #ifdef __i386__ + ABI=x86 + #else + #ifdef __ILP32__ + ABI=x32 + #endif + #endif +EOF + cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'` + eval "$cc_set_abi" + case $ABI in + x86) CPU=i686 ;; + x32) LIBCABI=${LIBC}x32 ;; + esac + fi + GUESS=$CPU-pc-linux-$LIBCABI + ;; + xtensa*:Linux:*:*) + GUESS=$UNAME_MACHINE-unknown-linux-$LIBC + ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + GUESS=i386-sequent-sysv4 + ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. + GUESS=$UNAME_MACHINE-pc-sysv4.2uw$UNAME_VERSION + ;; + i*86:OS/2:*:*) + # If we were able to find 'uname', then EMX Unix compatibility + # is probably installed. + GUESS=$UNAME_MACHINE-pc-os2-emx + ;; + i*86:XTS-300:*:STOP) + GUESS=$UNAME_MACHINE-unknown-stop + ;; + i*86:atheos:*:*) + GUESS=$UNAME_MACHINE-unknown-atheos + ;; + i*86:syllable:*:*) + GUESS=$UNAME_MACHINE-pc-syllable + ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) + GUESS=i386-unknown-lynxos$UNAME_RELEASE + ;; + i*86:*DOS:*:*) + GUESS=$UNAME_MACHINE-pc-msdosdjgpp + ;; + i*86:*:4.*:*) + UNAME_REL=`echo "$UNAME_RELEASE" | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + GUESS=$UNAME_MACHINE-univel-sysv$UNAME_REL + else + GUESS=$UNAME_MACHINE-pc-sysv$UNAME_REL + fi + ;; + i*86:*:5:[678]*) + # UnixWare 7.x, OpenUNIX and OpenServer 6. + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + GUESS=$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + GUESS=$UNAME_MACHINE-pc-sco$UNAME_REL + else + GUESS=$UNAME_MACHINE-pc-sysv32 + fi + ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i586. + # Note: whatever this is, it MUST be the same as what config.sub + # prints for the "djgpp" host, or else GDB configure will decide that + # this is a cross-build. + GUESS=i586-pc-msdosdjgpp + ;; + Intel:Mach:3*:*) + GUESS=i386-pc-mach3 + ;; + paragon:*:*:*) + GUESS=i860-intel-osf1 + ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + GUESS=i860-stardent-sysv$UNAME_RELEASE # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + GUESS=i860-unknown-sysv$UNAME_RELEASE # Unknown i860-SVR4 + fi + ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + GUESS=m68010-convergent-sysv + ;; + mc68k:UNIX:SYSTEM5:3.51m) + GUESS=m68k-convergent-sysv + ;; + M680?0:D-NIX:5.3:*) + GUESS=m68k-diab-dnix + ;; + M68*:*:R3V[5678]*:*) + test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + NCR*:*:4.2:* | MPRAS*:*:4.2:*) + OS_REL='.3' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3"$OS_REL"; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } + /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \ + && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + GUESS=m68k-unknown-lynxos$UNAME_RELEASE + ;; + mc68030:UNIX_System_V:4.*:*) + GUESS=m68k-atari-sysv4 + ;; + TSUNAMI:LynxOS:2.*:*) + GUESS=sparc-unknown-lynxos$UNAME_RELEASE + ;; + rs6000:LynxOS:2.*:*) + GUESS=rs6000-unknown-lynxos$UNAME_RELEASE + ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) + GUESS=powerpc-unknown-lynxos$UNAME_RELEASE + ;; + SM[BE]S:UNIX_SV:*:*) + GUESS=mips-dde-sysv$UNAME_RELEASE + ;; + RM*:ReliantUNIX-*:*:*) + GUESS=mips-sni-sysv4 + ;; + RM*:SINIX-*:*:*) + GUESS=mips-sni-sysv4 + ;; + *:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + GUESS=$UNAME_MACHINE-sni-sysv4 + else + GUESS=ns32k-sni-sysv + fi + ;; + PENTIUM:*:4.0*:*) # Unisys 'ClearPath HMP IX 4000' SVR4/MP effort + # says + GUESS=i586-unisys-sysv4 + ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes . + # How about differentiating between stratus architectures? -djm + GUESS=hppa1.1-stratus-sysv4 + ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + GUESS=i860-stratus-sysv4 + ;; + i*86:VOS:*:*) + # From Paul.Green@stratus.com. + GUESS=$UNAME_MACHINE-stratus-vos + ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + GUESS=hppa1.1-stratus-vos + ;; + mc68*:A/UX:*:*) + GUESS=m68k-apple-aux$UNAME_RELEASE + ;; + news*:NEWS-OS:6*:*) + GUESS=mips-sony-newsos6 + ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if test -d /usr/nec; then + GUESS=mips-nec-sysv$UNAME_RELEASE + else + GUESS=mips-unknown-sysv$UNAME_RELEASE + fi + ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + GUESS=powerpc-be-beos + ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + GUESS=powerpc-apple-beos + ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. + GUESS=i586-pc-beos + ;; + BePC:Haiku:*:*) # Haiku running on Intel PC compatible. + GUESS=i586-pc-haiku + ;; + ppc:Haiku:*:*) # Haiku running on Apple PowerPC + GUESS=powerpc-apple-haiku + ;; + *:Haiku:*:*) # Haiku modern gcc (not bound by BeOS compat) + GUESS=$UNAME_MACHINE-unknown-haiku + ;; + SX-4:SUPER-UX:*:*) + GUESS=sx4-nec-superux$UNAME_RELEASE + ;; + SX-5:SUPER-UX:*:*) + GUESS=sx5-nec-superux$UNAME_RELEASE + ;; + SX-6:SUPER-UX:*:*) + GUESS=sx6-nec-superux$UNAME_RELEASE + ;; + SX-7:SUPER-UX:*:*) + GUESS=sx7-nec-superux$UNAME_RELEASE + ;; + SX-8:SUPER-UX:*:*) + GUESS=sx8-nec-superux$UNAME_RELEASE + ;; + SX-8R:SUPER-UX:*:*) + GUESS=sx8r-nec-superux$UNAME_RELEASE + ;; + SX-ACE:SUPER-UX:*:*) + GUESS=sxace-nec-superux$UNAME_RELEASE + ;; + Power*:Rhapsody:*:*) + GUESS=powerpc-apple-rhapsody$UNAME_RELEASE + ;; + *:Rhapsody:*:*) + GUESS=$UNAME_MACHINE-apple-rhapsody$UNAME_RELEASE + ;; + arm64:Darwin:*:*) + GUESS=aarch64-apple-darwin$UNAME_RELEASE + ;; + *:Darwin:*:*) + UNAME_PROCESSOR=`uname -p` + case $UNAME_PROCESSOR in + unknown) UNAME_PROCESSOR=powerpc ;; + esac + if command -v xcode-select > /dev/null 2> /dev/null && \ + ! xcode-select --print-path > /dev/null 2> /dev/null ; then + # Avoid executing cc if there is no toolchain installed as + # cc will be a stub that puts up a graphical alert + # prompting the user to install developer tools. + CC_FOR_BUILD=no_compiler_found + else + set_cc_for_build + fi + if test "$CC_FOR_BUILD" != no_compiler_found; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + case $UNAME_PROCESSOR in + i386) UNAME_PROCESSOR=x86_64 ;; + powerpc) UNAME_PROCESSOR=powerpc64 ;; + esac + fi + # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc + if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_PPC >/dev/null + then + UNAME_PROCESSOR=powerpc + fi + elif test "$UNAME_PROCESSOR" = i386 ; then + # uname -m returns i386 or x86_64 + UNAME_PROCESSOR=$UNAME_MACHINE + fi + GUESS=$UNAME_PROCESSOR-apple-darwin$UNAME_RELEASE + ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = x86; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + GUESS=$UNAME_PROCESSOR-$UNAME_MACHINE-nto-qnx$UNAME_RELEASE + ;; + *:QNX:*:4*) + GUESS=i386-pc-qnx + ;; + NEO-*:NONSTOP_KERNEL:*:*) + GUESS=neo-tandem-nsk$UNAME_RELEASE + ;; + NSE-*:NONSTOP_KERNEL:*:*) + GUESS=nse-tandem-nsk$UNAME_RELEASE + ;; + NSR-*:NONSTOP_KERNEL:*:*) + GUESS=nsr-tandem-nsk$UNAME_RELEASE + ;; + NSV-*:NONSTOP_KERNEL:*:*) + GUESS=nsv-tandem-nsk$UNAME_RELEASE + ;; + NSX-*:NONSTOP_KERNEL:*:*) + GUESS=nsx-tandem-nsk$UNAME_RELEASE + ;; + *:NonStop-UX:*:*) + GUESS=mips-compaq-nonstopux + ;; + BS2000:POSIX*:*:*) + GUESS=bs2000-siemens-sysv + ;; + DS/*:UNIX_System_V:*:*) + GUESS=$UNAME_MACHINE-$UNAME_SYSTEM-$UNAME_RELEASE + ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. + if test "${cputype-}" = 386; then + UNAME_MACHINE=i386 + elif test "x${cputype-}" != x; then + UNAME_MACHINE=$cputype + fi + GUESS=$UNAME_MACHINE-unknown-plan9 + ;; + *:TOPS-10:*:*) + GUESS=pdp10-unknown-tops10 + ;; + *:TENEX:*:*) + GUESS=pdp10-unknown-tenex + ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + GUESS=pdp10-dec-tops20 + ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + GUESS=pdp10-xkl-tops20 + ;; + *:TOPS-20:*:*) + GUESS=pdp10-unknown-tops20 + ;; + *:ITS:*:*) + GUESS=pdp10-unknown-its + ;; + SEI:*:*:SEIUX) + GUESS=mips-sei-seiux$UNAME_RELEASE + ;; + *:DragonFly:*:*) + DRAGONFLY_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'` + GUESS=$UNAME_MACHINE-unknown-dragonfly$DRAGONFLY_REL + ;; + *:*VMS:*:*) + UNAME_MACHINE=`(uname -p) 2>/dev/null` + case $UNAME_MACHINE in + A*) GUESS=alpha-dec-vms ;; + I*) GUESS=ia64-dec-vms ;; + V*) GUESS=vax-dec-vms ;; + esac ;; + *:XENIX:*:SysV) + GUESS=i386-pc-xenix + ;; + i*86:skyos:*:*) + SKYOS_REL=`echo "$UNAME_RELEASE" | sed -e 's/ .*$//'` + GUESS=$UNAME_MACHINE-pc-skyos$SKYOS_REL + ;; + i*86:rdos:*:*) + GUESS=$UNAME_MACHINE-pc-rdos + ;; + i*86:Fiwix:*:*) + GUESS=$UNAME_MACHINE-pc-fiwix + ;; + *:AROS:*:*) + GUESS=$UNAME_MACHINE-unknown-aros + ;; + x86_64:VMkernel:*:*) + GUESS=$UNAME_MACHINE-unknown-esx + ;; + amd64:Isilon\ OneFS:*:*) + GUESS=x86_64-unknown-onefs + ;; + *:Unleashed:*:*) + GUESS=$UNAME_MACHINE-unknown-unleashed$UNAME_RELEASE + ;; + *:Ironclad:*:*) + GUESS=$UNAME_MACHINE-unknown-ironclad + ;; +esac + +# Do we have a guess based on uname results? +if test "x$GUESS" != x; then + echo "$GUESS" + exit +fi + +# No uname command or uname output not recognized. +set_cc_for_build +cat > "$dummy.c" < +#include +#endif +#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__) +#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__) +#include +#if defined(_SIZE_T_) || defined(SIGLOST) +#include +#endif +#endif +#endif +int +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... */ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); +#endif + +#if defined (vax) +#if !defined (ultrix) +#include +#if defined (BSD) +#if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +#else +#if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +#else + printf ("vax-dec-bsd\n"); exit (0); +#endif +#endif +#else + printf ("vax-dec-bsd\n"); exit (0); +#endif +#else +#if defined(_SIZE_T_) || defined(SIGLOST) + struct utsname un; + uname (&un); + printf ("vax-dec-ultrix%s\n", un.release); exit (0); +#else + printf ("vax-dec-ultrix\n"); exit (0); +#endif +#endif +#endif +#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__) +#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__) +#if defined(_SIZE_T_) || defined(SIGLOST) + struct utsname *un; + uname (&un); + printf ("mips-dec-ultrix%s\n", un.release); exit (0); +#else + printf ("mips-dec-ultrix\n"); exit (0); +#endif +#endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=`"$dummy"` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. +test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; } + +echo "$0: unable to guess system type" >&2 + +case $UNAME_MACHINE:$UNAME_SYSTEM in + mips:Linux | mips64:Linux) + # If we got here on MIPS GNU/Linux, output extra information. + cat >&2 <&2 <&2 </dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = "$UNAME_MACHINE" +UNAME_RELEASE = "$UNAME_RELEASE" +UNAME_SYSTEM = "$UNAME_SYSTEM" +UNAME_VERSION = "$UNAME_VERSION" +EOF +fi + +exit 1 + +# Local variables: +# eval: (add-hook 'before-save-hook 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/thirdparty/patches/config.sub b/thirdparty/patches/config.sub new file mode 100644 index 0000000000..4aaae46f6f --- /dev/null +++ b/thirdparty/patches/config.sub @@ -0,0 +1,2354 @@ +#! /bin/sh +# Configuration validation subroutine script. +# Copyright 1992-2024 Free Software Foundation, Inc. + +# shellcheck disable=SC2006,SC2268,SC2162 # see below for rationale + +timestamp='2024-05-27' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, see . +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). + + +# Please send patches to . +# +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# You can get the latest version of this script from: +# https://git.savannah.gnu.org/cgit/config.git/plain/config.sub + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software. +# Each package is responsible for reporting which valid configurations +# it does not support. The user should be able to distinguish +# a failure to support a valid configuration from a meaningless +# configuration. + +# The goal of this file is to map all the various variations of a given +# machine specification into a single specification in the form: +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or in some cases, the newer four-part form: +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# It is wrong to echo any other type of specification. + +# The "shellcheck disable" line above the timestamp inhibits complaints +# about features and limitations of the classic Bourne shell that were +# superseded or lifted in POSIX. However, this script identifies a wide +# variety of pre-POSIX systems that do not have POSIX shells at all, and +# even some reasonably current systems (Solaris 10 as case-in-point) still +# have a pre-POSIX /bin/sh. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS + +Canonicalize a configuration name. + +Options: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.sub ($timestamp) + +Copyright 1992-2024 Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try '$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + + *local*) + # First pass through any local machine types. + echo "$1" + exit ;; + + * ) + break ;; + esac +done + +case $# in + 0) echo "$me: missing argument$help" >&2 + exit 1;; + 1) ;; + *) echo "$me: too many arguments$help" >&2 + exit 1;; +esac + +# Split fields of configuration type +saved_IFS=$IFS +IFS="-" read field1 field2 field3 field4 <&2 + exit 1 + ;; + *-*-*-*) + basic_machine=$field1-$field2 + basic_os=$field3-$field4 + ;; + *-*-*) + # Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two + # parts + maybe_os=$field2-$field3 + case $maybe_os in + cloudabi*-eabi* \ + | kfreebsd*-gnu* \ + | knetbsd*-gnu* \ + | kopensolaris*-gnu* \ + | linux-* \ + | managarm-* \ + | netbsd*-eabi* \ + | netbsd*-gnu* \ + | nto-qnx* \ + | os2-emx* \ + | rtmk-nova* \ + | storm-chaos* \ + | uclinux-gnu* \ + | uclinux-uclibc* \ + | windows-* ) + basic_machine=$field1 + basic_os=$maybe_os + ;; + android-linux) + basic_machine=$field1-unknown + basic_os=linux-android + ;; + *) + basic_machine=$field1-$field2 + basic_os=$field3 + ;; + esac + ;; + *-*) + case $field1-$field2 in + # Shorthands that happen to contain a single dash + convex-c[12] | convex-c3[248]) + basic_machine=$field2-convex + basic_os= + ;; + decstation-3100) + basic_machine=mips-dec + basic_os= + ;; + *-*) + # Second component is usually, but not always the OS + case $field2 in + # Do not treat sunos as a manufacturer + sun*os*) + basic_machine=$field1 + basic_os=$field2 + ;; + # Manufacturers + 3100* \ + | 32* \ + | 3300* \ + | 3600* \ + | 7300* \ + | acorn \ + | altos* \ + | apollo \ + | apple \ + | atari \ + | att* \ + | axis \ + | be \ + | bull \ + | cbm \ + | ccur \ + | cisco \ + | commodore \ + | convergent* \ + | convex* \ + | cray \ + | crds \ + | dec* \ + | delta* \ + | dg \ + | digital \ + | dolphin \ + | encore* \ + | gould \ + | harris \ + | highlevel \ + | hitachi* \ + | hp \ + | ibm* \ + | intergraph \ + | isi* \ + | knuth \ + | masscomp \ + | microblaze* \ + | mips* \ + | motorola* \ + | ncr* \ + | news \ + | next \ + | ns \ + | oki \ + | omron* \ + | pc533* \ + | rebel \ + | rom68k \ + | rombug \ + | semi \ + | sequent* \ + | siemens \ + | sgi* \ + | siemens \ + | sim \ + | sni \ + | sony* \ + | stratus \ + | sun \ + | sun[234]* \ + | tektronix \ + | tti* \ + | ultra \ + | unicom* \ + | wec \ + | winbond \ + | wrs) + basic_machine=$field1-$field2 + basic_os= + ;; + zephyr*) + basic_machine=$field1-unknown + basic_os=$field2 + ;; + *) + basic_machine=$field1 + basic_os=$field2 + ;; + esac + ;; + esac + ;; + *) + # Convert single-component short-hands not valid as part of + # multi-component configurations. + case $field1 in + 386bsd) + basic_machine=i386-pc + basic_os=bsd + ;; + a29khif) + basic_machine=a29k-amd + basic_os=udi + ;; + adobe68k) + basic_machine=m68010-adobe + basic_os=scout + ;; + alliant) + basic_machine=fx80-alliant + basic_os= + ;; + altos | altos3068) + basic_machine=m68k-altos + basic_os= + ;; + am29k) + basic_machine=a29k-none + basic_os=bsd + ;; + amdahl) + basic_machine=580-amdahl + basic_os=sysv + ;; + amiga) + basic_machine=m68k-unknown + basic_os= + ;; + amigaos | amigados) + basic_machine=m68k-unknown + basic_os=amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + basic_os=sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + basic_os=sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + basic_os=bsd + ;; + aros) + basic_machine=i386-pc + basic_os=aros + ;; + aux) + basic_machine=m68k-apple + basic_os=aux + ;; + balance) + basic_machine=ns32k-sequent + basic_os=dynix + ;; + blackfin) + basic_machine=bfin-unknown + basic_os=linux + ;; + cegcc) + basic_machine=arm-unknown + basic_os=cegcc + ;; + cray) + basic_machine=j90-cray + basic_os=unicos + ;; + crds | unos) + basic_machine=m68k-crds + basic_os= + ;; + da30) + basic_machine=m68k-da30 + basic_os= + ;; + decstation | pmax | pmin | dec3100 | decstatn) + basic_machine=mips-dec + basic_os= + ;; + delta88) + basic_machine=m88k-motorola + basic_os=sysv3 + ;; + dicos) + basic_machine=i686-pc + basic_os=dicos + ;; + djgpp) + basic_machine=i586-pc + basic_os=msdosdjgpp + ;; + ebmon29k) + basic_machine=a29k-amd + basic_os=ebmon + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + basic_os=ose + ;; + gmicro) + basic_machine=tron-gmicro + basic_os=sysv + ;; + go32) + basic_machine=i386-pc + basic_os=go32 + ;; + h8300hms) + basic_machine=h8300-hitachi + basic_os=hms + ;; + h8300xray) + basic_machine=h8300-hitachi + basic_os=xray + ;; + h8500hms) + basic_machine=h8500-hitachi + basic_os=hms + ;; + harris) + basic_machine=m88k-harris + basic_os=sysv3 + ;; + hp300 | hp300hpux) + basic_machine=m68k-hp + basic_os=hpux + ;; + hp300bsd) + basic_machine=m68k-hp + basic_os=bsd + ;; + hppaosf) + basic_machine=hppa1.1-hp + basic_os=osf + ;; + hppro) + basic_machine=hppa1.1-hp + basic_os=proelf + ;; + i386mach) + basic_machine=i386-mach + basic_os=mach + ;; + isi68 | isi) + basic_machine=m68k-isi + basic_os=sysv + ;; + m68knommu) + basic_machine=m68k-unknown + basic_os=linux + ;; + magnum | m3230) + basic_machine=mips-mips + basic_os=sysv + ;; + merlin) + basic_machine=ns32k-utek + basic_os=sysv + ;; + mingw64) + basic_machine=x86_64-pc + basic_os=mingw64 + ;; + mingw32) + basic_machine=i686-pc + basic_os=mingw32 + ;; + mingw32ce) + basic_machine=arm-unknown + basic_os=mingw32ce + ;; + monitor) + basic_machine=m68k-rom68k + basic_os=coff + ;; + morphos) + basic_machine=powerpc-unknown + basic_os=morphos + ;; + moxiebox) + basic_machine=moxie-unknown + basic_os=moxiebox + ;; + msdos) + basic_machine=i386-pc + basic_os=msdos + ;; + msys) + basic_machine=i686-pc + basic_os=msys + ;; + mvs) + basic_machine=i370-ibm + basic_os=mvs + ;; + nacl) + basic_machine=le32-unknown + basic_os=nacl + ;; + ncr3000) + basic_machine=i486-ncr + basic_os=sysv4 + ;; + netbsd386) + basic_machine=i386-pc + basic_os=netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + basic_os=linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + basic_os=newsos + ;; + news1000) + basic_machine=m68030-sony + basic_os=newsos + ;; + necv70) + basic_machine=v70-nec + basic_os=sysv + ;; + nh3000) + basic_machine=m68k-harris + basic_os=cxux + ;; + nh[45]000) + basic_machine=m88k-harris + basic_os=cxux + ;; + nindy960) + basic_machine=i960-intel + basic_os=nindy + ;; + mon960) + basic_machine=i960-intel + basic_os=mon960 + ;; + nonstopux) + basic_machine=mips-compaq + basic_os=nonstopux + ;; + os400) + basic_machine=powerpc-ibm + basic_os=os400 + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + basic_os=ose + ;; + os68k) + basic_machine=m68k-none + basic_os=os68k + ;; + paragon) + basic_machine=i860-intel + basic_os=osf + ;; + parisc) + basic_machine=hppa-unknown + basic_os=linux + ;; + psp) + basic_machine=mipsallegrexel-sony + basic_os=psp + ;; + pw32) + basic_machine=i586-unknown + basic_os=pw32 + ;; + rdos | rdos64) + basic_machine=x86_64-pc + basic_os=rdos + ;; + rdos32) + basic_machine=i386-pc + basic_os=rdos + ;; + rom68k) + basic_machine=m68k-rom68k + basic_os=coff + ;; + sa29200) + basic_machine=a29k-amd + basic_os=udi + ;; + sei) + basic_machine=mips-sei + basic_os=seiux + ;; + sequent) + basic_machine=i386-sequent + basic_os= + ;; + sps7) + basic_machine=m68k-bull + basic_os=sysv2 + ;; + st2000) + basic_machine=m68k-tandem + basic_os= + ;; + stratus) + basic_machine=i860-stratus + basic_os=sysv4 + ;; + sun2) + basic_machine=m68000-sun + basic_os= + ;; + sun2os3) + basic_machine=m68000-sun + basic_os=sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + basic_os=sunos4 + ;; + sun3) + basic_machine=m68k-sun + basic_os= + ;; + sun3os3) + basic_machine=m68k-sun + basic_os=sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + basic_os=sunos4 + ;; + sun4) + basic_machine=sparc-sun + basic_os= + ;; + sun4os3) + basic_machine=sparc-sun + basic_os=sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + basic_os=sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + basic_os=solaris2 + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + basic_os= + ;; + sv1) + basic_machine=sv1-cray + basic_os=unicos + ;; + symmetry) + basic_machine=i386-sequent + basic_os=dynix + ;; + t3e) + basic_machine=alphaev5-cray + basic_os=unicos + ;; + t90) + basic_machine=t90-cray + basic_os=unicos + ;; + toad1) + basic_machine=pdp10-xkl + basic_os=tops20 + ;; + tpf) + basic_machine=s390x-ibm + basic_os=tpf + ;; + udi29k) + basic_machine=a29k-amd + basic_os=udi + ;; + ultra3) + basic_machine=a29k-nyu + basic_os=sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + basic_os=none + ;; + vaxv) + basic_machine=vax-dec + basic_os=sysv + ;; + vms) + basic_machine=vax-dec + basic_os=vms + ;; + vsta) + basic_machine=i386-pc + basic_os=vsta + ;; + vxworks960) + basic_machine=i960-wrs + basic_os=vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + basic_os=vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + basic_os=vxworks + ;; + xbox) + basic_machine=i686-pc + basic_os=mingw32 + ;; + ymp) + basic_machine=ymp-cray + basic_os=unicos + ;; + *) + basic_machine=$1 + basic_os= + ;; + esac + ;; +esac + +# Decode 1-component or ad-hoc basic machines +case $basic_machine in + # Here we handle the default manufacturer of certain CPU types. It is in + # some cases the only manufacturer, in others, it is the most popular. + w89k) + cpu=hppa1.1 + vendor=winbond + ;; + op50n) + cpu=hppa1.1 + vendor=oki + ;; + op60c) + cpu=hppa1.1 + vendor=oki + ;; + ibm*) + cpu=i370 + vendor=ibm + ;; + orion105) + cpu=clipper + vendor=highlevel + ;; + mac | mpw | mac-mpw) + cpu=m68k + vendor=apple + ;; + pmac | pmac-mpw) + cpu=powerpc + vendor=apple + ;; + + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. + 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + cpu=m68000 + vendor=att + ;; + 3b*) + cpu=we32k + vendor=att + ;; + bluegene*) + cpu=powerpc + vendor=ibm + basic_os=cnk + ;; + decsystem10* | dec10*) + cpu=pdp10 + vendor=dec + basic_os=tops10 + ;; + decsystem20* | dec20*) + cpu=pdp10 + vendor=dec + basic_os=tops20 + ;; + delta | 3300 | delta-motorola | 3300-motorola | motorola-delta | motorola-3300) + cpu=m68k + vendor=motorola + ;; + # This used to be dpx2*, but that gets the RS6000-based + # DPX/20 and the x86-based DPX/2-100 wrong. See + # https://oldskool.silicium.org/stations/bull_dpx20.htm + # https://www.feb-patrimoine.com/english/bull_dpx2.htm + # https://www.feb-patrimoine.com/english/unix_and_bull.htm + dpx2 | dpx2[23]00 | dpx2[23]xx) + cpu=m68k + vendor=bull + ;; + dpx2100 | dpx21xx) + cpu=i386 + vendor=bull + ;; + dpx20) + cpu=rs6000 + vendor=bull + ;; + encore | umax | mmax) + cpu=ns32k + vendor=encore + ;; + elxsi) + cpu=elxsi + vendor=elxsi + basic_os=${basic_os:-bsd} + ;; + fx2800) + cpu=i860 + vendor=alliant + ;; + genix) + cpu=ns32k + vendor=ns + ;; + h3050r* | hiux*) + cpu=hppa1.1 + vendor=hitachi + basic_os=hiuxwe2 + ;; + hp3k9[0-9][0-9] | hp9[0-9][0-9]) + cpu=hppa1.0 + vendor=hp + ;; + hp9k2[0-9][0-9] | hp9k31[0-9]) + cpu=m68000 + vendor=hp + ;; + hp9k3[2-9][0-9]) + cpu=m68k + vendor=hp + ;; + hp9k6[0-9][0-9] | hp6[0-9][0-9]) + cpu=hppa1.0 + vendor=hp + ;; + hp9k7[0-79][0-9] | hp7[0-79][0-9]) + cpu=hppa1.1 + vendor=hp + ;; + hp9k78[0-9] | hp78[0-9]) + # FIXME: really hppa2.0-hp + cpu=hppa1.1 + vendor=hp + ;; + hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) + # FIXME: really hppa2.0-hp + cpu=hppa1.1 + vendor=hp + ;; + hp9k8[0-9][13679] | hp8[0-9][13679]) + cpu=hppa1.1 + vendor=hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + cpu=hppa1.0 + vendor=hp + ;; + i*86v32) + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=sysv32 + ;; + i*86v4*) + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=sysv4 + ;; + i*86v) + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=sysv + ;; + i*86sol2) + cpu=`echo "$1" | sed -e 's/86.*/86/'` + vendor=pc + basic_os=solaris2 + ;; + j90 | j90-cray) + cpu=j90 + vendor=cray + basic_os=${basic_os:-unicos} + ;; + iris | iris4d) + cpu=mips + vendor=sgi + case $basic_os in + irix*) + ;; + *) + basic_os=irix4 + ;; + esac + ;; + miniframe) + cpu=m68000 + vendor=convergent + ;; + *mint | mint[0-9]* | *MiNT | *MiNT[0-9]*) + cpu=m68k + vendor=atari + basic_os=mint + ;; + news-3600 | risc-news) + cpu=mips + vendor=sony + basic_os=newsos + ;; + next | m*-next) + cpu=m68k + vendor=next + ;; + np1) + cpu=np1 + vendor=gould + ;; + op50n-* | op60c-*) + cpu=hppa1.1 + vendor=oki + basic_os=proelf + ;; + pa-hitachi) + cpu=hppa1.1 + vendor=hitachi + basic_os=hiuxwe2 + ;; + pbd) + cpu=sparc + vendor=tti + ;; + pbb) + cpu=m68k + vendor=tti + ;; + pc532) + cpu=ns32k + vendor=pc532 + ;; + pn) + cpu=pn + vendor=gould + ;; + power) + cpu=power + vendor=ibm + ;; + ps2) + cpu=i386 + vendor=ibm + ;; + rm[46]00) + cpu=mips + vendor=siemens + ;; + rtpc | rtpc-*) + cpu=romp + vendor=ibm + ;; + sde) + cpu=mipsisa32 + vendor=sde + basic_os=${basic_os:-elf} + ;; + simso-wrs) + cpu=sparclite + vendor=wrs + basic_os=vxworks + ;; + tower | tower-32) + cpu=m68k + vendor=ncr + ;; + vpp*|vx|vx-*) + cpu=f301 + vendor=fujitsu + ;; + w65) + cpu=w65 + vendor=wdc + ;; + w89k-*) + cpu=hppa1.1 + vendor=winbond + basic_os=proelf + ;; + none) + cpu=none + vendor=none + ;; + leon|leon[3-9]) + cpu=sparc + vendor=$basic_machine + ;; + leon-*|leon[3-9]-*) + cpu=sparc + vendor=`echo "$basic_machine" | sed 's/-.*//'` + ;; + + *-*) + saved_IFS=$IFS + IFS="-" read cpu vendor <&2 + exit 1 + ;; + esac + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. +case $vendor in + digital*) + vendor=dec + ;; + commodore*) + vendor=cbm + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. + +if test x"$basic_os" != x +then + +# First recognize some ad-hoc cases, or perhaps split kernel-os, or else just +# set os. +obj= +case $basic_os in + gnu/linux*) + kernel=linux + os=`echo "$basic_os" | sed -e 's|gnu/linux|gnu|'` + ;; + os2-emx) + kernel=os2 + os=`echo "$basic_os" | sed -e 's|os2-emx|emx|'` + ;; + nto-qnx*) + kernel=nto + os=`echo "$basic_os" | sed -e 's|nto-qnx|qnx|'` + ;; + *-*) + saved_IFS=$IFS + IFS="-" read kernel os <&2 + fi + ;; + *) + echo "Invalid configuration '$1': OS '$os' not recognized" 1>&2 + exit 1 + ;; +esac + +case $obj in + aout* | coff* | elf* | pe*) + ;; + '') + # empty is fine + ;; + *) + echo "Invalid configuration '$1': Machine code format '$obj' not recognized" 1>&2 + exit 1 + ;; +esac + +# Here we handle the constraint that a (synthetic) cpu and os are +# valid only in combination with each other and nowhere else. +case $cpu-$os in + # The "javascript-unknown-ghcjs" triple is used by GHC; we + # accept it here in order to tolerate that, but reject any + # variations. + javascript-ghcjs) + ;; + javascript-* | *-ghcjs) + echo "Invalid configuration '$1': cpu '$cpu' is not valid with os '$os$obj'" 1>&2 + exit 1 + ;; +esac + +# As a final step for OS-related things, validate the OS-kernel combination +# (given a valid OS), if there is a kernel. +case $kernel-$os-$obj in + linux-gnu*- | linux-android*- | linux-dietlibc*- | linux-llvm*- \ + | linux-mlibc*- | linux-musl*- | linux-newlib*- \ + | linux-relibc*- | linux-uclibc*- | linux-ohos*- ) + ;; + uclinux-uclibc*- | uclinux-gnu*- ) + ;; + managarm-mlibc*- | managarm-kernel*- ) + ;; + windows*-msvc*-) + ;; + -dietlibc*- | -llvm*- | -mlibc*- | -musl*- | -newlib*- | -relibc*- \ + | -uclibc*- ) + # These are just libc implementations, not actual OSes, and thus + # require a kernel. + echo "Invalid configuration '$1': libc '$os' needs explicit kernel." 1>&2 + exit 1 + ;; + -kernel*- ) + echo "Invalid configuration '$1': '$os' needs explicit kernel." 1>&2 + exit 1 + ;; + *-kernel*- ) + echo "Invalid configuration '$1': '$kernel' does not support '$os'." 1>&2 + exit 1 + ;; + *-msvc*- ) + echo "Invalid configuration '$1': '$os' needs 'windows'." 1>&2 + exit 1 + ;; + kfreebsd*-gnu*- | knetbsd*-gnu*- | netbsd*-gnu*- | kopensolaris*-gnu*-) + ;; + vxworks-simlinux- | vxworks-simwindows- | vxworks-spe-) + ;; + nto-qnx*-) + ;; + os2-emx-) + ;; + rtmk-nova-) + ;; + *-eabi*- | *-gnueabi*-) + ;; + none--*) + # None (no kernel, i.e. freestanding / bare metal), + # can be paired with an machine code file format + ;; + -*-) + # Blank kernel with real OS is always fine. + ;; + --*) + # Blank kernel and OS with real machine code file format is always fine. + ;; + *-*-*) + echo "Invalid configuration '$1': Kernel '$kernel' not known to work with OS '$os'." 1>&2 + exit 1 + ;; +esac + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. +case $vendor in + unknown) + case $cpu-$os in + *-riscix*) + vendor=acorn + ;; + *-sunos* | *-solaris*) + vendor=sun + ;; + *-cnk* | *-aix*) + vendor=ibm + ;; + *-beos*) + vendor=be + ;; + *-hpux*) + vendor=hp + ;; + *-mpeix*) + vendor=hp + ;; + *-hiux*) + vendor=hitachi + ;; + *-unos*) + vendor=crds + ;; + *-dgux*) + vendor=dg + ;; + *-luna*) + vendor=omron + ;; + *-genix*) + vendor=ns + ;; + *-clix*) + vendor=intergraph + ;; + *-mvs* | *-opened*) + vendor=ibm + ;; + *-os400*) + vendor=ibm + ;; + s390-* | s390x-*) + vendor=ibm + ;; + *-ptx*) + vendor=sequent + ;; + *-tpf*) + vendor=ibm + ;; + *-vxsim* | *-vxworks* | *-windiss*) + vendor=wrs + ;; + *-aux*) + vendor=apple + ;; + *-hms*) + vendor=hitachi + ;; + *-mpw* | *-macos*) + vendor=apple + ;; + *-*mint | *-mint[0-9]* | *-*MiNT | *-MiNT[0-9]*) + vendor=atari + ;; + *-vos*) + vendor=stratus + ;; + esac + ;; +esac + +echo "$cpu-$vendor${kernel:+-$kernel}${os:+-$os}${obj:+-$obj}" +exit + +# Local variables: +# eval: (add-hook 'before-save-hook 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/thirdparty/patches/libhdfs3-add-loongarch-support.patch b/thirdparty/patches/libhdfs3-add-loongarch-support.patch new file mode 100644 index 0000000000..1c6c0cf17e --- /dev/null +++ b/thirdparty/patches/libhdfs3-add-loongarch-support.patch @@ -0,0 +1,61 @@ +From 596eb924f95a6e28abbbdbb9b9c212a0ed418250 Mon Sep 17 00:00:00 2001 +From: huchangqi +Date: Mon, 19 May 2025 15:42:30 +0800 +Subject: [PATCH] libhdfs3 add loongarch support + +--- + src/common/HWCrc32c.cpp | 33 ++++++++++++++++++++++++++++++++- + 1 file changed, 32 insertions(+), 1 deletion(-) + +diff --git a/src/common/HWCrc32c.cpp b/src/common/HWCrc32c.cpp +index f61b4b6e10f..62dbb421fe8 100644 +--- a/src/common/HWCrc32c.cpp ++++ b/src/common/HWCrc32c.cpp +@@ -66,6 +66,35 @@ static inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) { + #endif + #elif ((defined(__arm__) || defined(__aarch64__))) + #include "sse2neon.h" ++#elif defined(__loongarch_lp64) ++#include ++ ++namespace Hdfs { ++namespace Internal { ++ ++static inline uint64_t _mm_crc32_u64(uint64_t crc, uint64_t value) { ++ crc = __crc_w_d_w(value, crc); ++ return crc; ++} ++ ++static inline uint32_t _mm_crc32_u32(uint32_t crc, uint64_t value) { ++ crc = __crc_w_w_w(value, crc); ++ return crc; ++} ++ ++static inline uint32_t _mm_crc32_u16(uint32_t crc, uint16_t value) { ++ crc = __crc_w_h_w(value, crc); ++ return crc; ++} ++ ++static inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) { ++ crc = __crc_w_b_w(value, crc); ++ return crc; ++} ++ ++} ++} ++ + #endif + + namespace Hdfs { +@@ -82,6 +111,8 @@ bool HWCrc32c::available() { + return (ecx & (1 << 20)) != 0; + #elif ((defined(__arm__) || defined(__aarch64__))) + return true; ++#elif defined(__loongarch_lp64) ++ return true; + #else + return false; + #endif +-- +2.46.0 + diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh index 7bc66c4677..6644b0324e 100644 --- a/thirdparty/vars.sh +++ b/thirdparty/vars.sh @@ -198,10 +198,10 @@ LEVELDB_SOURCE=leveldb-1.23 LEVELDB_MD5SUM="afbde776fb8760312009963f09a586c7" # brpc -BRPC_DOWNLOAD="https://github.com/apache/brpc/archive/refs/tags/1.4.0.tar.gz" -BRPC_NAME="brpc-1.4.0.tar.gz" -BRPC_SOURCE="brpc-1.4.0" -BRPC_MD5SUM="6af9d50822c33a3abc56a1ec0af0e0bc" +BRPC_DOWNLOAD="https://github.com/apache/brpc/archive/refs/tags/1.12.1.tar.gz" +BRPC_NAME="brpc-1.12.1.tar.gz" +BRPC_SOURCE="brpc-1.12.1" +BRPC_MD5SUM="8bc704bafadc3752edb61eb531fd951c" # rocksdb ROCKSDB_DOWNLOAD="https://github.com/facebook/rocksdb/archive/v5.14.2.tar.gz" @@ -315,10 +315,11 @@ JEMALLOC_DORIS_SOURCE="jemalloc-5.3.0" JEMALLOC_DORIS_MD5SUM="09a8328574dab22a7df848eae6dbbf53" # libunwind +# libunwind need use loongarch version LIBUNWIND_DOWNLOAD="https://github.com/libunwind/libunwind/releases/download/v1.6.2/libunwind-1.6.2.tar.gz" LIBUNWIND_NAME="libunwind-1.6.2.tar.gz" LIBUNWIND_SOURCE="libunwind-1.6.2" -LIBUNWIND_MD5SUM="f625b6a98ac1976116c71708a73dc44a" +LIBUNWIND_MD5SUM="dfa959c84479e4a10da44c7eda3b83ff" # cctz CCTZ_DOWNLOAD="https://github.com/google/cctz/archive/v2.3.tar.gz"