diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 11a416158a..9427d08742 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -236,7 +236,7 @@ if (NOT OS_MACOSX) # MACOSX's lld will core dump
     TRY_TO_CHANGE_LINKER("lld" "LLD")
     TRY_TO_CHANGE_LINKER("gold" "GNU gold")
     if (NOT CUSTUM_LINKER_COMMAND STREQUAL "ld")
-        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${CUSTUM_LINKER_COMMAND}")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${CUSTUM_LINKER_COMMAND} -Wl,--no-relax -lsframe")
     endif()
 endif()
 
@@ -247,13 +247,14 @@ set(CMAKE_C_STANDARD 17)
 add_compile_options(-g
                     -Wall
                     -Wextra
-                    -Werror
+                    #-Werror
                     -pthread
                     -fstrict-aliasing
                     -fno-omit-frame-pointer
                     $<$<COMPILE_LANGUAGE:CXX>:-Wnon-virtual-dtor>)
 
 add_compile_options(-Wno-unused-parameter
+                    -Wno-unused-function
                     $<$<COMPILE_LANGUAGE:CXX>:-Wno-incompatible-pointer-types>
                     -Wno-unknown-warning-option
                     -Wno-deprecated-declarations
@@ -376,9 +377,9 @@ endif()
 # For CMAKE_BUILD_TYPE=Release
 #   -O3: Enable all compiler optimizations
 #   -DNDEBUG: Turn off dchecks/asserts/debug only code.
-set(CXX_FLAGS_RELEASE "${CXX_GCC_FLAGS} -O3 -DNDEBUG")
-set(CXX_FLAGS_ASAN "${CXX_GCC_FLAGS} -O0 -fsanitize=address -DADDRESS_SANITIZER")
-set(CXX_FLAGS_LSAN "${CXX_GCC_FLAGS} -O0 -fsanitize=leak -DLEAK_SANITIZER")
+set(CXX_FLAGS_RELEASE "${CXX_GCC_FLAGS} -O3 -DNDEBUG -mcmodel=medium")
+set(CXX_FLAGS_ASAN "${CXX_GCC_FLAGS} -O0 -fsanitize=address -DADDRESS_SANITIZER -mcmodel=medium")
+set(CXX_FLAGS_LSAN "${CXX_GCC_FLAGS} -O0 -fsanitize=leak -DLEAK_SANITIZER -mcmodel=medium")
 
 # Set the flags to the undefined behavior sanitizer, also known as "ubsan"
 # Turn on sanitizer and debug symbols to get stack traces:
diff --git a/be/src/util/sse_util.hpp b/be/src/util/sse_util.hpp
index 95d1064a36..bf0967ef5c 100644
--- a/be/src/util/sse_util.hpp
+++ b/be/src/util/sse_util.hpp
@@ -27,6 +27,65 @@
 #include <immintrin.h> // IWYU pragma: export
 #include <mm_malloc.h> // IWYU pragma: export
 #include <smmintrin.h> // IWYU pragma: export
+#elif defined(__loongarch_lp64)
+
+#include <lsxintrin.h>
+
+#define _SIDD_UBYTE_OPS                 0x00
+#define _SIDD_UWORD_OPS                 0x01
+#define _SIDD_SBYTE_OPS                 0x02
+#define _SIDD_SWORD_OPS                 0x03
+
+/* These macros specify the comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY             0x00
+#define _SIDD_CMP_RANGES                0x04
+#define _SIDD_CMP_EQUAL_EACH            0x08
+#define _SIDD_CMP_EQUAL_ORDERED         0x0c
+
+/* These macros specify the polarity.  */
+#define _SIDD_POSITIVE_POLARITY         0x00
+#define _SIDD_NEGATIVE_POLARITY         0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
+
+/* These macros specify the output selection in _mm_cmpXstri ().  */
+#define _SIDD_LEAST_SIGNIFICANT         0x00
+#define _SIDD_MOST_SIGNIFICANT          0x40
+
+/* These macros specify the output selection in _mm_cmpXstrm ().  */
+#define _SIDD_BIT_MASK                  0x00
+#define _SIDD_UNIT_MASK                 0x40
+
+
+#define _mm_load_si128(_ip_)                    (__m128i)__lsx_vld((const __m128i*)(_ip_), 0)
+#define _mm_storeu_si128(_ip_, _u_)             __lsx_vstx((__m128i)(_u_), (__m128i*)(_ip_), 0)
+#define _mm_or_si128(_u_, _v_)                  (__m128i)__lsx_vor_v((__m128i)(_u_), (__m128i)(_v_))
+#define _mm_loadu_si128(_ip_)                   (__m128i)__lsx_vldx((const __m128i*)(_ip_), 0)
+#define _mm_setzero_si128()                       __lsx_vreplgr2vr_w( 0   )
+
+static ALWAYS_INLINE uint16_t _mm_movemask_epi8(__m128i v) {
+    // 步骤1：提取每个字节的最高位（符号位）
+    __m128i signs = __lsx_vsrai_b(v, 7);  // 所有字节算术右移7位, 保留符合位
+
+    // 步骤2：创建位掩码 (LSB-first: 0x01, 0x02, 0x04,...)
+    static const uint8_t mask_data[16] = {
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,  // 低8字节
+        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80   // 高8字节
+    };
+    __m128i mask = __lsx_vld((const void*)mask_data, 0);  // 从内存加载掩码
+
+    // 步骤3：应用位掩码
+    __m128i masked = __lsx_vand_v(signs, mask);
+
+    // 步骤4：水平相加（8-bit → 16-bit → 32-bit）
+    __m128i sum16 = __lsx_vhaddw_hu_bu(masked, masked);
+    __m128i sum32 = __lsx_vhaddw_wu_hu(sum16, sum16);
+    __m128i sum64 = __lsx_vhaddw_du_wu(sum32, sum32);
+
+    // 步骤5：提取低16位结果
+    return (uint16_t)__lsx_vpickve2gr_bu(sum64, 0) | (((uint16_t)__lsx_vpickve2gr_bu(sum64, 8)) << 8);
+}
+
 #endif
 
 namespace doris {
diff --git a/build.sh b/build.sh
index f2cb70c3ab..9f0384b7cf 100755
--- a/build.sh
+++ b/build.sh
@@ -476,7 +476,7 @@ if [[ "${BUILD_HIVE_UDF}" -eq 1 ]]; then
     modules+=("hive-udf")
 fi
 if [[ "${BUILD_BE_JAVA_EXTENSIONS}" -eq 1 ]]; then
-    modules+=("fe-common")
+    # modules+=("fe-common")
     modules+=("be-java-extensions/hudi-scanner")
     # don't compile hadoop-hudi-scanner for 2.1 now
     # modules+=("be-java-extensions/hadoop-hudi-scanner")
@@ -503,8 +503,8 @@ FE_MODULES="$(
 
 # Clean and build Backend
 if [[ "${BUILD_BE}" -eq 1 ]]; then
-    update_submodule "be/src/apache-orc" "apache-orc" "https://github.com/apache/doris-thirdparty/archive/refs/heads/orc-for-doris-21.tar.gz"
-    update_submodule "be/src/clucene" "clucene" "https://github.com/apache/doris-thirdparty/archive/refs/heads/clucene-2.1.tar.gz"
+    #update_submodule "be/src/apache-orc" "apache-orc" "https://github.com/apache/doris-thirdparty/archive/refs/heads/orc-for-doris-21.tar.gz"
+    #update_submodule "be/src/clucene" "clucene" "https://github.com/apache/doris-thirdparty/archive/refs/heads/clucene-2.1.tar.gz"
     if [[ -e "${DORIS_HOME}/gensrc/build/gen_cpp/version.h" ]]; then
         rm -f "${DORIS_HOME}/gensrc/build/gen_cpp/version.h"
     fi
@@ -548,7 +548,7 @@ if [[ "${BUILD_BE}" -eq 1 ]]; then
         -DENABLE_STACKTRACE="${ENABLE_STACKTRACE}" \
         -DUSE_AVX2="${USE_AVX2}" \
         -DGLIBC_COMPATIBILITY="${GLIBC_COMPATIBILITY}" \
-        -DEXTRA_CXX_FLAGS="${EXTRA_CXX_FLAGS}" \
+        -DEXTRA_CXX_FLAGS="${EXTRA_CXX_FLAGS} -mcmodel=medium" \
         -DENABLE_CLANG_COVERAGE="${DENABLE_CLANG_COVERAGE}" \
         -DDORIS_JAVA_HOME="${JAVA_HOME}" \
         "${DORIS_HOME}/be"
@@ -605,28 +605,28 @@ if [[ "${BUILD_FE}" -eq 1 ]]; then
 fi
 
 # Clean and build Frontend
-if [[ "${FE_MODULES}" != '' ]]; then
-    echo "Build Frontend Modules: ${FE_MODULES}"
-    cd "${DORIS_HOME}/fe"
-    if [[ "${CLEAN}" -eq 1 ]]; then
-        clean_fe
-    fi
-    if [[ "${DISABLE_JAVA_CHECK_STYLE}" = "ON" ]]; then
-        # Allowed user customer set env param USER_SETTINGS_MVN_REPO means settings.xml file path
-        if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then
-            "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C
-        else
-            "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} -T 1C
-        fi
-    else
-        if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then
-            "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C
-        else
-            "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} -T 1C
-        fi
-    fi
-    cd "${DORIS_HOME}"
-fi
+#if [[ "${FE_MODULES}" != '' ]]; then
+#    echo "Build Frontend Modules: ${FE_MODULES}"
+#    cd "${DORIS_HOME}/fe"
+#    if [[ "${CLEAN}" -eq 1 ]]; then
+#        clean_fe
+#    fi
+#    if [[ "${DISABLE_JAVA_CHECK_STYLE}" = "ON" ]]; then
+#        # Allowed user customer set env param USER_SETTINGS_MVN_REPO means settings.xml file path
+#        if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then
+#            "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C
+#        else
+#            "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests -Dcheckstyle.skip=true ${MVN_OPT:+${MVN_OPT}} -T 1C
+#        fi
+#    else
+#        if [[ -n ${USER_SETTINGS_MVN_REPO} && -f ${USER_SETTINGS_MVN_REPO} ]]; then
+#            "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} -gs "${USER_SETTINGS_MVN_REPO}" -T 1C
+#        else
+#            "${MVN_CMD}" package -pl ${FE_MODULES:+${FE_MODULES}} -Dskip.doc=true -DskipTests ${MVN_OPT:+${MVN_OPT}} -T 1C
+#        fi
+#    fi
+#    cd "${DORIS_HOME}"
+#fi
 
 # Clean and prepare output dir
 DORIS_OUTPUT=${DORIS_OUTPUT:="${DORIS_HOME}/output/"}
diff --git a/loongarch_env.sh b/loongarch_env.sh
new file mode 100644
index 0000000000..bbbe0279af
--- /dev/null
+++ b/loongarch_env.sh
@@ -0,0 +1,3 @@
+export GLIBC_COMPATIBILITY=OFF
+export USE_AVX2=0
+export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index abdad19a97..5442142382 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -147,7 +147,7 @@ if [[ "${CLEAN}" -eq 1 ]] && [[ -d "${TP_SOURCE_DIR}" ]]; then
 fi
 
 # Download thirdparties.
-"${TP_DIR}/download-thirdparty.sh"
+#"${TP_DIR}/download-thirdparty.sh"
 
 export LD_LIBRARY_PATH="${TP_DIR}/installed/lib:${LD_LIBRARY_PATH}"
 
@@ -317,8 +317,8 @@ build_libbacktrace() {
     check_if_source_exist "${LIBBACKTRACE_SOURCE}"
     cd "${TP_SOURCE_DIR}/${LIBBACKTRACE_SOURCE}"
 
-    CPPFLAGS="-I${TP_INCLUDE_DIR}" \
-        CXXFLAGS="-I${TP_INCLUDE_DIR}" \
+    CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
+        CXXFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         ./configure --prefix="${TP_INSTALL_DIR}"
 
@@ -334,8 +334,8 @@ build_libevent() {
     mkdir -p "${BUILD_DIR}"
     cd "${BUILD_DIR}"
 
-    CFLAGS="-std=c99 -D_BSD_SOURCE -fno-omit-frame-pointer -g -ggdb -O2 -I${TP_INCLUDE_DIR}" \
-        CPPLAGS="-I${TP_INCLUDE_DIR}" \
+    CFLAGS="-std=c99 -D_BSD_SOURCE -fno-omit-frame-pointer -g -ggdb -O2 -I${TP_INCLUDE_DIR} -mcmodel=medium" \
+        CPPLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DEVENT__DISABLE_TESTS=ON \
         -DEVENT__DISABLE_OPENSSL=ON -DEVENT__DISABLE_SAMPLES=ON -DEVENT__DISABLE_REGRESS=ON ..
@@ -354,13 +354,17 @@ build_openssl() {
         OPENSSL_PLATFORM="darwin64-${MACHINE_TYPE}-cc"
     elif [[ "${MACHINE_TYPE}" == "aarch64" ]]; then
         OPENSSL_PLATFORM="linux-aarch64"
+    elif [[ "${MACHINE_TYPE}" == "loongarch64" ]]; then
+        # todo: add loongarch64 asm
+        OPENSSL_PLATFORM="linux-generic64"
+        OPENSSL_CONFIG_FLAGS="no-asm"
     fi
 
     check_if_source_exist "${OPENSSL_SOURCE}"
     cd "${TP_SOURCE_DIR}/${OPENSSL_SOURCE}"
 
-    CPPFLAGS="-I${TP_INCLUDE_DIR}" \
-        CXXFLAGS="-I${TP_INCLUDE_DIR}" \
+    CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
+        CXXFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         LIBDIR="lib" \
         ./Configure --prefix="${TP_INSTALL_DIR}" --with-rand-seed=devrandom -shared "${OPENSSL_PLATFORM}"
@@ -385,8 +389,8 @@ build_thrift() {
     cd "${TP_SOURCE_DIR}/${THRIFT_SOURCE}"
 
     if [[ "${KERNEL}" != 'Darwin' ]]; then
-        cflags="-I${TP_INCLUDE_DIR}"
-        cxxflags="-I${TP_INCLUDE_DIR} ${warning_unused_but_set_variable} -Wno-inconsistent-missing-override"
+        cflags="-I${TP_INCLUDE_DIR} -mcmodel=medium"
+        cxxflags="-I${TP_INCLUDE_DIR} ${warning_unused_but_set_variable} -Wno-inconsistent-missing-override -mcmodel=medium"
         ldflags="-L${TP_LIB_DIR} --static"
     else
         cflags="-I${TP_INCLUDE_DIR} -Wno-implicit-function-declaration -Wno-inconsistent-missing-override"
@@ -426,7 +430,7 @@ build_protobuf() {
     mkdir -p cmake/build
     cd cmake/build
 
-    CXXFLAGS="-O2 -I${TP_INCLUDE_DIR}" \
+    CXXFLAGS="-O2 -I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="${ldflags}" \
         "${CMAKE_CMD}" -DCMAKE_BUILD_TYPE=Release \
         -DCMAKE_PREFIX_PATH="${TP_INSTALL_DIR}" \
@@ -456,6 +460,7 @@ build_gflags() {
     rm -rf CMakeCache.txt CMakeFiles/
 
     "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
+        -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \
         -DCMAKE_BUILD_TYPE=Release -DCMAKE_POSITION_INDEPENDENT_CODE=On ../
 
     "${BUILD_SYSTEM}" -j "${PARALLEL}"
@@ -472,7 +477,7 @@ build_glog() {
         rm -rf config.*
         autoreconf -i
 
-        CPPFLAGS="-I${TP_INCLUDE_DIR} -fpermissive -fPIC" \
+        CPPFLAGS="-I${TP_INCLUDE_DIR} -fpermissive -fPIC -mcmodel=medium" \
             LDFLAGS="-L${TP_LIB_DIR}" \
             ./configure --prefix="${TP_INSTALL_DIR}" --enable-frame-pointers --disable-shared --enable-static
 
@@ -485,6 +490,7 @@ build_glog() {
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
             -DWITH_UNWIND=OFF \
             -DBUILD_SHARED_LIBS=OFF \
+            -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \
             -DWITH_TLS=OFF
 
         cmake --build build --target install
@@ -538,7 +544,7 @@ build_snappy() {
 
     rm -rf CMakeCache.txt CMakeFiles/
 
-    CFLAGS="-O3" CXXFLAGS="-O3" "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
+    CFLAGS="-O3 -mcmodel=medium" CXXFLAGS="-O3 -mcmodel=medium" "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
         -DCMAKE_INSTALL_INCLUDEDIR="${TP_INCLUDE_DIR}"/snappy \
         -DSNAPPY_BUILD_TESTS=0 ../
@@ -561,7 +567,7 @@ build_gperftools() {
         ./autogen.sh
     fi
 
-    CPPFLAGS="-I${TP_INCLUDE_DIR}" \
+    CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         LD_LIBRARY_PATH="${TP_LIB_DIR}" \
         LDFLAGS="-L${TP_LIB_DIR}" \
@@ -577,8 +583,8 @@ build_zlib() {
     check_if_source_exist "${ZLIB_SOURCE}"
     cd "${TP_SOURCE_DIR}/${ZLIB_SOURCE}"
 
-    CFLAGS="-O3 -fPIC" \
-        CPPFLAGS="-I${TP_INCLUDE_DIR}" \
+    CFLAGS="-O3 -fPIC -mcmodel=medium" \
+        CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         ./configure --prefix="${TP_INSTALL_DIR}"
 
@@ -604,6 +610,8 @@ build_lz4() {
         rm -f "${TP_INSTALL_DIR}/bin/${link}"
     done
 
+    export CFLAGS="-O2 -mcmodel=medium"
+    export CXXFLAGS="-O2 -mcmodel=medium"
     make -j "${PARALLEL}" install PREFIX="${TP_INSTALL_DIR}" BUILD_SHARED=no INCLUDEDIR="${TP_INCLUDE_DIR}/lz4"
 }
 
@@ -616,6 +624,7 @@ build_zstd() {
     cd "${BUILD_DIR}"
 
     "${CMAKE_CMD}" -G "${GENERATOR}" -DBUILD_TESTING=OFF -DZSTD_BUILD_TESTS=OFF -DZSTD_BUILD_STATIC=ON \
+        -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \
         -DZSTD_BUILD_PROGRAMS=OFF -DZSTD_BUILD_SHARED=OFF -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" ..
 
     "${BUILD_SYSTEM}" -j "${PARALLEL}" install
@@ -627,6 +636,8 @@ build_bzip() {
     check_if_source_exist "${BZIP_SOURCE}"
     cd "${TP_SOURCE_DIR}/${BZIP_SOURCE}"
 
+    export CFLAGS="-O2 -mcmodel=medium -fPIC"
+    export CXXFLAGS="-O2 -mcmodel=medium -fPIC"
     make -j "${PARALLEL}" install PREFIX="${TP_INSTALL_DIR}"
 }
 
@@ -635,8 +646,8 @@ build_lzo2() {
     check_if_source_exist "${LZO2_SOURCE}"
     cd "${TP_SOURCE_DIR}/${LZO2_SOURCE}"
 
-    CPPFLAGS="-I${TP_INCLUDE_DIR}" \
-        LDFLAGS="-L${TP_LIB_DIR}" \
+    CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
+        LDFLAGS="-L${TP_LIB_DIR} -mcmodel=medium" \
         ./configure --prefix="${TP_INSTALL_DIR}" --disable-shared --enable-static
 
     make -j "${PARALLEL}"
@@ -662,7 +673,7 @@ build_curl() {
         libs='-lcrypto -lssl -lcrypto -ldl'
     fi
 
-    CPPFLAGS="-I${TP_INCLUDE_DIR} " \
+    CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" LIBS="${libs}" \
         PKG_CONFIG="pkg-config --static" \
         ./configure --prefix="${TP_INSTALL_DIR}" --disable-shared --enable-static \
@@ -680,12 +691,14 @@ build_re2() {
     cd "${TP_SOURCE_DIR}/${RE2_SOURCE}"
 
     "${CMAKE_CMD}" -DCMAKE_BUILD_TYPE=Release -G "${GENERATOR}" -DBUILD_SHARED_LIBS=0 -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+        -DCMAKE_C_FLAGS="-mcmodel=medium -fPIC" -DCMAKE_CXX_FLAGS="-mcmodel=medium -fPIC" \
         -DCMAKE_PREFIX_PATH="${TP_INSTALL_DIR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}"
     "${BUILD_SYSTEM}" -j "${PARALLEL}" install
     strip_lib libre2.a
 }
 
 # hyperscan
+# on loongarch hyperscan need use gcc
 build_hyperscan() {
     check_if_source_exist "${RAGEL_SOURCE}"
     cd "${TP_SOURCE_DIR}/${RAGEL_SOURCE}"
@@ -696,7 +709,7 @@ build_hyperscan() {
         cxxflags=''
     fi
 
-    CXXFLAGS="${cxxflags}" \
+    CXXFLAGS="${cxxflags} -mcmodel=medium" \
         ./configure --prefix="${TP_INSTALL_DIR}"
     make install
 
@@ -711,6 +724,7 @@ build_hyperscan() {
 
     CXXFLAGS="-D_HAS_AUTO_PTR_ETC=0" \
         "${CMAKE_CMD}" -G "${GENERATOR}" -DBUILD_SHARED_LIBS=0 -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+        -DCMAKE_C_FLAGS="-mcmodel=medium -mno-relax" -DCMAKE_CXX_FLAGS="-mcmodel=medium -mno-relax" \
         -DBOOST_ROOT="${TP_INSTALL_DIR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DBUILD_EXAMPLES=OFF ..
     "${BUILD_SYSTEM}" -j "${PARALLEL}" install
     strip_lib libhs.a
@@ -727,7 +741,7 @@ build_boost() {
         cxxflags=''
     fi
 
-    CXXFLAGS="${cxxflags}" \
+    CXXFLAGS="${cxxflags} -mcmodel=medium" \
         ./bootstrap.sh --prefix="${TP_INSTALL_DIR}" --with-toolset="${boost_toolset}"
     # -q: Fail at first error
     ./b2 -q link=static runtime-link=static -j "${PARALLEL}" \
@@ -763,6 +777,7 @@ build_mysql() {
         "${CMAKE_CMD}" -G "${GENERATOR}" ../ -DCMAKE_LINK_SEARCH_END_STATIC=1 \
         -DWITH_BOOST="$(pwd)/${BOOST_SOURCE}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}/mysql" \
         -DWITHOUT_SERVER=1 -DWITH_ZLIB=1 -DZLIB_ROOT="${TP_INSTALL_DIR}" \
+        -DCMAKE_C_FLAGS="-mcmodel=medium " -DCMAKE_CXX_FLAGS="-mcmodel=medium " \
         -DCMAKE_CXX_FLAGS_RELWITHDEBINFO="-O3 -g -fabi-version=2 -fno-omit-frame-pointer -fno-strict-aliasing -std=gnu++11" \
         -DDISABLE_SHARED=1 -DBUILD_SHARED_LIBS=0 -DZLIB_LIBRARY="${TP_INSTALL_DIR}/lib/libz.a" -DENABLE_DTRACE=0
     "${BUILD_SYSTEM}" -j "${PARALLEL}" mysqlclient
@@ -791,13 +806,14 @@ build_leveldb() {
 
     rm -rf CMakeCache.txt CMakeFiles/
 
-    CXXFLAGS="-fPIC" "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DLEVELDB_BUILD_BENCHMARKS=OFF \
+    CXXFLAGS="-fPIC -mcmodel=medium" "${CMAKE_CMD}" -G "${GENERATOR}" -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" -DLEVELDB_BUILD_BENCHMARKS=OFF \
         -DLEVELDB_BUILD_TESTS=OFF ..
     "${BUILD_SYSTEM}" -j "${PARALLEL}" install
     strip_lib libleveldb.a
 }
 
 # brpc
+# on loongarch brpc need
 build_brpc() {
     check_if_source_exist "${BRPC_SOURCE}"
 
@@ -828,6 +844,7 @@ build_brpc() {
         "${CMAKE_CMD}" -G "${GENERATOR}" -DBUILD_SHARED_LIBS=ON -DWITH_GLOG=ON -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
         -DCMAKE_LIBRARY_PATH="${TP_INSTALL_DIR}/lib64" -DCMAKE_INCLUDE_PATH="${TP_INSTALL_DIR}/include" \
         -DBUILD_BRPC_TOOLS=OFF \
+        -DCMAKE_C_FLAGS="-O1 -mcmodel=medium -fPIC" -DCMAKE_CXX_FLAGS="-O1 -mcmodel=medium -fPIC" \
         -DPROTOBUF_PROTOC_EXECUTABLE="${TP_INSTALL_DIR}/bin/protoc" ..
 
     "${BUILD_SYSTEM}" -j "${PARALLEL}"
@@ -854,7 +871,7 @@ build_rocksdb() {
     fi
 
     # -Wno-range-loop-construct gcc-11
-    CFLAGS="-I ${TP_INCLUDE_DIR} -I ${TP_INCLUDE_DIR}/snappy -I ${TP_INCLUDE_DIR}/lz4" \
+    CFLAGS="-I ${TP_INCLUDE_DIR} -I ${TP_INCLUDE_DIR}/snappy -I ${TP_INCLUDE_DIR}/lz4 -mcmodel=medium" \
         CXXFLAGS="-include cstdint -Wno-deprecated-copy ${warning_stringop_truncation} ${warning_shadow} ${warning_dangling_gsl} \
     ${warning_defaulted_function_deleted} ${warning_unused_but_set_variable} -Wno-pessimizing-move -Wno-range-loop-construct" \
         LDFLAGS="${ldflags}" \
@@ -869,8 +886,8 @@ build_cyrus_sasl() {
     check_if_source_exist "${CYRUS_SASL_SOURCE}"
     cd "${TP_SOURCE_DIR}/${CYRUS_SASL_SOURCE}"
 
-    CFLAGS="-fPIC -Wno-implicit-function-declaration" \
-        CPPFLAGS="-I${TP_INCLUDE_DIR}" \
+    CFLAGS="-fPIC -Wno-implicit-function-declaration -mcmodel=medium" \
+        CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         LIBS="-lcrypto" \
         ./configure --prefix="${TP_INSTALL_DIR}" --enable-static --enable-shared=no --with-openssl="${TP_INSTALL_DIR}" --with-pic --enable-gssapi="${TP_INSTALL_DIR}" --with-gss_impl=mit --with-dblib=none
@@ -894,7 +911,7 @@ build_librdkafka() {
     # As a result, we use a patch to hard code "--static" into PKG_CONFIG instead.
     # PKG_CONFIG="pkg-config --static"
 
-    CPPFLAGS="-I${TP_INCLUDE_DIR}" \
+    CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium -mno-relax" \
         LDFLAGS="-L${TP_LIB_DIR} -lssl -lcrypto -lzstd -lz -lsasl2 \
         -lgssapi_krb5 -lkrb5 -lkrb5support -lk5crypto -lcom_err -lresolv" \
         ./configure --prefix="${TP_INSTALL_DIR}" --enable-static --enable-sasl --disable-c11threads
@@ -913,7 +930,7 @@ build_libunixodbc() {
 
     cd "${TP_SOURCE_DIR}/${ODBC_SOURCE}"
 
-    CFLAGS="-I${TP_INCLUDE_DIR} -Wno-int-conversion -Wno-implicit-function-declaration" \
+    CFLAGS="-I${TP_INCLUDE_DIR} -Wno-int-conversion -Wno-implicit-function-declaration -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         ./configure --prefix="${TP_INSTALL_DIR}" --with-included-ltdl --enable-static=yes --enable-shared=no
 
@@ -940,6 +957,7 @@ build_flatbuffers() {
     LDFLAGS="${ldflags}" \
         "${CMAKE_CMD}" -G "${GENERATOR}" \
         -DFLATBUFFERS_CXX_FLAGS="${warning_class_memaccess} ${warning_unused_but_set_variable}" \
+        -DCMAKE_C_FLAGS="-mcmodel=medium " -DCMAKE_CXX_FLAGS="-mcmodel=medium " \
         -DFLATBUFFERS_BUILD_TESTS=OFF \
         ..
 
@@ -961,6 +979,7 @@ build_cares() {
         -DCARES_STATIC=ON \
         -DCARES_SHARED=OFF \
         -DCARES_STATIC_PIC=ON \
+        -DCMAKE_C_FLAGS="-mcmodel=medium " -DCMAKE_CXX_FLAGS="-mcmodel=medium " \
         -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" ..
     make
     make install
@@ -991,6 +1010,7 @@ build_grpc() {
         -DgRPC_ZLIB_PROVIDER=package \
         -DZLIB_ROOT="${TP_INSTALL_DIR}" \
         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+        -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \
         ../..
 
     make -j "${PARALLEL}"
@@ -1029,6 +1049,7 @@ build_arrow() {
 
     LDFLAGS="${ldflags}" \
         "${CMAKE_CMD}" -G "${GENERATOR}" -DARROW_PARQUET=ON -DARROW_IPC=ON -DARROW_BUILD_SHARED=OFF \
+        -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \
         -DARROW_BUILD_STATIC=ON -DARROW_WITH_BROTLI=ON -DARROW_WITH_LZ4=ON -DARROW_USE_GLOG=ON \
         -DARROW_WITH_SNAPPY=ON -DARROW_WITH_ZLIB=ON -DARROW_WITH_ZSTD=ON -DARROW_JSON=ON \
         -DARROW_WITH_UTF8PROC=OFF -DARROW_WITH_RE2=ON -DARROW_ORC=ON \
@@ -1093,6 +1114,7 @@ build_abseil() {
         -DCMAKE_BUILD_TYPE=Release \
         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
         -DABSL_PROPAGATE_CXX_STD=ON \
+        -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \
         -DBUILD_SHARED_LIBS=OFF
 
     cmake --build "${BUILD_DIR}" -j "${PARALLEL}"
@@ -1115,6 +1137,7 @@ build_s2() {
         -DBUILD_SHARED_LIBS=OFF \
         -DWITH_GFLAGS=ON \
         -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_C_FLAGS="-mcmodel=medium" -DCMAKE_CXX_FLAGS="-mcmodel=medium" \
         -DCMAKE_LIBRARY_PATH="${TP_INSTALL_DIR}" ..
 
     "${BUILD_SYSTEM}" -j "${PARALLEL}"
@@ -1220,7 +1243,7 @@ build_croaringbitmap() {
         ldflags="-L${TP_LIB_DIR}"
     fi
 
-    CXXFLAGS="-O3" \
+    CXXFLAGS="-O3 -mcmodel=medium" \
         LDFLAGS="${ldflags}" \
         "${CMAKE_CMD}" -G "${GENERATOR}" ${avx_flag:+${avx_flag}} -DROARING_BUILD_STATIC=ON -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
         -DENABLE_ROARING_TESTS=OFF ..
@@ -1275,7 +1298,7 @@ build_orc() {
 
     rm -rf CMakeCache.txt CMakeFiles/
 
-    CXXFLAGS="-O3 -Wno-array-bounds ${warning_reserved_identifier} ${warning_suggest_override}" \
+    CXXFLAGS="-O3 -Wno-array-bounds ${warning_reserved_identifier} ${warning_suggest_override} -mcmodel=medium -fPIC" \
         "${CMAKE_CMD}" -G "${GENERATOR}" ../ -DBUILD_JAVA=OFF \
         -DPROTOBUF_HOME="${TP_INSTALL_DIR}" \
         -DSNAPPY_HOME="${TP_INSTALL_DIR}" \
@@ -1346,7 +1369,7 @@ build_aws_sdk() {
         -DCMAKE_PREFIX_PATH="${TP_INSTALL_DIR}" -DBUILD_SHARED_LIBS=OFF -DENABLE_TESTING=OFF \
         -DCURL_LIBRARY_RELEASE="${TP_INSTALL_DIR}/lib/libcurl.a" -DZLIB_LIBRARY_RELEASE="${TP_INSTALL_DIR}/lib/libz.a" \
         -DBUILD_ONLY="core;s3;s3-crt;transfer;identity-management;sts" \
-        -DCMAKE_CXX_FLAGS="-Wno-nonnull -Wno-deprecated-declarations ${warning_dangling_reference}" -DCPP_STANDARD=17
+        -DCMAKE_CXX_FLAGS="-Wno-nonnull -Wno-deprecated-declarations ${warning_dangling_reference} -mcmodel=medium" -DCPP_STANDARD=17
 
     cd "${BUILD_DIR}"
 
@@ -1414,7 +1437,7 @@ build_xml2() {
     mkdir -p "${BUILD_DIR}"
     cd "${BUILD_DIR}"
 
-    CPPLAGS="-I${TP_INCLUDE_DIR}" \
+    CPPLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         ../configure --prefix="${TP_INSTALL_DIR}" --enable-shared=no --with-pic --with-python=no --with-lzma="${TP_INSTALL_DIR}"
 
@@ -1431,6 +1454,8 @@ build_idn() {
     mkdir -p "${BUILD_DIR}"
     cd "${BUILD_DIR}"
 
+    CFLAGS="-O2 -mcmodel=medium" \
+    CXXFLAGS="-O2 -mcmodel=medium" \
     ../configure --prefix="${TP_INSTALL_DIR}" --enable-shared=no --with-pic
 
     make -j "${PARALLEL}"
@@ -1446,7 +1471,7 @@ build_gsasl() {
     cd "${BUILD_DIR}"
 
     KRB5_CONFIG="${TP_INSTALL_DIR}/bin/krb5-config" \
-        CFLAGS="-I${TP_INCLUDE_DIR} -Wno-implicit-function-declaration" \
+        CFLAGS="-I${TP_INCLUDE_DIR} -Wno-implicit-function-declaration -mcmodel=medium" \
         ../configure --prefix="${TP_INSTALL_DIR}" --with-gssapi-impl=mit --enable-shared=no --with-pic --with-libidn-prefix="${TP_INSTALL_DIR}"
 
     make -j "${PARALLEL}"
@@ -1465,7 +1490,7 @@ build_krb5() {
         with_crypto_impl='--with-crypto-impl=openssl'
     fi
 
-    CFLAGS="-fcommon -fPIC -I${TP_INSTALL_DIR}/include" LDFLAGS="-L${TP_INSTALL_DIR}/lib" \
+    CFLAGS="-fcommon -fPIC -I${TP_INSTALL_DIR}/include -mcmodel=medium" LDFLAGS="-L${TP_INSTALL_DIR}/lib" \
         ../configure --prefix="${TP_INSTALL_DIR}" --disable-shared --enable-static \
         --without-keyutils ${with_crypto_impl:+${with_crypto_impl}}
 
@@ -1496,7 +1521,8 @@ build_hdfs3() {
         -DKERBEROS_LIBRARIES="${TP_INSTALL_DIR}/lib/libkrb5.a" \
         -DGSASL_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
         -DGSASL_LIBRARIES="${TP_INSTALL_DIR}/lib/libgsasl.a" \
-        -DCMAKE_CXX_FLAGS='-include cstdint' \
+        -DCMAKE_C_FLAGS="-mcmodel=medium" \
+        -DCMAKE_CXX_FLAGS='-include cstdint -mcmodel=medium' \
         ..
 
     make CXXFLAGS="${libhdfs_cxx17}" -j "${PARALLEL}"
@@ -1512,7 +1538,7 @@ build_jemalloc() {
     mkdir -p "${BUILD_DIR}"
     cd "${BUILD_DIR}"
 
-    cflags='-O3 -fno-omit-frame-pointer -fPIC -g'
+    cflags='-O3 -fno-omit-frame-pointer -fPIC -g -mcmodel=medium'
     # Build jemalloc --with-lg-page=16 in order to make the wheel work on both 4k and 64k page arm64 systems.
     # Jemalloc compiled on a system with page size 4K can only run on a system with the same page size 4K.
     # If it is run on a system with page size > 4K, an error `unsupported system page size`.
@@ -1525,7 +1551,7 @@ build_jemalloc() {
         WITH_LG_PAGE=''
     fi
 
-    CFLAGS="${cflags}" ../configure --prefix="${TP_INSTALL_DIR}" --with-install-suffix="_doris" "${WITH_LG_PAGE}" \
+    CFLAGS="${cflags} -mcmodel=medium" ../configure --prefix="${TP_INSTALL_DIR}" --with-install-suffix="_doris" "${WITH_LG_PAGE}" \
         --with-jemalloc-prefix=je --enable-prof --disable-cxx --disable-libdl --disable-shared
 
     make -j "${PARALLEL}"
@@ -1592,7 +1618,7 @@ build_simdjson() {
     mkdir -p "${BUILD_DIR}"
     cd "${BUILD_DIR}"
 
-    CXXFLAGS="-O3" CFLAGS="-O3" \
+    CXXFLAGS="-O3 -mcmodel=medium" CFLAGS="-O3 -mcmodel=medium" \
         "${CMAKE_CMD}" -DSIMDJSON_EXCEPTIONS=OFF \
         -DSIMDJSON_DEVELOPER_MODE=OFF -DSIMDJSON_BUILD_STATIC=ON \
         -DSIMDJSON_JUST_LIBRARY=ON -DSIMDJSON_ENABLE_THREADS=ON ..
@@ -1682,6 +1708,7 @@ build_hadoop_libs() {
     check_if_source_exist "${HADOOP_LIBS_SOURCE}"
     cd "${TP_SOURCE_DIR}/${HADOOP_LIBS_SOURCE}"
     echo "THIRDPARTY_INSTALLED=${TP_INSTALL_DIR}" >env.sh
+    export MAVEN_OPTS="-Dos.detected.arch=loongarch64"
     ./build.sh
 
     rm -rf "${TP_INSTALL_DIR}/include/hadoop_hdfs/"
@@ -1762,8 +1789,8 @@ build_ali_sdk() {
     mkdir -p "${BUILD_DIR}"
     cd "${BUILD_DIR}"
 
-    CPPFLAGS="-I${TP_INCLUDE_DIR}" \
-        CXXFLAGS="-I${TP_INCLUDE_DIR}" \
+    CPPFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
+        CXXFLAGS="-I${TP_INCLUDE_DIR} -mcmodel=medium" \
         LDFLAGS="-L${TP_LIB_DIR}" \
         "${CMAKE_CMD}" -G "${GENERATOR}" -DBUILD_PRODUCT=core -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
         -DTP_INSTALL_DIR="${TP_INSTALL_DIR}" ..
@@ -1799,7 +1826,10 @@ build_icu() {
     rm -rf "${BUILD_DIR}"
     mkdir -p "${BUILD_DIR}"
     cd "${BUILD_DIR}"
+    cp -rf ${TP_PATCH_DIR}/config.guess  ${TP_PATCH_DIR}/patches/config.sub ${TP_SOURCE_DIR}/${ICU_SOURCE}/icu4c/source
 
+    CFLAGS="${CFLAGS} -mcmodel=medium" \
+    CPPFLAGS="${CPPFLAGS} -mcmodel=medium" \
     ../configure --prefix="${TP_INSTALL_DIR}" \
         --enable-static \
         --disable-shared \
diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh
index c7f2eea4c3..e84985d044 100755
--- a/thirdparty/download-thirdparty.sh
+++ b/thirdparty/download-thirdparty.sh
@@ -360,6 +360,15 @@ if [[ "${HYPERSCAN_SOURCE}" == "vectorscan-vectorscan-5.4.11" ]]; then
     fi
     cd -
 fi
+
+cd "${TP_SOURCE_DIR}/${HYPERSCAN_SOURCE}"
+if [[ ! -f "${PATCHED_MARK}" ]]; then
+    patch -p1 <"${TP_PATCH_DIR}/add-loongarch64-support-hyperscan-5.4.2.patch"
+    patch -p1 <"${TP_PATCH_DIR}/add-the-parameter-mlsx.patch"
+    touch "${PATCHED_MARK}"
+fi
+cd -
+
 echo "Finished patching ${HYPERSCAN_SOURCE}"
 
 cd "${TP_SOURCE_DIR}/${AWS_SDK_SOURCE}"
@@ -471,5 +480,15 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " THRIFT " ]]; then
     echo "Finished patching ${THRIFT_SOURCE}"
 fi
 
+# patch hdfs3
+if [[ "${HDFS3_SOURCE}" = "doris-thirdparty-libhdfs3-v2.3.9" ]]; then
+    cd "${TP_SOURCE_DIR}/${HDFS3_SOURCE}"
+    if [[ ! -f "${PATCHED_MARK}" ]]; then
+        patch -p1 <"${TP_PATCH_DIR}/libhdfs3-add-loongarch-support.patch"
+        touch "${PATCHED_MARK}"
+    fi
+    cd -
+fi
+echo "Finished patching ${HDFS3_SOURCE}"
 
 # vim: ts=4 sw=4 ts=4 tw=100:
diff --git a/thirdparty/patches/add-loongarch64-support-hyperscan-5.4.2.patch b/thirdparty/patches/add-loongarch64-support-hyperscan-5.4.2.patch
new file mode 100644
index 0000000000..2479bb4406
--- /dev/null
+++ b/thirdparty/patches/add-loongarch64-support-hyperscan-5.4.2.patch
@@ -0,0 +1,4322 @@
+From 6e29e1e679ea60d7152e37a8949805b80054cdea Mon Sep 17 00:00:00 2001
+From: Jingyun Hua <huajingyun@loongson.cn>
+Date: Wed, 5 Jul 2023 07:11:50 +0000
+Subject: [PATCH] add loongarch64 support hyperscan-5.4.2
+
+---
+ CMakeLists.txt              |   14 +-
+ cmake/arch.cmake            |   21 +-
+ cmake/config.h.in           |    6 +
+ cmake/platform.cmake        |   13 +-
+ src/hs_valid_platform.c     |    4 +
+ src/nfa/shufti.c            |    6 +-
+ src/nfa/truffle.c           |    4 +-
+ src/rose/counting_miracle.h |    2 +-
+ src/util/arch.h             |    7 +
+ src/util/cpuid_flags.c      |   11 +-
+ src/util/cpuid_flags.h      |    2 +-
+ src/util/cpuid_inline.h     |    8 +-
+ src/util/intrinsics.h       |    6 +
+ src/util/simd_loongarch.h   |  956 +++++++++++++++++++++++
+ src/util/simd_types.h       |    3 +
+ src/util/simd_utils.h       | 1386 +---------------------------------
+ src/util/simd_x86.h         | 1420 +++++++++++++++++++++++++++++++++++
+ src/util/state_compress.c   |   42 +-
+ 18 files changed, 2479 insertions(+), 1432 deletions(-)
+ create mode 100644 src/util/simd_loongarch.h
+ create mode 100644 src/util/simd_x86.h
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 7757916..4289817 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -182,7 +182,7 @@ else()
+         string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}")
+     endforeach ()
+ 
+-    if (CMAKE_COMPILER_IS_GNUCC)
++    if (ARCH_IA32 OR ARCH_X86_64 AND CMAKE_COMPILER_IS_GNUCC)
+         message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}")
+         # If gcc doesn't recognise the host cpu, then mtune=native becomes
+         # generic, which isn't very good in some cases. march=native looks at
+@@ -289,10 +289,14 @@ else()
+ endif()
+ 
+ CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H)
+-CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
+-CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
+-CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
+-CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
++if (ARCH_IA32 OR ARCH_X86_64)
++ CHECK_INCLUDE_FILES(intrin.h HAVE_C_INTRIN_H)
++ CHECK_INCLUDE_FILE_CXX(intrin.h HAVE_CXX_INTRIN_H)
++ CHECK_INCLUDE_FILES(x86intrin.h HAVE_C_X86INTRIN_H)
++ CHECK_INCLUDE_FILE_CXX(x86intrin.h HAVE_CXX_X86INTRIN_H)
++elseif (ARCH_LOONGARCH64)
++ CHECK_INCLUDE_FILES(lsxintrin.h HAVE_C_LSXINTRIN_H)
++endif()
+ 
+ CHECK_FUNCTION_EXISTS(posix_memalign HAVE_POSIX_MEMALIGN)
+ CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC)
+diff --git a/cmake/arch.cmake b/cmake/arch.cmake
+index eb4791e..e8b5f11 100644
+--- a/cmake/arch.cmake
++++ b/cmake/arch.cmake
+@@ -6,7 +6,10 @@ if (HAVE_C_X86INTRIN_H)
+     set (INTRIN_INC_H "x86intrin.h")
+ elseif (HAVE_C_INTRIN_H)
+     set (INTRIN_INC_H "intrin.h")
+-else ()
++elseif (HAVE_C_LSXINTRIN_H)
++    set (INTRIN_INC_H "lsxintrin.h")
++    set (FAT_RUNTIME OFF)
++else()
+     message (FATAL_ERROR "No intrinsics header found")
+ endif ()
+ 
+@@ -82,29 +85,29 @@ int main(){
+ }" HAVE_AVX512VBMI)
+ 
+ if (FAT_RUNTIME)
+-    if (NOT HAVE_SSSE3)
++    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
+         message(FATAL_ERROR "SSSE3 support required to build fat runtime")
+     endif ()
+-    if (NOT HAVE_AVX2)
++    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2)
+         message(FATAL_ERROR "AVX2 support required to build fat runtime")
+     endif ()
+-    if (BUILD_AVX512 AND NOT HAVE_AVX512)
++    if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512 AND NOT HAVE_AVX512)
+         message(FATAL_ERROR "AVX512 support requested but not supported")
+     endif ()
+-    if (BUILD_AVX512VBMI AND NOT HAVE_AVX512VBMI)
++    if ((ARCH_IA32 OR ARCH_X86_64) AND BUILD_AVX512VBMI AND NOT HAVE_AVX512VBMI)
+         message(FATAL_ERROR "AVX512VBMI support requested but not supported")
+     endif ()
+ else (NOT FAT_RUNTIME)
+-    if (NOT HAVE_AVX2)
++    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX2)
+         message(STATUS "Building without AVX2 support")
+     endif ()
+-    if (NOT HAVE_AVX512)
++    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512)
+         message(STATUS "Building without AVX512 support")
+     endif ()
+-    if (NOT HAVE_AVX512VBMI)
++    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_AVX512VBMI)
+         message(STATUS "Building without AVX512VBMI support")
+     endif ()
+-    if (NOT HAVE_SSSE3)
++    if ((ARCH_IA32 OR ARCH_X86_64) AND NOT HAVE_SSSE3)
+         message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
+     endif ()
+ endif ()
+diff --git a/cmake/config.h.in b/cmake/config.h.in
+index 5454643..43827fe 100644
+--- a/cmake/config.h.in
++++ b/cmake/config.h.in
+@@ -15,6 +15,9 @@
+ /* "Define if building for EM64T" */
+ #cmakedefine ARCH_X86_64
+ 
++/* "Define if building for LOONGARCH64" */
++#cmakedefine ARCH_LOONGARCH64
++
+ /* internal build, switch on dump support. */
+ #cmakedefine DUMP_SUPPORT
+ 
+@@ -48,6 +51,9 @@
+ /* C compiler has intrin.h */
+ #cmakedefine HAVE_C_INTRIN_H
+ 
++/* C compiler has lsxintrin.h */
++#cmakedefine HAVE_C_LSXINTRIN_H
++
+ /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
+    0 if you don't. */
+ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP
+diff --git a/cmake/platform.cmake b/cmake/platform.cmake
+index 593c544..aba432f 100644
+--- a/cmake/platform.cmake
++++ b/cmake/platform.cmake
+@@ -1,9 +1,14 @@
+ # determine the target arch
+ 
+ # really only interested in the preprocessor here
+-CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_64_BIT)
++CHECK_C_SOURCE_COMPILES("#if !(defined(__x86_64__) || defined(_M_X64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_X86_64)
+ 
+-CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_32_BIT)
++CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_IA32)
+ 
+-set(ARCH_X86_64 ${ARCH_64_BIT})
+-set(ARCH_IA32 ${ARCH_32_BIT})
++CHECK_C_SOURCE_COMPILES("#if !defined(__loongarch64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_LOONGARCH64)
++
++if (ARCH_X86_64 OR ARCH_LOONGARCH64)
++    set(ARCH_64_BIT 1)
++elseif ()
++    set(ARCH_32_BIT 1)
++endif()
+diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c
+index 59ad3f3..564d179 100644
+--- a/src/hs_valid_platform.c
++++ b/src/hs_valid_platform.c
+@@ -33,9 +33,13 @@
+ HS_PUBLIC_API
+ hs_error_t HS_CDECL hs_valid_platform(void) {
+     /* Hyperscan requires SSSE3, anything else is a bonus */
++#if defined(__x86_64__)
+     if (check_ssse3()) {
+         return HS_SUCCESS;
+     } else {
+         return HS_ARCH_ERROR;
+     }
++#elif defined(ARCH_LOONGARCH64)
++    return HS_SUCCESS;
++#endif
+ }
+diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c
+index 09ffc0c..c8776a7 100644
+--- a/src/nfa/shufti.c
++++ b/src/nfa/shufti.c
+@@ -159,7 +159,7 @@ const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+     }
+ 
+     const m128 zeroes = zeroes128();
+-    const m128 low4bits = _mm_set1_epi8(0xf);
++    const m128 low4bits = __lsx_vldi(0xf);
+     const u8 *rv;
+ 
+     size_t min = (size_t)buf % 16;
+@@ -246,7 +246,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
+     }
+ 
+     const m128 zeroes = zeroes128();
+-    const m128 low4bits = _mm_set1_epi8(0xf);
++    const m128 low4bits = __lsx_vldi(0xf);
+     const u8 *rv;
+ 
+     assert(buf_end - buf >= 16);
+@@ -320,7 +320,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
+                            m128 mask2_lo, m128 mask2_hi,
+                            const u8 *buf, const u8 *buf_end) {
+     const m128 ones = ones128();
+-    const m128 low4bits = _mm_set1_epi8(0xf);
++    const m128 low4bits = __lsx_vldi(0xf);
+     const u8 *rv;
+ 
+     size_t min = (size_t)buf % 16;
+diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c
+index be6b312..f208854 100644
+--- a/src/nfa/truffle.c
++++ b/src/nfa/truffle.c
+@@ -64,8 +64,8 @@ const u8 *firstMatch(const u8 *buf, u32 z) {
+ static really_inline
+ u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
+ 
+-    m128 highconst = _mm_set1_epi8(0x80);
+-    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
++    m128 highconst = __lsx_vldi(0x80);
++    m128 shuf_mask_hi = __lsx_vreplgr2vr_d(0x8040201008040201);
+ 
+     // and now do the real work
+     m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
+diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h
+index 976208b..1cf5189 100644
+--- a/src/rose/counting_miracle.h
++++ b/src/rose/counting_miracle.h
+@@ -94,7 +94,7 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison,
+     u32 count = *count_inout;
+ 
+     const m128 zeroes = zeroes128();
+-    const m128 low4bits = _mm_set1_epi8(0xf);
++    const m128 low4bits = __lsx_vldi(0xf);
+ 
+     for (; d + 16 <= d_end; d_end -= 16) {
+         m128 data = loadu128(d_end - 16);
+diff --git a/src/util/arch.h b/src/util/arch.h
+index 985fec6..296d322 100644
+--- a/src/util/arch.h
++++ b/src/util/arch.h
+@@ -87,4 +87,11 @@
+ #define NO_ASM
+ #endif
+ 
++/*
++ * LOONGARCH64 uses a different form of inline asm
++ */
++#if defined(__loongarch64)
++#define NO_ASM
++#endif
++
+ #endif // UTIL_ARCH_H_
+diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c
+index c00ce58..705794a 100644
+--- a/src/util/cpuid_flags.c
++++ b/src/util/cpuid_flags.c
+@@ -33,13 +33,15 @@
+ #include "hs_internal.h"
+ #include "util/arch.h"
+ 
++#if defined(__x86_64__) || defined(_M_X64)
+ #if !defined(_WIN32) && !defined(CPUID_H_)
+ #include <cpuid.h>
+ #endif
++#endif
+ 
+ u64a cpuid_flags(void) {
+     u64a cap = 0;
+-
++#if defined(__X86_64__)
+     if (check_avx2()) {
+         DEBUG_PRINTF("AVX2 enabled\n");
+         cap |= HS_CPU_FEATURES_AVX2;
+@@ -68,7 +70,7 @@ u64a cpuid_flags(void) {
+     (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI))
+     cap &= ~HS_CPU_FEATURES_AVX512VBMI;
+ #endif
+-
++#endif
+     return cap;
+ }
+ 
+@@ -78,6 +80,7 @@ struct family_id {
+     u32 tune;
+ };
+ 
++#if defined(__X86_64__)
+ /* from table 35-1 of the Intel 64 and IA32 Arch. Software Developer's Manual
+  * and "Intel Architecture and Processor Identification With CPUID Model and
+  * Family Numbers" */
+@@ -121,6 +124,7 @@ static const struct family_id known_microarch[] = {
+     { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */
+ 
+ };
++#endif
+ 
+ #ifdef DUMP_SUPPORT
+ static UNUSED
+@@ -144,6 +148,7 @@ const char *dumpTune(u32 tune) {
+ #endif
+ 
+ u32 cpuid_tune(void) {
++#if defined(__X86_64__)
+     unsigned int eax, ebx, ecx, edx;
+ 
+     cpuid(1, 0, &eax, &ebx, &ecx, &edx);
+@@ -171,6 +176,6 @@ u32 cpuid_tune(void) {
+         DEBUG_PRINTF("found tune flag %s\n", dumpTune(tune) );
+         return tune;
+     }
+-
++#endif
+     return HS_TUNE_FAMILY_GENERIC;
+ }
+diff --git a/src/util/cpuid_flags.h b/src/util/cpuid_flags.h
+index 527c6d5..68e427d 100644
+--- a/src/util/cpuid_flags.h
++++ b/src/util/cpuid_flags.h
+@@ -31,7 +31,7 @@
+ 
+ #include "ue2common.h"
+ 
+-#if !defined(_WIN32) && !defined(CPUID_H_)
++#if (defined(ARCH_IA32) || defined(ARCH_X86_64)) && !defined(_WIN32) && !defined(CPUID_H_)
+ #include <cpuid.h>
+  /* system header doesn't have a header guard */
+ #define CPUID_H_
+diff --git a/src/util/cpuid_inline.h b/src/util/cpuid_inline.h
+index b7b4245..425bcfc 100644
+--- a/src/util/cpuid_inline.h
++++ b/src/util/cpuid_inline.h
+@@ -32,17 +32,20 @@
+ #include "ue2common.h"
+ #include "cpuid_flags.h"
+ 
++#if defined(__x86_64__) || defined(_M_X64)
+ #if !defined(_WIN32) && !defined(CPUID_H_)
+ #include <cpuid.h>
+ /* system header doesn't have a header guard */
+ #define CPUID_H_
+ #endif
++#endif
+ 
+ #ifdef __cplusplus
+ extern "C"
+ {
+ #endif
+ 
++#if defined(__x86_64__) || defined(_M_X64)
+ static inline
+ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax,
+            unsigned int *ebx, unsigned int *ecx, unsigned int *edx) {
+@@ -57,6 +60,7 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax,
+     *edx = a[3];
+ #endif
+ }
++#endif
+ 
+ // ECX
+ #define CPUID_SSE3 (1 << 0)
+@@ -93,11 +97,12 @@ void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax,
+ #define CPUID_XCR0_AVX512                                                      \
+     (CPUID_XCR0_OPMASK | CPUID_XCR0_ZMM_Hi256 | CPUID_XCR0_Hi16_ZMM)
+ 
++#if defined(__x86_64__)
+ static inline
+ u64a xgetbv(u32 op) {
+ #if defined(_WIN32) || defined(__INTEL_COMPILER)
+     return _xgetbv(op);
+-#else
++#elif defined(__x86_64__)
+     u32 a, d;
+     __asm__ volatile (
+             "xgetbv\n"
+@@ -252,6 +257,7 @@ int check_popcnt(void) {
+     cpuid(1, 0, &eax, &ebx, &ecx, &edx);
+     return !!(ecx & CPUID_POPCNT);
+ }
++#endif  //__x86_64__
+ 
+ #ifdef __cplusplus
+ } /* extern "C" */
+diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h
+index edc4f6e..1094e72 100644
+--- a/src/util/intrinsics.h
++++ b/src/util/intrinsics.h
+@@ -45,6 +45,10 @@
+ # endif
+ #endif
+ 
++#if defined(HAVE_C_LSXINTRIN_H)
++#  define USE_LSXINTRIN_H
++#endif
++
+ #ifdef __cplusplus
+ # if defined(HAVE_CXX_INTRIN_H)
+ #  define USE_INTRIN_H
+@@ -59,6 +63,8 @@
+ #include <x86intrin.h>
+ #elif defined(USE_INTRIN_H)
+ #include <intrin.h>
++#elif defined(USE_LSXINTRIN_H)
++#include <lsxintrin.h>
+ #else
+ #error no intrinsics file
+ #endif
+diff --git a/src/util/simd_loongarch.h b/src/util/simd_loongarch.h
+new file mode 100644
+index 0000000..b311ffb
+--- /dev/null
++++ b/src/util/simd_loongarch.h
+@@ -0,0 +1,956 @@
++/*
++ * Copyright (c) 2015-2017, Intel Corporation
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ *  * Redistributions of source code must retain the above copyright notice,
++ *    this list of conditions and the following disclaimer.
++ *  * Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ *  * Neither the name of Intel Corporation nor the names of its contributors
++ *    may be used to endorse or promote products derived from this software
++ *    without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
++ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
++ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
++ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
++ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
++ * POSSIBILITY OF SUCH DAMAGE.
++ */
++
++/** \file
++ * \brief SIMD types and primitive operations.
++ */
++
++#ifndef SIMD_LSX
++#define SIMD_LSX
++
++#include "config.h"
++#include "ue2common.h"
++#include "simd_types.h"
++#include "unaligned.h"
++#include "util/arch.h"
++#include "util/intrinsics.h"
++#include <lsxintrin.h>
++
++#include <string.h> // for memcpy
++
++// Define a common assume_aligned using an appropriate compiler built-in, if
++// it's available. Note that we need to handle C or C++ compilation.
++#ifdef __cplusplus
++#ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
++#define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
++#endif
++#else
++#ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
++#define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
++#endif
++#endif
++
++// Fallback to identity case.
++#ifndef assume_aligned
++#define assume_aligned(x, y) (x)
++#endif
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++extern const char vbs_mask_data[];
++#ifdef __cplusplus
++}
++#endif
++
++static really_inline m128 ones128(void) {
++    /* gcc gets this right */
++   return  __lsx_vldi(0xFF);
++}
++
++static really_inline m128 zeroes128(void) {
++   return  __lsx_vldi(0);
++}
++
++/** \brief Bitwise not for m128*/
++static really_inline m128 not128(m128 a) {
++    return  __lsx_vxor_v(a,ones128());
++}
++
++/** \brief Return 1 if a and b are different otherwise 0 */
++static really_inline int diff128(m128 a, m128 b) {
++    return (__lsx_vpickve2gr_hu(__lsx_vmskltz_b(__lsx_vseq_b(a, b)), 0) ^ 0xffff);
++}
++
++static really_inline int isnonzero128(m128 a) {
++    return !!diff128(a, zeroes128());
++}
++
++/**
++ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich128(m128 a, m128 b) {
++     a = __lsx_vseq_w(a, b);
++     return ~( __lsx_vpickve2gr_hu(__lsx_vmskltz_w(a),0)) & 0xf;
++}
++/**
++ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
++ * returns a 4-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_128(m128 a, m128 b) {
++    u32 d = diffrich128(a, b);
++    return (d | (d >> 1)) & 0x5;
++}
++
++static really_really_inline
++m128 lshift64_m128(m128 a, unsigned b) {
++    m128 tmp = __lsx_vinsgr2vr_w(zeroes128(), b, 0);
++
++    m128 x = __lsx_vinsgr2vr_w(tmp, b, 2);
++    return  __lsx_vsll_d(a, x);
++}
++
++#define rshift64_m128(a, b) __lsx_vsrli_d((a), (b))
++#define eq128(a, b)      __lsx_vseq_b((a), (b))
++#define movemask128(a)  __lsx_vpickve2gr_hu(__lsx_vmskltz_b(a), 0)
++
++static really_inline m128 set16x8(u8 c) {
++    return __lsx_vreplgr2vr_b(c);
++}
++
++static really_inline m128 set4x32(u32 c) {
++    return  __lsx_vreplgr2vr_w(c);
++}
++
++static really_inline u32 movd(const m128 in) {
++    return __lsx_vpickve2gr_w(in, 0);
++}
++
++static really_inline u64a movq(const m128 in) {
++    u32 lo = movd(in);
++    u32 hi = movd(__lsx_vsrli_d(in, 32));
++    return (u64a)hi << 32 | lo;
++}
++
++/* another form of movq */
++static really_inline
++m128 load_m128_from_u64a(const u64a *p) {
++    return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(0LL), *p, 0);
++}
++
++#define L(a)  __lsx_vand_v(a, __lsx_vinsgr2vr_d(__lsx_vldi(0), 0xFFFFFFFFFFFFFFFF, 0))
++#define M(a) __lsx_vand_v(a, __lsx_vinsgr2vr_d(__lsx_vldi(0xFF), 0x0000000000000000, 0))
++#define N(a) __lsx_vpickod_d(__lsx_vldi(0),a)
++#define U(a) __lsx_vpickev_d(a, __lsx_vldi(0))
++#define rshiftbyte_m128(a, count_immed) \
++        (((count_immed) < 8) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (8*count_immed)), __lsx_vor_v(__lsx_vsrli_d(L(a), (8*count_immed)), __lsx_vslli_d(N(a), (64-(8*count_immed)))))) : (__lsx_vsrli_d(N(a),((8*count_immed)-64))))
++
++#define lshiftbyte_m128(a, count_immed) \
++        (((count_immed) < 8) ? (__lsx_vor_v(__lsx_vslli_d(L(a), (8*count_immed)), __lsx_vor_v(__lsx_vslli_d(M(a), (8*count_immed)), __lsx_vsrli_d(U(a), (64-(8*count_immed)))))) : (__lsx_vslli_d(U(a),((8*count_immed)-64))))
++
++#define extract32from128(a, imm) \
++        (((imm) < 2) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (32*imm)), __lsx_vor_v(__lsx_vsrli_d(L(a), (32*imm)), __lsx_vslli_d(N(a), (64-(32*imm)))))) : (__lsx_vsrli_d(N(a),((32*imm)-64))))
++#define extract64from128(a, imm) \
++        (((imm) < 1) ? (__lsx_vor_v(__lsx_vsrli_d(M(a), (64*imm)), __lsx_vor_v(__lsx_vsrli_d(L(a), (64*imm)), __lsx_vslli_d(N(a), (64-(64*imm)))))) : (__lsx_vsrli_d(N(a),((64*imm)-64))))
++
++#define extractlow64from256(a) movq(a.lo)
++#define extractlow32from256(a) movd(a.lo)
++
++static really_inline m128 and128(m128 a, m128 b) {
++    return __lsx_vand_v(a,b);
++}
++
++static really_inline m128 xor128(m128 a, m128 b) {
++    return __lsx_vxor_v(a,b);
++}
++
++static really_inline m128 or128(m128 a, m128 b) {
++    return __lsx_vor_v(a,b);
++}
++
++static really_inline m128 andnot128(m128 a, m128 b) {
++    return __lsx_vandn_v(a,b);
++}
++
++// aligned load
++static really_inline m128 load128(const void *ptr) {
++    assert(ISALIGNED_N(ptr, alignof(m128)));
++    ptr = assume_aligned(ptr, 16);
++    return __lsx_vldx((const m128 *)ptr,0);
++}
++
++// aligned store
++static really_inline void store128(void *ptr, m128 a) {
++    assert(ISALIGNED_N(ptr, alignof(m128)));
++    ptr = assume_aligned(ptr, 16);
++    *(m128 *)ptr = a;
++}
++
++// unaligned load
++static really_inline m128 loadu128(const void *ptr) {
++    return __lsx_vldx((const m128 *)ptr,0);
++}
++
++// unaligned store
++static really_inline void storeu128(void *ptr, m128 a) {
++    __lsx_vst(a,(m128 *)ptr,0);
++}
++
++// packed unaligned store of first N bytes
++static really_inline
++void storebytes128(void *ptr, m128 a, unsigned int n) {
++    assert(n <= sizeof(a));
++    memcpy(ptr, &a, n);
++}
++
++// packed unaligned load of first N bytes, pad with zero
++static really_inline
++m128 loadbytes128(const void *ptr, unsigned int n) {
++    m128 a = zeroes128();
++    assert(n <= sizeof(a));
++    memcpy(&a, ptr, n);
++    return a;
++}
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++extern const u8 simd_onebit_masks[];
++#ifdef __cplusplus
++}
++#endif
++
++static really_inline
++m128 mask1bit128(unsigned int n) {
++    assert(n < sizeof(m128) * 8);
++    u32 mask_idx = ((n % 8) * 64) + 95;
++    mask_idx -= n / 8;
++    return loadu128(&simd_onebit_masks[mask_idx]);
++}
++
++// switches on bit N in the given vector.
++static really_inline
++void setbit128(m128 *ptr, unsigned int n) {
++    *ptr = or128(mask1bit128(n), *ptr);
++}
++
++// switches off bit N in the given vector.
++static really_inline
++void clearbit128(m128 *ptr, unsigned int n) {
++    *ptr = andnot128(mask1bit128(n), *ptr);
++}
++
++// tests bit N in the given vector.
++static really_inline
++char testbit128(m128 val, unsigned int n) {
++    const m128 mask = mask1bit128(n);
++    return isnonzero128(and128(mask, val));
++}
++
++#define palignr(r, l, offset) \
++       (((offset) < 8) ? __lsx_vor_v(rshiftbyte_m128(l,(offset)),lshiftbyte_m128(U(r),(8-(offset)))) : __lsx_vor_v(rshiftbyte_m128(l,(offset)), lshiftbyte_m128(r,(16-(offset)))))
++
++static really_inline
++m128 shuffle_epi8(m128 a, m128 b) {
++	m128 tmp1,tmp2,tmp3,dst;
++        tmp1 = ~(__lsx_vslt_b(b,__lsx_vldi(0)));
++        tmp2 = __lsx_vand_v(b,tmp1);
++        tmp3 = __lsx_vand_v(tmp2, __lsx_vldi(0x0F));
++        unsigned char* p = (unsigned char*)&tmp3;
++        unsigned char* pa = (unsigned char*)&a;
++        for (int i = 0; i < 16; i++) {
++                unsigned char value = p[i];
++                switch(i){
++                        case 0:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 0);}else{dst = tmp3;}break;
++                        case 1:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 1);}else{dst = tmp3;}break;
++                        case 2:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 2);}else{dst = tmp3;}break;
++                        case 3:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 3);}else{dst = tmp3;}break;
++                        case 4:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 4);}else{dst = tmp3;}break;
++                        case 5:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 5);}else{dst = tmp3;}break;
++                        case 6:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 6);}else{dst = tmp3;}break;
++                        case 7:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 7);}else{dst = tmp3;}break;
++                        case 8:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 8);}else{dst = tmp3;}break;
++                        case 9:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 9);}else{dst = tmp3;}break;
++                        case 10:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 10);}else{dst = tmp3;}break;
++                        case 11:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 11);}else{dst = tmp3;}break;
++                        case 12:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 12);}else{dst = tmp3;}break;
++                        case 13:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 13);}else{dst = tmp3;}break;
++                        case 14:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 14);}else{dst = tmp3;}break;
++                        case 15:if(value > 0){dst = __lsx_vinsgr2vr_b(tmp3, pa[value], 15);}else{dst = tmp3;}break;
++                        default:break;
++                }
++                tmp3 = dst;
++        }
++    return dst;
++}
++
++static really_inline
++m128 pshufb_m128(m128 a, m128 b) {
++    m128 result;
++    result = shuffle_epi8(a, b);
++    return result;
++}
++
++static really_inline
++m256 pshufb_m256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = pshufb_m128(a.lo, b.lo);
++    rv.hi = pshufb_m128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline
++m128 variable_byte_shift_m128(m128 in, s32 amount) {
++    assert(amount >= -16 && amount <= 16);
++    m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
++    return pshufb_m128(in, shift_mask);
++}
++
++static really_inline
++m128 max_u8_m128(m128 a, m128 b) {
++    return __lsx_vmax_bu(a, b);
++}
++
++static really_inline
++m128 min_u8_m128(m128 a, m128 b) {
++    return __lsx_vmin_bu(a, b);
++}
++
++static really_inline
++m128 sadd_u8_m128(m128 a, m128 b) {
++    return __lsx_vsadd_bu(a, b);
++}
++
++static really_inline
++m128 sub_u8_m128(m128 a, m128 b) {
++    return __lsx_vsub_b(a, b);
++}
++
++static really_inline
++m128 set64x2(u64a hi, u64a lo) {
++    return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(hi),lo,0);
++}
++
++/****
++ **** 256-bit Primitives
++ ****/
++
++static really_really_inline
++m256 lshift64_m256(m256 a, int b) {
++    m256 rv = a;
++    rv.lo = lshift64_m128(rv.lo, b);
++    rv.hi = lshift64_m128(rv.hi, b);
++    return rv;
++}
++
++static really_inline
++m256 rshift64_m256(m256 a, int b) {
++    m256 rv = a;
++    rv.lo = rshift64_m128(rv.lo, b);
++    rv.hi = rshift64_m128(rv.hi, b);
++    return rv;
++}
++static really_inline
++m256 set32x8(u32 in) {
++    m256 rv;
++    rv.lo = set16x8((u8) in);
++    rv.hi = rv.lo;
++    return rv;
++}
++
++static really_inline
++m256 eq256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = eq128(a.lo, b.lo);
++    rv.hi = eq128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline
++u32 movemask256(m256 a) {
++    u32 lo_mask = movemask128(a.lo);
++    u32 hi_mask = movemask128(a.hi);
++    return lo_mask | (hi_mask << 16);
++}
++
++static really_inline
++m256 set2x128(m128 a) {
++    m256 rv = {a, a};
++    return rv;
++}
++
++static really_inline m256 zeroes256(void) {
++    m256 rv = {zeroes128(), zeroes128()};
++    return rv;
++}
++
++static really_inline m256 ones256(void) {
++    m256 rv = {ones128(), ones128()};
++    return rv;
++}
++
++static really_inline m256 and256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = and128(a.lo, b.lo);
++    rv.hi = and128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m256 or256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = or128(a.lo, b.lo);
++    rv.hi = or128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m256 xor256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = xor128(a.lo, b.lo);
++    rv.hi = xor128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m256 not256(m256 a) {
++    m256 rv;
++    rv.lo = not128(a.lo);
++    rv.hi = not128(a.hi);
++    return rv;
++}
++
++static really_inline m256 andnot256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = andnot128(a.lo, b.lo);
++    rv.hi = andnot128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline int diff256(m256 a, m256 b) {
++    return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
++}
++
++static really_inline int isnonzero256(m256 a) {
++    return isnonzero128(or128(a.lo, a.hi));
++}
++
++/**
++ * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich256(m256 a, m256 b) {
++    m128 z = zeroes128();
++    m128 tmp0,tmp1,tmp2,tmp3;
++    a.lo = __lsx_vseq_w(a.lo, b.lo);
++    a.hi = __lsx_vseq_w(a.hi, b.hi);
++
++    tmp0 =__lsx_vsat_w(a.lo, 15);
++    tmp1 =__lsx_vsat_w(b.hi, 15);
++    tmp2 =__lsx_vsat_h(__lsx_vpickev_h(tmp1, tmp0), 7);
++    tmp3 =__lsx_vsat_h(z, 7);
++    m128 packed = __lsx_vpickev_b(tmp3, tmp2);
++
++    return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xff;
++}
++
++/**
++ * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
++ * returns an 8-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_256(m256 a, m256 b) {
++    u32 d = diffrich256(a, b);
++    return (d | (d >> 1)) & 0x55555555;
++}
++
++// aligned load
++static really_inline m256 load256(const void *ptr) {
++    assert(ISALIGNED_N(ptr, alignof(m256)));
++    m256 rv = {load128(ptr), load128((const char *)ptr + 16)};
++    return rv;
++}
++
++// aligned load  of 128-bit value to low and high part of 256-bit value
++static really_inline m256 load2x128(const void *ptr) {
++    assert(ISALIGNED_N(ptr, alignof(m128)));
++    m256 rv;
++    rv.hi = rv.lo = load128(ptr);
++    return rv;
++}
++
++static really_inline m256 loadu2x128(const void *ptr) {
++    return set2x128(loadu128(ptr));
++}
++
++// aligned store
++static really_inline void store256(void *ptr, m256 a) {
++    assert(ISALIGNED_N(ptr, alignof(m256)));
++    ptr = assume_aligned(ptr, 16);
++    *(m256 *)ptr = a;
++}
++
++// unaligned load
++static really_inline m256 loadu256(const void *ptr) {
++    m256 rv = {loadu128(ptr), loadu128((const char *)ptr + 16)};
++    return rv;
++}
++
++// unaligned store
++static really_inline void storeu256(void *ptr, m256 a) {
++    storeu128(ptr, a.lo);
++    storeu128((char *)ptr + 16, a.hi);
++}
++
++// packed unaligned store of first N bytes
++static really_inline void storebytes256(void *ptr, m256 a, unsigned int n) {
++    assert(n <= sizeof(a));
++    memcpy(ptr, &a, n);
++}
++
++// packed unaligned load of first N bytes, pad with zero
++static really_inline m256 loadbytes256(const void *ptr, unsigned int n) {
++    m256 a = zeroes256();
++    assert(n <= sizeof(a));
++    memcpy(&a, ptr, n);
++    return a;
++}
++
++static really_inline m256 mask1bit256(unsigned int n) {
++    assert(n < sizeof(m256) * 8);
++    u32 mask_idx = ((n % 8) * 64) + 95;
++    mask_idx -= n / 8;
++    return loadu256(&simd_onebit_masks[mask_idx]);
++}
++
++static really_inline m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
++    m256 rv;
++    rv.hi = set64x2(hi_1, hi_0);
++    rv.lo = set64x2(lo_1, lo_0);
++    return rv;
++}
++
++// switches on bit N in the given vector.
++static really_inline void setbit256(m256 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo;
++    } else {
++        sub = &ptr->hi;
++        n -= 128;
++    }
++    setbit128(sub, n);
++}
++
++// switches off bit N in the given vector.
++static really_inline void clearbit256(m256 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo;
++    } else {
++        sub = &ptr->hi;
++        n -= 128;
++    }
++    clearbit128(sub, n);
++}
++
++// tests bit N in the given vector.
++static really_inline char testbit256(m256 val, unsigned int n) {
++    assert(n < sizeof(val) * 8);
++    m128 sub;
++    if (n < 128) {
++        sub = val.lo;
++    } else {
++        sub = val.hi;
++        n -= 128;
++    }
++    return testbit128(sub, n);
++}
++
++static really_really_inline
++m128 movdq_hi(m256 x) {
++    return x.hi;
++}
++
++static really_really_inline m128 movdq_lo(m256 x) { return x.lo;}
++
++static really_inline m256 combine2x128(m128 hi, m128 lo) {
++    m256 rv = {lo, hi};
++    return rv;
++}
++
++/****
++ **** 384-bit Primitives
++ ****/
++
++static really_inline m384 and384(m384 a, m384 b) {
++    m384 rv;
++    rv.lo = and128(a.lo, b.lo);
++    rv.mid = and128(a.mid, b.mid);
++    rv.hi = and128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m384 or384(m384 a, m384 b) {
++    m384 rv;
++    rv.lo = or128(a.lo, b.lo);
++    rv.mid = or128(a.mid, b.mid);
++    rv.hi = or128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m384 xor384(m384 a, m384 b) {
++    m384 rv;
++    rv.lo = xor128(a.lo, b.lo);
++    rv.mid = xor128(a.mid, b.mid);
++    rv.hi = xor128(a.hi, b.hi);
++    return rv;
++}
++static really_inline m384 not384(m384 a) {
++    m384 rv;
++    rv.lo = not128(a.lo);
++    rv.mid = not128(a.mid);
++    rv.hi = not128(a.hi);
++    return rv;
++}
++static really_inline m384 andnot384(m384 a, m384 b) {
++    m384 rv;
++    rv.lo = andnot128(a.lo, b.lo);
++    rv.mid = andnot128(a.mid, b.mid);
++    rv.hi = andnot128(a.hi, b.hi);
++    return rv;
++}
++
++static really_really_inline m384 lshift64_m384(m384 a, unsigned b) {
++    m384 rv;
++    rv.lo = lshift64_m128(a.lo, b);
++    rv.mid = lshift64_m128(a.mid, b);
++    rv.hi = lshift64_m128(a.hi, b);
++    return rv;
++}
++
++static really_inline m384 zeroes384(void) {
++    m384 rv = {zeroes128(), zeroes128(), zeroes128()};
++    return rv;
++}
++
++static really_inline m384 ones384(void) {
++    m384 rv = {ones128(), ones128(), ones128()};
++    return rv;
++}
++
++static really_inline int diff384(m384 a, m384 b) {
++    return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
++}
++
++static really_inline int isnonzero384(m384 a) {
++    return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
++}
++
++/**
++ * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich384(m384 a, m384 b) {
++    m128 z = zeroes128();
++    m128 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5;
++    a.lo = __lsx_vseq_w(a.lo, b.lo);
++    a.mid = __lsx_vseq_w(a.mid, b.mid);
++    a.hi = __lsx_vseq_w(a.hi, b.hi);
++
++    tmp0 = __lsx_vsat_w(a.lo, 15);
++    tmp1 = __lsx_vsat_w(b.mid, 15);
++
++    tmp2 = __lsx_vsat_w(b.hi, 15);
++    tmp3 = __lsx_vsat_w(z, 15);
++
++    tmp4 = __lsx_vsat_h(__lsx_vpickev_h(tmp1, tmp0),7);
++    tmp5 = __lsx_vsat_h(__lsx_vpickev_h(tmp3, tmp2),7);
++
++    m128 packed = __lsx_vpickev_b(tmp5,tmp4);
++
++    return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xfff;
++}
++
++/**
++ * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
++ * returns a 12-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_384(m384 a, m384 b) {
++    u32 d = diffrich384(a, b);
++    return (d | (d >> 1)) & 0x55555555;
++}
++
++// aligned load
++static really_inline m384 load384(const void *ptr) {
++    assert(ISALIGNED_16(ptr));
++    m384 rv = {load128(ptr), load128((const char *)ptr + 16),
++               load128((const char *)ptr + 32)};
++    return rv;
++}
++
++// aligned store
++static really_inline void store384(void *ptr, m384 a) {
++    assert(ISALIGNED_16(ptr));
++    ptr = assume_aligned(ptr, 16);
++    *(m384 *)ptr = a;
++}
++
++// unaligned load
++static really_inline m384 loadu384(const void *ptr) {
++    m384 rv = {loadu128(ptr), loadu128((const char *)ptr + 16),
++               loadu128((const char *)ptr + 32)};
++    return rv;
++}
++
++// packed unaligned store of first N bytes
++static really_inline void storebytes384(void *ptr, m384 a, unsigned int n) {
++    assert(n <= sizeof(a));
++    memcpy(ptr, &a, n);
++}
++
++// packed unaligned load of first N bytes, pad with zero
++static really_inline m384 loadbytes384(const void *ptr, unsigned int n) {
++    m384 a = zeroes384();
++    assert(n <= sizeof(a));
++    memcpy(&a, ptr, n);
++    return a;
++}
++
++// switches on bit N in the given vector.
++static really_inline void setbit384(m384 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo;
++    } else if (n < 256) {
++        sub = &ptr->mid;
++    } else {
++        sub = &ptr->hi;
++    }
++    setbit128(sub, n % 128);
++}
++
++// switches off bit N in the given vector.
++static really_inline void clearbit384(m384 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo;
++    } else if (n < 256) {
++        sub = &ptr->mid;
++    } else {
++        sub = &ptr->hi;
++    }
++    clearbit128(sub, n % 128);
++}
++
++// tests bit N in the given vector.
++static really_inline char testbit384(m384 val, unsigned int n) {
++    assert(n < sizeof(val) * 8);
++    m128 sub;
++    if (n < 128) {
++        sub = val.lo;
++    } else if (n < 256) {
++        sub = val.mid;
++    } else {
++        sub = val.hi;
++    }
++    return testbit128(sub, n % 128);
++}
++
++/****
++ **** 512-bit Primitives
++ ****/
++
++static really_inline m512 zeroes512(void) {
++    m512 rv = {zeroes256(), zeroes256()};
++    return rv;
++}
++
++static really_inline m512 ones512(void) {
++    m512 rv = {ones256(), ones256()};
++    return rv;
++}
++
++static really_inline m512 and512(m512 a, m512 b) {
++    m512 rv;
++    rv.lo = and256(a.lo, b.lo);
++    rv.hi = and256(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m512 or512(m512 a, m512 b) {
++    m512 rv;
++    rv.lo = or256(a.lo, b.lo);
++    rv.hi = or256(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m512 xor512(m512 a, m512 b) {
++    m512 rv;
++    rv.lo = xor256(a.lo, b.lo);
++    rv.hi = xor256(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m512 not512(m512 a) {
++    m512 rv;
++    rv.lo = not256(a.lo);
++    rv.hi = not256(a.hi);
++    return rv;
++}
++
++static really_inline m512 andnot512(m512 a, m512 b) {
++    m512 rv;
++    rv.lo = andnot256(a.lo, b.lo);
++    rv.hi = andnot256(a.hi, b.hi);
++    return rv;
++}
++
++static really_really_inline m512 lshift64_m512(m512 a, unsigned b) {
++    m512 rv;
++    rv.lo = lshift64_m256(a.lo, b);
++    rv.hi = lshift64_m256(a.hi, b);
++    return rv;
++}
++
++static really_inline int diff512(m512 a, m512 b) {
++    return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
++}
++
++static really_inline int isnonzero512(m512 a) {
++    m128 x = or128(a.lo.lo, a.lo.hi);
++    m128 y = or128(a.hi.lo, a.hi.hi);
++    return isnonzero128(or128(x, y));
++}
++
++/**
++ * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich512(m512 a, m512 b) {
++    m128 tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6,tmp7;
++    a.lo.lo = __lsx_vseq_w(a.lo.lo, b.lo.lo);
++    a.lo.hi = __lsx_vseq_w(a.lo.hi, b.lo.hi);
++    a.hi.lo = __lsx_vseq_w(a.hi.lo, b.hi.lo);
++    a.hi.hi = __lsx_vseq_w(a.hi.hi, b.hi.hi);
++   
++    tmp0 =__lsx_vsat_w(a.lo.lo, 15);
++    tmp1 =__lsx_vsat_w(a.lo.hi, 15);
++    tmp2 =__lsx_vpickev_h(tmp1, tmp0);
++
++    tmp3 =__lsx_vsat_w(a.hi.lo, 15);
++    tmp4 =__lsx_vsat_w(a.hi.hi, 15);
++    tmp5 =__lsx_vpickev_h(tmp4, tmp3);
++
++    tmp6 =__lsx_vsat_h(tmp2, 7);
++    tmp7 =__lsx_vsat_h(tmp5, 7);
++    m128 packed = __lsx_vpickev_b(tmp7, tmp6);
++
++    return ~(__lsx_vpickve2gr_hu(__lsx_vmskltz_b(packed), 0)) & 0xffff;   // ok
++}
++
++/**
++ * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
++ * returns a 16-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_512(m512 a, m512 b) {
++    u32 d = diffrich512(a, b);
++    return (d | (d >> 1)) & 0x55555555;
++}
++
++// aligned load
++static really_inline m512 load512(const void *ptr) {
++    assert(ISALIGNED_N(ptr, alignof(m256)));
++    m512 rv = {load256(ptr), load256((const char *)ptr + 32)};
++    return rv;
++}
++
++// aligned store
++static really_inline void store512(void *ptr, m512 a) {
++    assert(ISALIGNED_N(ptr, alignof(m512)));
++    ptr = assume_aligned(ptr, 16);
++    *(m512 *)ptr = a;
++}
++
++// unaligned load
++static really_inline m512 loadu512(const void *ptr) {
++    m512 rv = {loadu256(ptr), loadu256((const char *)ptr + 32)};
++    return rv;
++}
++
++// packed unaligned store of first N bytes
++static really_inline void storebytes512(void *ptr, m512 a, unsigned int n) {
++    assert(n <= sizeof(a));
++    memcpy(ptr, &a, n);
++}
++
++// packed unaligned load of first N bytes, pad with zero
++static really_inline m512 loadbytes512(const void *ptr, unsigned int n) {
++    m512 a = zeroes512();
++    assert(n <= sizeof(a));
++    memcpy(&a, ptr, n);
++    return a;
++}
++
++static really_inline m512 mask1bit512(unsigned int n) {
++    assert(n < sizeof(m512) * 8);
++    u32 mask_idx = ((n % 8) * 64) + 95;
++    mask_idx -= n / 8;
++    return loadu512(&simd_onebit_masks[mask_idx]);
++}
++
++// switches on bit N in the given vector.
++static really_inline void setbit512(m512 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo.lo;
++    } else if (n < 256) {
++        sub = &ptr->lo.hi;
++    } else if (n < 384) {
++        sub = &ptr->hi.lo;
++    } else {
++        sub = &ptr->hi.hi;
++    }
++    setbit128(sub, n % 128);
++}
++
++// switches off bit N in the given vector.
++static really_inline void clearbit512(m512 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo.lo;
++    } else if (n < 256) {
++        sub = &ptr->lo.hi;
++    } else if (n < 384) {
++        sub = &ptr->hi.lo;
++    } else {
++        sub = &ptr->hi.hi;
++    }
++    clearbit128(sub, n % 128);
++}
++
++// tests bit N in the given vector.
++static really_inline char testbit512(m512 val, unsigned int n) {
++    assert(n < sizeof(val) * 8);
++    m128 sub;
++    if (n < 128) {
++        sub = val.lo.lo;
++    } else if (n < 256) {
++        sub = val.lo.hi;
++    } else if (n < 384) {
++        sub = val.hi.lo;
++    } else {
++        sub = val.hi.hi;
++    }
++    return testbit128(sub, n % 128);
++}
++
++#endif
+diff --git a/src/util/simd_types.h b/src/util/simd_types.h
+index 962cad6..3831423 100644
+--- a/src/util/simd_types.h
++++ b/src/util/simd_types.h
+@@ -33,9 +33,12 @@
+ #include "util/arch.h"
+ #include "util/intrinsics.h"
+ #include "ue2common.h"
++#include <lsxintrin.h>
+ 
+ #if defined(HAVE_SSE2)
+ typedef __m128i m128;
++#elif defined(ARCH_LOONGARCH64)
++typedef __m128i m128;
+ #else
+ typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128;
+ #endif
+diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
+index 5fa727e..86760a4 100644
+--- a/src/util/simd_utils.h
++++ b/src/util/simd_utils.h
+@@ -33,1388 +33,10 @@
+ #ifndef SIMD_UTILS
+ #define SIMD_UTILS
+ 
+-#if !defined(_WIN32) && !defined(__SSSE3__)
+-#error SSSE3 instructions must be enabled
++#if defined(__x86_64__)
++#include "simd_x86.h"
++#elif defined(__loongarch64)
++#include "simd_loongarch.h"
+ #endif
+ 
+-#include "config.h"
+-#include "ue2common.h"
+-#include "simd_types.h"
+-#include "unaligned.h"
+-#include "util/arch.h"
+-#include "util/intrinsics.h"
+-
+-#include <string.h> // for memcpy
+-
+-// Define a common assume_aligned using an appropriate compiler built-in, if
+-// it's available. Note that we need to handle C or C++ compilation.
+-#ifdef __cplusplus
+-#  ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
+-#    define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
+-#  endif
+-#else
+-#  ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
+-#    define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
+-#  endif
+-#endif
+-
+-// Fallback to identity case.
+-#ifndef assume_aligned
+-#define assume_aligned(x, y) (x)
+-#endif
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-extern const char vbs_mask_data[];
+-#ifdef __cplusplus
+-}
+-#endif
+-
+-static really_inline m128 ones128(void) {
+-#if defined(__GNUC__) || defined(__INTEL_COMPILER)
+-    /* gcc gets this right */
+-    return _mm_set1_epi8(0xFF);
+-#else
+-    /* trick from Intel's optimization guide to generate all-ones.
+-     * ICC converts this to the single cmpeq instruction */
+-    return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
+-#endif
+-}
+-
+-static really_inline m128 zeroes128(void) {
+-    return _mm_setzero_si128();
+-}
+-
+-/** \brief Bitwise not for m128*/
+-static really_inline m128 not128(m128 a) {
+-    return _mm_xor_si128(a, ones128());
+-}
+-
+-/** \brief Return 1 if a and b are different otherwise 0 */
+-static really_inline int diff128(m128 a, m128 b) {
+-    return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
+-}
+-
+-static really_inline int isnonzero128(m128 a) {
+-    return !!diff128(a, zeroes128());
+-}
+-
+-/**
+- * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
+- * mask indicating which 32-bit words contain differences.
+- */
+-static really_inline u32 diffrich128(m128 a, m128 b) {
+-    a = _mm_cmpeq_epi32(a, b);
+-    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
+-}
+-
+-/**
+- * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
+- * returns a 4-bit mask indicating which 64-bit words contain differences.
+- */
+-static really_inline u32 diffrich64_128(m128 a, m128 b) {
+-#if defined(HAVE_SSE41)
+-    a = _mm_cmpeq_epi64(a, b);
+-    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
+-#else
+-    u32 d = diffrich128(a, b);
+-    return (d | (d >> 1)) & 0x5;
+-#endif
+-}
+-
+-static really_really_inline
+-m128 lshift64_m128(m128 a, unsigned b) {
+-#if defined(HAVE__BUILTIN_CONSTANT_P)
+-    if (__builtin_constant_p(b)) {
+-        return _mm_slli_epi64(a, b);
+-    }
+-#endif
+-    m128 x = _mm_cvtsi32_si128(b);
+-    return _mm_sll_epi64(a, x);
+-}
+-
+-#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
+-#define eq128(a, b)      _mm_cmpeq_epi8((a), (b))
+-#define movemask128(a)  ((u32)_mm_movemask_epi8((a)))
+-
+-#if defined(HAVE_AVX512)
+-static really_inline m128 cast512to128(const m512 in) {
+-    return _mm512_castsi512_si128(in);
+-}
+-#endif
+-
+-static really_inline m128 set16x8(u8 c) {
+-    return _mm_set1_epi8(c);
+-}
+-
+-static really_inline m128 set4x32(u32 c) {
+-    return _mm_set1_epi32(c);
+-}
+-
+-static really_inline u32 movd(const m128 in) {
+-    return _mm_cvtsi128_si32(in);
+-}
+-
+-static really_inline u64a movq(const m128 in) {
+-#if defined(ARCH_X86_64)
+-    return _mm_cvtsi128_si64(in);
+-#else // 32-bit - this is horrific
+-    u32 lo = movd(in);
+-    u32 hi = movd(_mm_srli_epi64(in, 32));
+-    return (u64a)hi << 32 | lo;
+-#endif
+-}
+-
+-#if defined(HAVE_AVX512)
+-static really_inline u32 movd512(const m512 in) {
+-    // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in),
+-    //       so we use 2-step convertions to work around.
+-    return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
+-}
+-
+-static really_inline u64a movq512(const m512 in) {
+-    // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
+-    //       so we use 2-step convertions to work around.
+-    return movq(_mm512_castsi512_si128(in));
+-}
+-#endif
+-
+-/* another form of movq */
+-static really_inline
+-m128 load_m128_from_u64a(const u64a *p) {
+-    return _mm_set_epi64x(0LL, *p);
+-}
+-
+-#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
+-#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
+-
+-#if defined(HAVE_SSE41)
+-#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
+-#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
+-#else
+-#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2))
+-#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3))
+-#endif
+-
+-#if !defined(HAVE_AVX2)
+-// TODO: this entire file needs restructuring - this carveout is awful
+-#define extractlow64from256(a) movq(a.lo)
+-#define extractlow32from256(a) movd(a.lo)
+-#if defined(HAVE_SSE41)
+-#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
+-#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
+-#else
+-#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4))
+-#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8))
+-#endif
+-
+-#endif // !AVX2
+-
+-static really_inline m128 and128(m128 a, m128 b) {
+-    return _mm_and_si128(a,b);
+-}
+-
+-static really_inline m128 xor128(m128 a, m128 b) {
+-    return _mm_xor_si128(a,b);
+-}
+-
+-static really_inline m128 or128(m128 a, m128 b) {
+-    return _mm_or_si128(a,b);
+-}
+-
+-#if defined(HAVE_AVX512VBMI)
+-static really_inline m512 expand128(m128 a) {
+-    return _mm512_broadcast_i32x4(a);
+-}
+-
+-static really_inline m512 expand256(m256 a) {
+-    return _mm512_broadcast_i64x4(a);
+-}
+-
+-static really_inline m512 expand384(m384 a) {
+-    u64a *lo = (u64a*)&a.lo;
+-    u64a *mid = (u64a*)&a.mid;
+-    u64a *hi = (u64a*)&a.hi;
+-    return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0],
+-                            lo[1], lo[0]);
+-}
+-#endif
+-
+-static really_inline m128 andnot128(m128 a, m128 b) {
+-    return _mm_andnot_si128(a, b);
+-}
+-
+-// aligned load
+-static really_inline m128 load128(const void *ptr) {
+-    assert(ISALIGNED_N(ptr, alignof(m128)));
+-    ptr = assume_aligned(ptr, 16);
+-    return _mm_load_si128((const m128 *)ptr);
+-}
+-
+-// aligned store
+-static really_inline void store128(void *ptr, m128 a) {
+-    assert(ISALIGNED_N(ptr, alignof(m128)));
+-    ptr = assume_aligned(ptr, 16);
+-    *(m128 *)ptr = a;
+-}
+-
+-// unaligned load
+-static really_inline m128 loadu128(const void *ptr) {
+-    return _mm_loadu_si128((const m128 *)ptr);
+-}
+-
+-// unaligned store
+-static really_inline void storeu128(void *ptr, m128 a) {
+-    _mm_storeu_si128 ((m128 *)ptr, a);
+-}
+-
+-// packed unaligned store of first N bytes
+-static really_inline
+-void storebytes128(void *ptr, m128 a, unsigned int n) {
+-    assert(n <= sizeof(a));
+-    memcpy(ptr, &a, n);
+-}
+-
+-// packed unaligned load of first N bytes, pad with zero
+-static really_inline
+-m128 loadbytes128(const void *ptr, unsigned int n) {
+-    m128 a = zeroes128();
+-    assert(n <= sizeof(a));
+-    memcpy(&a, ptr, n);
+-    return a;
+-}
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-extern const u8 simd_onebit_masks[];
+-#ifdef __cplusplus
+-}
+-#endif
+-
+-static really_inline
+-m128 mask1bit128(unsigned int n) {
+-    assert(n < sizeof(m128) * 8);
+-    u32 mask_idx = ((n % 8) * 64) + 95;
+-    mask_idx -= n / 8;
+-    return loadu128(&simd_onebit_masks[mask_idx]);
+-}
+-
+-// switches on bit N in the given vector.
+-static really_inline
+-void setbit128(m128 *ptr, unsigned int n) {
+-    *ptr = or128(mask1bit128(n), *ptr);
+-}
+-
+-// switches off bit N in the given vector.
+-static really_inline
+-void clearbit128(m128 *ptr, unsigned int n) {
+-    *ptr = andnot128(mask1bit128(n), *ptr);
+-}
+-
+-// tests bit N in the given vector.
+-static really_inline
+-char testbit128(m128 val, unsigned int n) {
+-    const m128 mask = mask1bit128(n);
+-#if defined(HAVE_SSE41)
+-    return !_mm_testz_si128(mask, val);
+-#else
+-    return isnonzero128(and128(mask, val));
+-#endif
+-}
+-
+-// offset must be an immediate
+-#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
+-
+-static really_inline
+-m128 pshufb_m128(m128 a, m128 b) {
+-    m128 result;
+-    result = _mm_shuffle_epi8(a, b);
+-    return result;
+-}
+-
+-static really_inline
+-m256 pshufb_m256(m256 a, m256 b) {
+-#if defined(HAVE_AVX2)
+-    return _mm256_shuffle_epi8(a, b);
+-#else
+-    m256 rv;
+-    rv.lo = pshufb_m128(a.lo, b.lo);
+-    rv.hi = pshufb_m128(a.hi, b.hi);
+-    return rv;
+-#endif
+-}
+-
+-#if defined(HAVE_AVX512)
+-static really_inline
+-m512 pshufb_m512(m512 a, m512 b) {
+-    return _mm512_shuffle_epi8(a, b);
+-}
+-
+-static really_inline
+-m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
+-    return _mm512_maskz_shuffle_epi8(k, a, b);
+-}
+-
+-#if defined(HAVE_AVX512VBMI)
+-#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a)
+-#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a)
+-#endif
+-
+-#endif
+-
+-static really_inline
+-m128 variable_byte_shift_m128(m128 in, s32 amount) {
+-    assert(amount >= -16 && amount <= 16);
+-    m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
+-    return pshufb_m128(in, shift_mask);
+-}
+-
+-static really_inline
+-m128 max_u8_m128(m128 a, m128 b) {
+-    return _mm_max_epu8(a, b);
+-}
+-
+-static really_inline
+-m128 min_u8_m128(m128 a, m128 b) {
+-    return _mm_min_epu8(a, b);
+-}
+-
+-static really_inline
+-m128 sadd_u8_m128(m128 a, m128 b) {
+-    return _mm_adds_epu8(a, b);
+-}
+-
+-static really_inline
+-m128 sub_u8_m128(m128 a, m128 b) {
+-    return _mm_sub_epi8(a, b);
+-}
+-
+-static really_inline
+-m128 set64x2(u64a hi, u64a lo) {
+-    return _mm_set_epi64x(hi, lo);
+-}
+-
+-/****
+- **** 256-bit Primitives
+- ****/
+-
+-#if defined(HAVE_AVX2)
+-
+-static really_really_inline
+-m256 lshift64_m256(m256 a, unsigned b) {
+-#if defined(HAVE__BUILTIN_CONSTANT_P)
+-    if (__builtin_constant_p(b)) {
+-        return _mm256_slli_epi64(a, b);
+-    }
+-#endif
+-    m128 x = _mm_cvtsi32_si128(b);
+-    return _mm256_sll_epi64(a, x);
+-}
+-
+-#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
+-
+-static really_inline
+-m256 set32x8(u32 in) {
+-    return _mm256_set1_epi8(in);
+-}
+-
+-#define eq256(a, b)     _mm256_cmpeq_epi8((a), (b))
+-#define movemask256(a)  ((u32)_mm256_movemask_epi8((a)))
+-
+-static really_inline
+-m256 set2x128(m128 a) {
+-    return _mm256_broadcastsi128_si256(a);
+-}
+-
+-#else
+-
+-static really_really_inline
+-m256 lshift64_m256(m256 a, int b) {
+-    m256 rv = a;
+-    rv.lo = lshift64_m128(rv.lo, b);
+-    rv.hi = lshift64_m128(rv.hi, b);
+-    return rv;
+-}
+-
+-static really_inline
+-m256 rshift64_m256(m256 a, int b) {
+-    m256 rv = a;
+-    rv.lo = rshift64_m128(rv.lo, b);
+-    rv.hi = rshift64_m128(rv.hi, b);
+-    return rv;
+-}
+-static really_inline
+-m256 set32x8(u32 in) {
+-    m256 rv;
+-    rv.lo = set16x8((u8) in);
+-    rv.hi = rv.lo;
+-    return rv;
+-}
+-
+-static really_inline
+-m256 eq256(m256 a, m256 b) {
+-    m256 rv;
+-    rv.lo = eq128(a.lo, b.lo);
+-    rv.hi = eq128(a.hi, b.hi);
+-    return rv;
+-}
+-
+-static really_inline
+-u32 movemask256(m256 a) {
+-    u32 lo_mask = movemask128(a.lo);
+-    u32 hi_mask = movemask128(a.hi);
+-    return lo_mask | (hi_mask << 16);
+-}
+-
+-static really_inline
+-m256 set2x128(m128 a) {
+-    m256 rv = {a, a};
+-    return rv;
+-}
+-#endif
+-
+-static really_inline m256 zeroes256(void) {
+-#if defined(HAVE_AVX2)
+-    return _mm256_setzero_si256();
+-#else
+-    m256 rv = {zeroes128(), zeroes128()};
+-    return rv;
+-#endif
+-}
+-
+-static really_inline m256 ones256(void) {
+-#if defined(HAVE_AVX2)
+-    m256 rv = _mm256_set1_epi8(0xFF);
+-#else
+-    m256 rv = {ones128(), ones128()};
+-#endif
+-    return rv;
+-}
+-
+-#if defined(HAVE_AVX2)
+-static really_inline m256 and256(m256 a, m256 b) {
+-    return _mm256_and_si256(a, b);
+-}
+-#else
+-static really_inline m256 and256(m256 a, m256 b) {
+-    m256 rv;
+-    rv.lo = and128(a.lo, b.lo);
+-    rv.hi = and128(a.hi, b.hi);
+-    return rv;
+-}
+-#endif
+-
+-#if defined(HAVE_AVX2)
+-static really_inline m256 or256(m256 a, m256 b) {
+-    return _mm256_or_si256(a, b);
+-}
+-#else
+-static really_inline m256 or256(m256 a, m256 b) {
+-    m256 rv;
+-    rv.lo = or128(a.lo, b.lo);
+-    rv.hi = or128(a.hi, b.hi);
+-    return rv;
+-}
+-#endif
+-
+-#if defined(HAVE_AVX2)
+-static really_inline m256 xor256(m256 a, m256 b) {
+-    return _mm256_xor_si256(a, b);
+-}
+-#else
+-static really_inline m256 xor256(m256 a, m256 b) {
+-    m256 rv;
+-    rv.lo = xor128(a.lo, b.lo);
+-    rv.hi = xor128(a.hi, b.hi);
+-    return rv;
+-}
+-#endif
+-
+-#if defined(HAVE_AVX2)
+-static really_inline m256 not256(m256 a) {
+-    return _mm256_xor_si256(a, ones256());
+-}
+-#else
+-static really_inline m256 not256(m256 a) {
+-    m256 rv;
+-    rv.lo = not128(a.lo);
+-    rv.hi = not128(a.hi);
+-    return rv;
+-}
+-#endif
+-
+-#if defined(HAVE_AVX2)
+-static really_inline m256 andnot256(m256 a, m256 b) {
+-    return _mm256_andnot_si256(a, b);
+-}
+-#else
+-static really_inline m256 andnot256(m256 a, m256 b) {
+-    m256 rv;
+-    rv.lo = andnot128(a.lo, b.lo);
+-    rv.hi = andnot128(a.hi, b.hi);
+-    return rv;
+-}
+-#endif
+-
+-static really_inline int diff256(m256 a, m256 b) {
+-#if defined(HAVE_AVX2)
+-    return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
+-#else
+-    return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
+-#endif
+-}
+-
+-static really_inline int isnonzero256(m256 a) {
+-#if defined(HAVE_AVX2)
+-    return !!diff256(a, zeroes256());
+-#else
+-    return isnonzero128(or128(a.lo, a.hi));
+-#endif
+-}
+-
+-/**
+- * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
+- * mask indicating which 32-bit words contain differences.
+- */
+-static really_inline u32 diffrich256(m256 a, m256 b) {
+-#if defined(HAVE_AVX2)
+-    a = _mm256_cmpeq_epi32(a, b);
+-    return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
+-#else
+-    m128 z = zeroes128();
+-    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
+-    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
+-    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z);
+-    return ~(_mm_movemask_epi8(packed)) & 0xff;
+-#endif
+-}
+-
+-/**
+- * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
+- * returns an 8-bit mask indicating which 64-bit words contain differences.
+- */
+-static really_inline u32 diffrich64_256(m256 a, m256 b) {
+-    u32 d = diffrich256(a, b);
+-    return (d | (d >> 1)) & 0x55555555;
+-}
+-
+-// aligned load
+-static really_inline m256 load256(const void *ptr) {
+-    assert(ISALIGNED_N(ptr, alignof(m256)));
+-#if defined(HAVE_AVX2)
+-    return _mm256_load_si256((const m256 *)ptr);
+-#else
+-    m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
+-    return rv;
+-#endif
+-}
+-
+-// aligned load  of 128-bit value to low and high part of 256-bit value
+-static really_inline m256 load2x128(const void *ptr) {
+-#if defined(HAVE_AVX2)
+-    return set2x128(load128(ptr));
+-#else
+-    assert(ISALIGNED_N(ptr, alignof(m128)));
+-    m256 rv;
+-    rv.hi = rv.lo = load128(ptr);
+-    return rv;
+-#endif
+-}
+-
+-static really_inline m256 loadu2x128(const void *ptr) {
+-    return set2x128(loadu128(ptr));
+-}
+-
+-// aligned store
+-static really_inline void store256(void *ptr, m256 a) {
+-    assert(ISALIGNED_N(ptr, alignof(m256)));
+-#if defined(HAVE_AVX2)
+-    _mm256_store_si256((m256 *)ptr, a);
+-#else
+-    ptr = assume_aligned(ptr, 16);
+-    *(m256 *)ptr = a;
+-#endif
+-}
+-
+-// unaligned load
+-static really_inline m256 loadu256(const void *ptr) {
+-#if defined(HAVE_AVX2)
+-    return _mm256_loadu_si256((const m256 *)ptr);
+-#else
+-    m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
+-    return rv;
+-#endif
+-}
+-
+-// unaligned store
+-static really_inline void storeu256(void *ptr, m256 a) {
+-#if defined(HAVE_AVX2)
+-    _mm256_storeu_si256((m256 *)ptr, a);
+-#else
+-    storeu128(ptr, a.lo);
+-    storeu128((char *)ptr + 16, a.hi);
+-#endif
+-}
+-
+-// packed unaligned store of first N bytes
+-static really_inline
+-void storebytes256(void *ptr, m256 a, unsigned int n) {
+-    assert(n <= sizeof(a));
+-    memcpy(ptr, &a, n);
+-}
+-
+-// packed unaligned load of first N bytes, pad with zero
+-static really_inline
+-m256 loadbytes256(const void *ptr, unsigned int n) {
+-    m256 a = zeroes256();
+-    assert(n <= sizeof(a));
+-    memcpy(&a, ptr, n);
+-    return a;
+-}
+-
+-static really_inline
+-m256 mask1bit256(unsigned int n) {
+-    assert(n < sizeof(m256) * 8);
+-    u32 mask_idx = ((n % 8) * 64) + 95;
+-    mask_idx -= n / 8;
+-    return loadu256(&simd_onebit_masks[mask_idx]);
+-}
+-
+-static really_inline
+-m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
+-#if defined(HAVE_AVX2)
+-    return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
+-#else
+-    m256 rv;
+-    rv.hi = set64x2(hi_1, hi_0);
+-    rv.lo = set64x2(lo_1, lo_0);
+-    return rv;
+-#endif
+-}
+-
+-#if !defined(HAVE_AVX2)
+-// switches on bit N in the given vector.
+-static really_inline
+-void setbit256(m256 *ptr, unsigned int n) {
+-    assert(n < sizeof(*ptr) * 8);
+-    m128 *sub;
+-    if (n < 128) {
+-        sub = &ptr->lo;
+-    } else {
+-        sub = &ptr->hi;
+-        n -= 128;
+-    }
+-    setbit128(sub, n);
+-}
+-
+-// switches off bit N in the given vector.
+-static really_inline
+-void clearbit256(m256 *ptr, unsigned int n) {
+-    assert(n < sizeof(*ptr) * 8);
+-    m128 *sub;
+-    if (n < 128) {
+-        sub = &ptr->lo;
+-    } else {
+-        sub = &ptr->hi;
+-        n -= 128;
+-    }
+-    clearbit128(sub, n);
+-}
+-
+-// tests bit N in the given vector.
+-static really_inline
+-char testbit256(m256 val, unsigned int n) {
+-    assert(n < sizeof(val) * 8);
+-    m128 sub;
+-    if (n < 128) {
+-        sub = val.lo;
+-    } else {
+-        sub = val.hi;
+-        n -= 128;
+-    }
+-    return testbit128(sub, n);
+-}
+-
+-static really_really_inline
+-m128 movdq_hi(m256 x) {
+-    return x.hi;
+-}
+-
+-static really_really_inline
+-m128 movdq_lo(m256 x) {
+-    return x.lo;
+-}
+-
+-static really_inline
+-m256 combine2x128(m128 hi, m128 lo) {
+-    m256 rv = {lo, hi};
+-    return rv;
+-}
+-
+-#else // AVX2
+-
+-// switches on bit N in the given vector.
+-static really_inline
+-void setbit256(m256 *ptr, unsigned int n) {
+-    *ptr = or256(mask1bit256(n), *ptr);
+-}
+-
+-static really_inline
+-void clearbit256(m256 *ptr, unsigned int n) {
+-    *ptr = andnot256(mask1bit256(n), *ptr);
+-}
+-
+-// tests bit N in the given vector.
+-static really_inline
+-char testbit256(m256 val, unsigned int n) {
+-    const m256 mask = mask1bit256(n);
+-    return !_mm256_testz_si256(mask, val);
+-}
+-
+-static really_really_inline
+-m128 movdq_hi(m256 x) {
+-    return _mm256_extracti128_si256(x, 1);
+-}
+-
+-static really_really_inline
+-m128 movdq_lo(m256 x) {
+-    return _mm256_extracti128_si256(x, 0);
+-}
+-
+-#define cast256to128(a) _mm256_castsi256_si128(a)
+-#define cast128to256(a) _mm256_castsi128_si256(a)
+-#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
+-#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
+-#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
+-#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
+-#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
+-#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
+-#define extractlow64from256(a) movq(cast256to128(a))
+-#define extractlow32from256(a) movd(cast256to128(a))
+-#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
+-#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
+-#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset)
+-
+-static really_inline
+-m256 combine2x128(m128 hi, m128 lo) {
+-#if defined(_mm256_set_m128i)
+-    return _mm256_set_m128i(hi, lo);
+-#else
+-    return insert128to256(cast128to256(lo), hi, 1);
+-#endif
+-}
+-#endif //AVX2
+-
+-#if defined(HAVE_AVX512)
+-#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
+-#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b)
+-#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b)
+-#define set2x256(a) _mm512_broadcast_i64x4(a)
+-#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a)
+-#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a)
+-#endif
+-
+-/****
+- **** 384-bit Primitives
+- ****/
+-
+-static really_inline m384 and384(m384 a, m384 b) {
+-    m384 rv;
+-    rv.lo = and128(a.lo, b.lo);
+-    rv.mid = and128(a.mid, b.mid);
+-    rv.hi = and128(a.hi, b.hi);
+-    return rv;
+-}
+-
+-static really_inline m384 or384(m384 a, m384 b) {
+-    m384 rv;
+-    rv.lo = or128(a.lo, b.lo);
+-    rv.mid = or128(a.mid, b.mid);
+-    rv.hi = or128(a.hi, b.hi);
+-    return rv;
+-}
+-
+-static really_inline m384 xor384(m384 a, m384 b) {
+-    m384 rv;
+-    rv.lo = xor128(a.lo, b.lo);
+-    rv.mid = xor128(a.mid, b.mid);
+-    rv.hi = xor128(a.hi, b.hi);
+-    return rv;
+-}
+-static really_inline m384 not384(m384 a) {
+-    m384 rv;
+-    rv.lo = not128(a.lo);
+-    rv.mid = not128(a.mid);
+-    rv.hi = not128(a.hi);
+-    return rv;
+-}
+-static really_inline m384 andnot384(m384 a, m384 b) {
+-    m384 rv;
+-    rv.lo = andnot128(a.lo, b.lo);
+-    rv.mid = andnot128(a.mid, b.mid);
+-    rv.hi = andnot128(a.hi, b.hi);
+-    return rv;
+-}
+-
+-static really_really_inline
+-m384 lshift64_m384(m384 a, unsigned b) {
+-    m384 rv;
+-    rv.lo = lshift64_m128(a.lo, b);
+-    rv.mid = lshift64_m128(a.mid, b);
+-    rv.hi = lshift64_m128(a.hi, b);
+-    return rv;
+-}
+-
+-static really_inline m384 zeroes384(void) {
+-    m384 rv = {zeroes128(), zeroes128(), zeroes128()};
+-    return rv;
+-}
+-
+-static really_inline m384 ones384(void) {
+-    m384 rv = {ones128(), ones128(), ones128()};
+-    return rv;
+-}
+-
+-static really_inline int diff384(m384 a, m384 b) {
+-    return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
+-}
+-
+-static really_inline int isnonzero384(m384 a) {
+-    return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
+-}
+-
+-/**
+- * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
+- * mask indicating which 32-bit words contain differences.
+- */
+-static really_inline u32 diffrich384(m384 a, m384 b) {
+-    m128 z = zeroes128();
+-    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
+-    a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
+-    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
+-    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
+-                                  _mm_packs_epi32(a.hi, z));
+-    return ~(_mm_movemask_epi8(packed)) & 0xfff;
+-}
+-
+-/**
+- * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
+- * returns a 12-bit mask indicating which 64-bit words contain differences.
+- */
+-static really_inline u32 diffrich64_384(m384 a, m384 b) {
+-    u32 d = diffrich384(a, b);
+-    return (d | (d >> 1)) & 0x55555555;
+-}
+-
+-// aligned load
+-static really_inline m384 load384(const void *ptr) {
+-    assert(ISALIGNED_16(ptr));
+-    m384 rv = { load128(ptr), load128((const char *)ptr + 16),
+-                load128((const char *)ptr + 32) };
+-    return rv;
+-}
+-
+-// aligned store
+-static really_inline void store384(void *ptr, m384 a) {
+-    assert(ISALIGNED_16(ptr));
+-    ptr = assume_aligned(ptr, 16);
+-    *(m384 *)ptr = a;
+-}
+-
+-// unaligned load
+-static really_inline m384 loadu384(const void *ptr) {
+-    m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16),
+-                loadu128((const char *)ptr + 32)};
+-    return rv;
+-}
+-
+-// packed unaligned store of first N bytes
+-static really_inline
+-void storebytes384(void *ptr, m384 a, unsigned int n) {
+-    assert(n <= sizeof(a));
+-    memcpy(ptr, &a, n);
+-}
+-
+-// packed unaligned load of first N bytes, pad with zero
+-static really_inline
+-m384 loadbytes384(const void *ptr, unsigned int n) {
+-    m384 a = zeroes384();
+-    assert(n <= sizeof(a));
+-    memcpy(&a, ptr, n);
+-    return a;
+-}
+-
+-// switches on bit N in the given vector.
+-static really_inline
+-void setbit384(m384 *ptr, unsigned int n) {
+-    assert(n < sizeof(*ptr) * 8);
+-    m128 *sub;
+-    if (n < 128) {
+-        sub = &ptr->lo;
+-    } else if (n < 256) {
+-        sub = &ptr->mid;
+-    } else {
+-        sub = &ptr->hi;
+-    }
+-    setbit128(sub, n % 128);
+-}
+-
+-// switches off bit N in the given vector.
+-static really_inline
+-void clearbit384(m384 *ptr, unsigned int n) {
+-    assert(n < sizeof(*ptr) * 8);
+-    m128 *sub;
+-    if (n < 128) {
+-        sub = &ptr->lo;
+-    } else if (n < 256) {
+-        sub = &ptr->mid;
+-    } else {
+-        sub = &ptr->hi;
+-    }
+-    clearbit128(sub, n % 128);
+-}
+-
+-// tests bit N in the given vector.
+-static really_inline
+-char testbit384(m384 val, unsigned int n) {
+-    assert(n < sizeof(val) * 8);
+-    m128 sub;
+-    if (n < 128) {
+-        sub = val.lo;
+-    } else if (n < 256) {
+-        sub = val.mid;
+-    } else {
+-        sub = val.hi;
+-    }
+-    return testbit128(sub, n % 128);
+-}
+-
+-/****
+- **** 512-bit Primitives
+- ****/
+-
+-#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
+-#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))
+-
+-static really_inline
+-m512 zeroes512(void) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_setzero_si512();
+-#else
+-    m512 rv = {zeroes256(), zeroes256()};
+-    return rv;
+-#endif
+-}
+-
+-static really_inline
+-m512 ones512(void) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_set1_epi8(0xFF);
+-    //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
+-#else
+-    m512 rv = {ones256(), ones256()};
+-    return rv;
+-#endif
+-}
+-
+-#if defined(HAVE_AVX512)
+-static really_inline
+-m512 set64x8(u8 a) {
+-    return _mm512_set1_epi8(a);
+-}
+-
+-static really_inline
+-m512 set8x64(u64a a) {
+-    return _mm512_set1_epi64(a);
+-}
+-
+-static really_inline
+-m512 set16x32(u32 a) {
+-    return _mm512_set1_epi32(a);
+-}
+-
+-static really_inline
+-m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
+-               u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
+-    return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
+-                            lo_3, lo_2, lo_1, lo_0);
+-}
+-
+-static really_inline
+-m512 swap256in512(m512 a) {
+-    m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
+-    return vpermq512(idx, a);
+-}
+-
+-static really_inline
+-m512 set4x128(m128 a) {
+-    return _mm512_broadcast_i32x4(a);
+-}
+-
+-static really_inline
+-m512 sadd_u8_m512(m512 a, m512 b) {
+-    return _mm512_adds_epu8(a, b);
+-}
+-
+-static really_inline
+-m512 max_u8_m512(m512 a, m512 b) {
+-    return _mm512_max_epu8(a, b);
+-}
+-
+-static really_inline
+-m512 min_u8_m512(m512 a, m512 b) {
+-    return _mm512_min_epu8(a, b);
+-}
+-
+-static really_inline
+-m512 sub_u8_m512(m512 a, m512 b) {
+-    return _mm512_sub_epi8(a, b);
+-}
+-#endif
+-
+-static really_inline
+-m512 and512(m512 a, m512 b) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_and_si512(a, b);
+-#else
+-    m512 rv;
+-    rv.lo = and256(a.lo, b.lo);
+-    rv.hi = and256(a.hi, b.hi);
+-    return rv;
+-#endif
+-}
+-
+-static really_inline
+-m512 or512(m512 a, m512 b) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_or_si512(a, b);
+-#else
+-    m512 rv;
+-    rv.lo = or256(a.lo, b.lo);
+-    rv.hi = or256(a.hi, b.hi);
+-    return rv;
+-#endif
+-}
+-
+-static really_inline
+-m512 xor512(m512 a, m512 b) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_xor_si512(a, b);
+-#else
+-    m512 rv;
+-    rv.lo = xor256(a.lo, b.lo);
+-    rv.hi = xor256(a.hi, b.hi);
+-    return rv;
+-#endif
+-}
+-
+-static really_inline
+-m512 not512(m512 a) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_xor_si512(a, ones512());
+-#else
+-    m512 rv;
+-    rv.lo = not256(a.lo);
+-    rv.hi = not256(a.hi);
+-    return rv;
+-#endif
+-}
+-
+-static really_inline
+-m512 andnot512(m512 a, m512 b) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_andnot_si512(a, b);
+-#else
+-    m512 rv;
+-    rv.lo = andnot256(a.lo, b.lo);
+-    rv.hi = andnot256(a.hi, b.hi);
+-    return rv;
+-#endif
+-}
+-
+-#if defined(HAVE_AVX512)
+-static really_really_inline
+-m512 lshift64_m512(m512 a, unsigned b) {
+-#if defined(HAVE__BUILTIN_CONSTANT_P)
+-    if (__builtin_constant_p(b)) {
+-        return _mm512_slli_epi64(a, b);
+-    }
+-#endif
+-    m128 x = _mm_cvtsi32_si128(b);
+-    return _mm512_sll_epi64(a, x);
+-}
+-#else
+-static really_really_inline
+-m512 lshift64_m512(m512 a, unsigned b) {
+-    m512 rv;
+-    rv.lo = lshift64_m256(a.lo, b);
+-    rv.hi = lshift64_m256(a.hi, b);
+-    return rv;
+-}
+-#endif
+-
+-#if defined(HAVE_AVX512)
+-#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
+-#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
+-#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
+-#endif
+-
+-#if !defined(_MM_CMPINT_NE)
+-#define _MM_CMPINT_NE 0x4
+-#endif
+-
+-static really_inline
+-int diff512(m512 a, m512 b) {
+-#if defined(HAVE_AVX512)
+-    return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
+-#else
+-    return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
+-#endif
+-}
+-
+-static really_inline
+-int isnonzero512(m512 a) {
+-#if defined(HAVE_AVX512)
+-    return diff512(a, zeroes512());
+-#elif defined(HAVE_AVX2)
+-    m256 x = or256(a.lo, a.hi);
+-    return !!diff256(x, zeroes256());
+-#else
+-    m128 x = or128(a.lo.lo, a.lo.hi);
+-    m128 y = or128(a.hi.lo, a.hi.hi);
+-    return isnonzero128(or128(x, y));
+-#endif
+-}
+-
+-/**
+- * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
+- * mask indicating which 32-bit words contain differences.
+- */
+-static really_inline
+-u32 diffrich512(m512 a, m512 b) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
+-#elif defined(HAVE_AVX2)
+-    return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
+-#else
+-    a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
+-    a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi);
+-    a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo);
+-    a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi);
+-    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi),
+-                                  _mm_packs_epi32(a.hi.lo, a.hi.hi));
+-    return ~(_mm_movemask_epi8(packed)) & 0xffff;
+-#endif
+-}
+-
+-/**
+- * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
+- * returns a 16-bit mask indicating which 64-bit words contain differences.
+- */
+-static really_inline
+-u32 diffrich64_512(m512 a, m512 b) {
+-    //TODO: cmp_epi64?
+-    u32 d = diffrich512(a, b);
+-    return (d | (d >> 1)) & 0x55555555;
+-}
+-
+-// aligned load
+-static really_inline
+-m512 load512(const void *ptr) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_load_si512(ptr);
+-#else
+-    assert(ISALIGNED_N(ptr, alignof(m256)));
+-    m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
+-    return rv;
+-#endif
+-}
+-
+-// aligned store
+-static really_inline
+-void store512(void *ptr, m512 a) {
+-    assert(ISALIGNED_N(ptr, alignof(m512)));
+-#if defined(HAVE_AVX512)
+-    return _mm512_store_si512(ptr, a);
+-#elif defined(HAVE_AVX2)
+-    m512 *x = (m512 *)ptr;
+-    store256(&x->lo, a.lo);
+-    store256(&x->hi, a.hi);
+-#else
+-    ptr = assume_aligned(ptr, 16);
+-    *(m512 *)ptr = a;
+-#endif
+-}
+-
+-// unaligned load
+-static really_inline
+-m512 loadu512(const void *ptr) {
+-#if defined(HAVE_AVX512)
+-    return _mm512_loadu_si512(ptr);
+-#else
+-    m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
+-    return rv;
+-#endif
+-}
+-
+-// unaligned store
+-static really_inline
+-void storeu512(void *ptr, m512 a) {
+-#if defined(HAVE_AVX512)
+-    _mm512_storeu_si512((m512 *)ptr, a);
+-#elif defined(HAVE_AVX2)
+-    storeu256(ptr, a.lo);
+-    storeu256((char *)ptr + 32, a.hi);
+-#else
+-    storeu128(ptr, a.lo.lo);
+-    storeu128((char *)ptr + 16, a.lo.hi);
+-    storeu128((char *)ptr + 32, a.hi.lo);
+-    storeu128((char *)ptr + 48, a.hi.hi);
+-#endif
+-}
+-
+-#if defined(HAVE_AVX512)
+-static really_inline
+-m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
+-    return _mm512_maskz_loadu_epi8(k, ptr);
+-}
+-
+-static really_inline
+-m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
+-    return _mm512_mask_loadu_epi8(src, k, ptr);
+-}
+-
+-static really_inline
+-void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) {
+-    _mm512_mask_storeu_epi8(ptr, k, a);
+-}
+-
+-static really_inline
+-m512 set_mask_m512(__mmask64 k) {
+-    return _mm512_movm_epi8(k);
+-}
+-
+-static really_inline
+-m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
+-    return _mm256_maskz_loadu_epi8(k, ptr);
+-}
+-#endif
+-
+-// packed unaligned store of first N bytes
+-static really_inline
+-void storebytes512(void *ptr, m512 a, unsigned int n) {
+-    assert(n <= sizeof(a));
+-    memcpy(ptr, &a, n);
+-}
+-
+-// packed unaligned load of first N bytes, pad with zero
+-static really_inline
+-m512 loadbytes512(const void *ptr, unsigned int n) {
+-    m512 a = zeroes512();
+-    assert(n <= sizeof(a));
+-    memcpy(&a, ptr, n);
+-    return a;
+-}
+-
+-static really_inline
+-m512 mask1bit512(unsigned int n) {
+-    assert(n < sizeof(m512) * 8);
+-    u32 mask_idx = ((n % 8) * 64) + 95;
+-    mask_idx -= n / 8;
+-    return loadu512(&simd_onebit_masks[mask_idx]);
+-}
+-
+-// switches on bit N in the given vector.
+-static really_inline
+-void setbit512(m512 *ptr, unsigned int n) {
+-    assert(n < sizeof(*ptr) * 8);
+-#if !defined(HAVE_AVX2)
+-    m128 *sub;
+-    if (n < 128) {
+-        sub = &ptr->lo.lo;
+-    } else if (n < 256) {
+-        sub = &ptr->lo.hi;
+-    } else if (n < 384) {
+-        sub = &ptr->hi.lo;
+-    } else {
+-        sub = &ptr->hi.hi;
+-    }
+-    setbit128(sub, n % 128);
+-#elif defined(HAVE_AVX512)
+-    *ptr = or512(mask1bit512(n), *ptr);
+-#else
+-    m256 *sub;
+-    if (n < 256) {
+-        sub = &ptr->lo;
+-    } else {
+-        sub = &ptr->hi;
+-        n -= 256;
+-    }
+-    setbit256(sub, n);
+-#endif
+-}
+-
+-// switches off bit N in the given vector.
+-static really_inline
+-void clearbit512(m512 *ptr, unsigned int n) {
+-    assert(n < sizeof(*ptr) * 8);
+-#if !defined(HAVE_AVX2)
+-    m128 *sub;
+-    if (n < 128) {
+-        sub = &ptr->lo.lo;
+-    } else if (n < 256) {
+-        sub = &ptr->lo.hi;
+-    } else if (n < 384) {
+-        sub = &ptr->hi.lo;
+-    } else {
+-        sub = &ptr->hi.hi;
+-    }
+-    clearbit128(sub, n % 128);
+-#elif defined(HAVE_AVX512)
+-    *ptr = andnot512(mask1bit512(n), *ptr);
+-#else
+-    m256 *sub;
+-    if (n < 256) {
+-        sub = &ptr->lo;
+-    } else {
+-        sub = &ptr->hi;
+-        n -= 256;
+-    }
+-    clearbit256(sub, n);
+-#endif
+-}
+-
+-// tests bit N in the given vector.
+-static really_inline
+-char testbit512(m512 val, unsigned int n) {
+-    assert(n < sizeof(val) * 8);
+-#if !defined(HAVE_AVX2)
+-    m128 sub;
+-    if (n < 128) {
+-        sub = val.lo.lo;
+-    } else if (n < 256) {
+-        sub = val.lo.hi;
+-    } else if (n < 384) {
+-        sub = val.hi.lo;
+-    } else {
+-        sub = val.hi.hi;
+-    }
+-    return testbit128(sub, n % 128);
+-#elif defined(HAVE_AVX512)
+-    const m512 mask = mask1bit512(n);
+-    return !!_mm512_test_epi8_mask(mask, val);
+-#else
+-    m256 sub;
+-    if (n < 256) {
+-        sub = val.lo;
+-    } else {
+-        sub = val.hi;
+-        n -= 256;
+-    }
+-    return testbit256(sub, n);
+-#endif
+-}
+-
+ #endif
+diff --git a/src/util/simd_x86.h b/src/util/simd_x86.h
+new file mode 100644
+index 0000000..5fa727e
+--- /dev/null
++++ b/src/util/simd_x86.h
+@@ -0,0 +1,1420 @@
++/*
++ * Copyright (c) 2015-2021, Intel Corporation
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ *  * Redistributions of source code must retain the above copyright notice,
++ *    this list of conditions and the following disclaimer.
++ *  * Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ *  * Neither the name of Intel Corporation nor the names of its contributors
++ *    may be used to endorse or promote products derived from this software
++ *    without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
++ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
++ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
++ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
++ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
++ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
++ * POSSIBILITY OF SUCH DAMAGE.
++ */
++
++/** \file
++ * \brief SIMD types and primitive operations.
++ */
++
++#ifndef SIMD_UTILS
++#define SIMD_UTILS
++
++#if !defined(_WIN32) && !defined(__SSSE3__)
++#error SSSE3 instructions must be enabled
++#endif
++
++#include "config.h"
++#include "ue2common.h"
++#include "simd_types.h"
++#include "unaligned.h"
++#include "util/arch.h"
++#include "util/intrinsics.h"
++
++#include <string.h> // for memcpy
++
++// Define a common assume_aligned using an appropriate compiler built-in, if
++// it's available. Note that we need to handle C or C++ compilation.
++#ifdef __cplusplus
++#  ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
++#    define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
++#  endif
++#else
++#  ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
++#    define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
++#  endif
++#endif
++
++// Fallback to identity case.
++#ifndef assume_aligned
++#define assume_aligned(x, y) (x)
++#endif
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++extern const char vbs_mask_data[];
++#ifdef __cplusplus
++}
++#endif
++
++static really_inline m128 ones128(void) {
++#if defined(__GNUC__) || defined(__INTEL_COMPILER)
++    /* gcc gets this right */
++    return _mm_set1_epi8(0xFF);
++#else
++    /* trick from Intel's optimization guide to generate all-ones.
++     * ICC converts this to the single cmpeq instruction */
++    return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
++#endif
++}
++
++static really_inline m128 zeroes128(void) {
++    return _mm_setzero_si128();
++}
++
++/** \brief Bitwise not for m128*/
++static really_inline m128 not128(m128 a) {
++    return _mm_xor_si128(a, ones128());
++}
++
++/** \brief Return 1 if a and b are different otherwise 0 */
++static really_inline int diff128(m128 a, m128 b) {
++    return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
++}
++
++static really_inline int isnonzero128(m128 a) {
++    return !!diff128(a, zeroes128());
++}
++
++/**
++ * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich128(m128 a, m128 b) {
++    a = _mm_cmpeq_epi32(a, b);
++    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
++}
++
++/**
++ * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
++ * returns a 4-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_128(m128 a, m128 b) {
++#if defined(HAVE_SSE41)
++    a = _mm_cmpeq_epi64(a, b);
++    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
++#else
++    u32 d = diffrich128(a, b);
++    return (d | (d >> 1)) & 0x5;
++#endif
++}
++
++static really_really_inline
++m128 lshift64_m128(m128 a, unsigned b) {
++#if defined(HAVE__BUILTIN_CONSTANT_P)
++    if (__builtin_constant_p(b)) {
++        return _mm_slli_epi64(a, b);
++    }
++#endif
++    m128 x = _mm_cvtsi32_si128(b);
++    return _mm_sll_epi64(a, x);
++}
++
++#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
++#define eq128(a, b)      _mm_cmpeq_epi8((a), (b))
++#define movemask128(a)  ((u32)_mm_movemask_epi8((a)))
++
++#if defined(HAVE_AVX512)
++static really_inline m128 cast512to128(const m512 in) {
++    return _mm512_castsi512_si128(in);
++}
++#endif
++
++static really_inline m128 set16x8(u8 c) {
++    return _mm_set1_epi8(c);
++}
++
++static really_inline m128 set4x32(u32 c) {
++    return _mm_set1_epi32(c);
++}
++
++static really_inline u32 movd(const m128 in) {
++    return _mm_cvtsi128_si32(in);
++}
++
++static really_inline u64a movq(const m128 in) {
++#if defined(ARCH_X86_64)
++    return _mm_cvtsi128_si64(in);
++#else // 32-bit - this is horrific
++    u32 lo = movd(in);
++    u32 hi = movd(_mm_srli_epi64(in, 32));
++    return (u64a)hi << 32 | lo;
++#endif
++}
++
++#if defined(HAVE_AVX512)
++static really_inline u32 movd512(const m512 in) {
++    // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in),
++    //       so we use 2-step convertions to work around.
++    return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
++}
++
++static really_inline u64a movq512(const m512 in) {
++    // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
++    //       so we use 2-step convertions to work around.
++    return movq(_mm512_castsi512_si128(in));
++}
++#endif
++
++/* another form of movq */
++static really_inline
++m128 load_m128_from_u64a(const u64a *p) {
++    return _mm_set_epi64x(0LL, *p);
++}
++
++#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
++#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)
++
++#if defined(HAVE_SSE41)
++#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
++#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
++#else
++#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2))
++#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3))
++#endif
++
++#if !defined(HAVE_AVX2)
++// TODO: this entire file needs restructuring - this carveout is awful
++#define extractlow64from256(a) movq(a.lo)
++#define extractlow32from256(a) movd(a.lo)
++#if defined(HAVE_SSE41)
++#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
++#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
++#else
++#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4))
++#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8))
++#endif
++
++#endif // !AVX2
++
++static really_inline m128 and128(m128 a, m128 b) {
++    return _mm_and_si128(a,b);
++}
++
++static really_inline m128 xor128(m128 a, m128 b) {
++    return _mm_xor_si128(a,b);
++}
++
++static really_inline m128 or128(m128 a, m128 b) {
++    return _mm_or_si128(a,b);
++}
++
++#if defined(HAVE_AVX512VBMI)
++static really_inline m512 expand128(m128 a) {
++    return _mm512_broadcast_i32x4(a);
++}
++
++static really_inline m512 expand256(m256 a) {
++    return _mm512_broadcast_i64x4(a);
++}
++
++static really_inline m512 expand384(m384 a) {
++    u64a *lo = (u64a*)&a.lo;
++    u64a *mid = (u64a*)&a.mid;
++    u64a *hi = (u64a*)&a.hi;
++    return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0],
++                            lo[1], lo[0]);
++}
++#endif
++
++static really_inline m128 andnot128(m128 a, m128 b) {
++    return _mm_andnot_si128(a, b);
++}
++
++// aligned load
++static really_inline m128 load128(const void *ptr) {
++    assert(ISALIGNED_N(ptr, alignof(m128)));
++    ptr = assume_aligned(ptr, 16);
++    return _mm_load_si128((const m128 *)ptr);
++}
++
++// aligned store
++static really_inline void store128(void *ptr, m128 a) {
++    assert(ISALIGNED_N(ptr, alignof(m128)));
++    ptr = assume_aligned(ptr, 16);
++    *(m128 *)ptr = a;
++}
++
++// unaligned load
++static really_inline m128 loadu128(const void *ptr) {
++    return _mm_loadu_si128((const m128 *)ptr);
++}
++
++// unaligned store
++static really_inline void storeu128(void *ptr, m128 a) {
++    _mm_storeu_si128 ((m128 *)ptr, a);
++}
++
++// packed unaligned store of first N bytes
++static really_inline
++void storebytes128(void *ptr, m128 a, unsigned int n) {
++    assert(n <= sizeof(a));
++    memcpy(ptr, &a, n);
++}
++
++// packed unaligned load of first N bytes, pad with zero
++static really_inline
++m128 loadbytes128(const void *ptr, unsigned int n) {
++    m128 a = zeroes128();
++    assert(n <= sizeof(a));
++    memcpy(&a, ptr, n);
++    return a;
++}
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++extern const u8 simd_onebit_masks[];
++#ifdef __cplusplus
++}
++#endif
++
++static really_inline
++m128 mask1bit128(unsigned int n) {
++    assert(n < sizeof(m128) * 8);
++    u32 mask_idx = ((n % 8) * 64) + 95;
++    mask_idx -= n / 8;
++    return loadu128(&simd_onebit_masks[mask_idx]);
++}
++
++// switches on bit N in the given vector.
++static really_inline
++void setbit128(m128 *ptr, unsigned int n) {
++    *ptr = or128(mask1bit128(n), *ptr);
++}
++
++// switches off bit N in the given vector.
++static really_inline
++void clearbit128(m128 *ptr, unsigned int n) {
++    *ptr = andnot128(mask1bit128(n), *ptr);
++}
++
++// tests bit N in the given vector.
++static really_inline
++char testbit128(m128 val, unsigned int n) {
++    const m128 mask = mask1bit128(n);
++#if defined(HAVE_SSE41)
++    return !_mm_testz_si128(mask, val);
++#else
++    return isnonzero128(and128(mask, val));
++#endif
++}
++
++// offset must be an immediate
++#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
++
++static really_inline
++m128 pshufb_m128(m128 a, m128 b) {
++    m128 result;
++    result = _mm_shuffle_epi8(a, b);
++    return result;
++}
++
++static really_inline
++m256 pshufb_m256(m256 a, m256 b) {
++#if defined(HAVE_AVX2)
++    return _mm256_shuffle_epi8(a, b);
++#else
++    m256 rv;
++    rv.lo = pshufb_m128(a.lo, b.lo);
++    rv.hi = pshufb_m128(a.hi, b.hi);
++    return rv;
++#endif
++}
++
++#if defined(HAVE_AVX512)
++static really_inline
++m512 pshufb_m512(m512 a, m512 b) {
++    return _mm512_shuffle_epi8(a, b);
++}
++
++static really_inline
++m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
++    return _mm512_maskz_shuffle_epi8(k, a, b);
++}
++
++#if defined(HAVE_AVX512VBMI)
++#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a)
++#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a)
++#endif
++
++#endif
++
++static really_inline
++m128 variable_byte_shift_m128(m128 in, s32 amount) {
++    assert(amount >= -16 && amount <= 16);
++    m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
++    return pshufb_m128(in, shift_mask);
++}
++
++static really_inline
++m128 max_u8_m128(m128 a, m128 b) {
++    return _mm_max_epu8(a, b);
++}
++
++static really_inline
++m128 min_u8_m128(m128 a, m128 b) {
++    return _mm_min_epu8(a, b);
++}
++
++static really_inline
++m128 sadd_u8_m128(m128 a, m128 b) {
++    return _mm_adds_epu8(a, b);
++}
++
++static really_inline
++m128 sub_u8_m128(m128 a, m128 b) {
++    return _mm_sub_epi8(a, b);
++}
++
++static really_inline
++m128 set64x2(u64a hi, u64a lo) {
++    return _mm_set_epi64x(hi, lo);
++}
++
++/****
++ **** 256-bit Primitives
++ ****/
++
++#if defined(HAVE_AVX2)
++
++static really_really_inline
++m256 lshift64_m256(m256 a, unsigned b) {
++#if defined(HAVE__BUILTIN_CONSTANT_P)
++    if (__builtin_constant_p(b)) {
++        return _mm256_slli_epi64(a, b);
++    }
++#endif
++    m128 x = _mm_cvtsi32_si128(b);
++    return _mm256_sll_epi64(a, x);
++}
++
++#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))
++
++static really_inline
++m256 set32x8(u32 in) {
++    return _mm256_set1_epi8(in);
++}
++
++#define eq256(a, b)     _mm256_cmpeq_epi8((a), (b))
++#define movemask256(a)  ((u32)_mm256_movemask_epi8((a)))
++
++static really_inline
++m256 set2x128(m128 a) {
++    return _mm256_broadcastsi128_si256(a);
++}
++
++#else
++
++static really_really_inline
++m256 lshift64_m256(m256 a, int b) {
++    m256 rv = a;
++    rv.lo = lshift64_m128(rv.lo, b);
++    rv.hi = lshift64_m128(rv.hi, b);
++    return rv;
++}
++
++static really_inline
++m256 rshift64_m256(m256 a, int b) {
++    m256 rv = a;
++    rv.lo = rshift64_m128(rv.lo, b);
++    rv.hi = rshift64_m128(rv.hi, b);
++    return rv;
++}
++static really_inline
++m256 set32x8(u32 in) {
++    m256 rv;
++    rv.lo = set16x8((u8) in);
++    rv.hi = rv.lo;
++    return rv;
++}
++
++static really_inline
++m256 eq256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = eq128(a.lo, b.lo);
++    rv.hi = eq128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline
++u32 movemask256(m256 a) {
++    u32 lo_mask = movemask128(a.lo);
++    u32 hi_mask = movemask128(a.hi);
++    return lo_mask | (hi_mask << 16);
++}
++
++static really_inline
++m256 set2x128(m128 a) {
++    m256 rv = {a, a};
++    return rv;
++}
++#endif
++
++static really_inline m256 zeroes256(void) {
++#if defined(HAVE_AVX2)
++    return _mm256_setzero_si256();
++#else
++    m256 rv = {zeroes128(), zeroes128()};
++    return rv;
++#endif
++}
++
++static really_inline m256 ones256(void) {
++#if defined(HAVE_AVX2)
++    m256 rv = _mm256_set1_epi8(0xFF);
++#else
++    m256 rv = {ones128(), ones128()};
++#endif
++    return rv;
++}
++
++#if defined(HAVE_AVX2)
++static really_inline m256 and256(m256 a, m256 b) {
++    return _mm256_and_si256(a, b);
++}
++#else
++static really_inline m256 and256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = and128(a.lo, b.lo);
++    rv.hi = and128(a.hi, b.hi);
++    return rv;
++}
++#endif
++
++#if defined(HAVE_AVX2)
++static really_inline m256 or256(m256 a, m256 b) {
++    return _mm256_or_si256(a, b);
++}
++#else
++static really_inline m256 or256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = or128(a.lo, b.lo);
++    rv.hi = or128(a.hi, b.hi);
++    return rv;
++}
++#endif
++
++#if defined(HAVE_AVX2)
++static really_inline m256 xor256(m256 a, m256 b) {
++    return _mm256_xor_si256(a, b);
++}
++#else
++static really_inline m256 xor256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = xor128(a.lo, b.lo);
++    rv.hi = xor128(a.hi, b.hi);
++    return rv;
++}
++#endif
++
++#if defined(HAVE_AVX2)
++static really_inline m256 not256(m256 a) {
++    return _mm256_xor_si256(a, ones256());
++}
++#else
++static really_inline m256 not256(m256 a) {
++    m256 rv;
++    rv.lo = not128(a.lo);
++    rv.hi = not128(a.hi);
++    return rv;
++}
++#endif
++
++#if defined(HAVE_AVX2)
++static really_inline m256 andnot256(m256 a, m256 b) {
++    return _mm256_andnot_si256(a, b);
++}
++#else
++static really_inline m256 andnot256(m256 a, m256 b) {
++    m256 rv;
++    rv.lo = andnot128(a.lo, b.lo);
++    rv.hi = andnot128(a.hi, b.hi);
++    return rv;
++}
++#endif
++
++static really_inline int diff256(m256 a, m256 b) {
++#if defined(HAVE_AVX2)
++    return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
++#else
++    return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
++#endif
++}
++
++static really_inline int isnonzero256(m256 a) {
++#if defined(HAVE_AVX2)
++    return !!diff256(a, zeroes256());
++#else
++    return isnonzero128(or128(a.lo, a.hi));
++#endif
++}
++
++/**
++ * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich256(m256 a, m256 b) {
++#if defined(HAVE_AVX2)
++    a = _mm256_cmpeq_epi32(a, b);
++    return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
++#else
++    m128 z = zeroes128();
++    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
++    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
++    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z);
++    return ~(_mm_movemask_epi8(packed)) & 0xff;
++#endif
++}
++
++/**
++ * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
++ * returns an 8-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_256(m256 a, m256 b) {
++    u32 d = diffrich256(a, b);
++    return (d | (d >> 1)) & 0x55555555;
++}
++
++// aligned load
++static really_inline m256 load256(const void *ptr) {
++    assert(ISALIGNED_N(ptr, alignof(m256)));
++#if defined(HAVE_AVX2)
++    return _mm256_load_si256((const m256 *)ptr);
++#else
++    m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
++    return rv;
++#endif
++}
++
++// aligned load  of 128-bit value to low and high part of 256-bit value
++static really_inline m256 load2x128(const void *ptr) {
++#if defined(HAVE_AVX2)
++    return set2x128(load128(ptr));
++#else
++    assert(ISALIGNED_N(ptr, alignof(m128)));
++    m256 rv;
++    rv.hi = rv.lo = load128(ptr);
++    return rv;
++#endif
++}
++
++static really_inline m256 loadu2x128(const void *ptr) {
++    return set2x128(loadu128(ptr));
++}
++
++// aligned store
++static really_inline void store256(void *ptr, m256 a) {
++    assert(ISALIGNED_N(ptr, alignof(m256)));
++#if defined(HAVE_AVX2)
++    _mm256_store_si256((m256 *)ptr, a);
++#else
++    ptr = assume_aligned(ptr, 16);
++    *(m256 *)ptr = a;
++#endif
++}
++
++// unaligned load
++static really_inline m256 loadu256(const void *ptr) {
++#if defined(HAVE_AVX2)
++    return _mm256_loadu_si256((const m256 *)ptr);
++#else
++    m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
++    return rv;
++#endif
++}
++
++// unaligned store
++static really_inline void storeu256(void *ptr, m256 a) {
++#if defined(HAVE_AVX2)
++    _mm256_storeu_si256((m256 *)ptr, a);
++#else
++    storeu128(ptr, a.lo);
++    storeu128((char *)ptr + 16, a.hi);
++#endif
++}
++
++// packed unaligned store of first N bytes
++static really_inline
++void storebytes256(void *ptr, m256 a, unsigned int n) {
++    assert(n <= sizeof(a));
++    memcpy(ptr, &a, n);
++}
++
++// packed unaligned load of first N bytes, pad with zero
++static really_inline
++m256 loadbytes256(const void *ptr, unsigned int n) {
++    m256 a = zeroes256();
++    assert(n <= sizeof(a));
++    memcpy(&a, ptr, n);
++    return a;
++}
++
++static really_inline
++m256 mask1bit256(unsigned int n) {
++    assert(n < sizeof(m256) * 8);
++    u32 mask_idx = ((n % 8) * 64) + 95;
++    mask_idx -= n / 8;
++    return loadu256(&simd_onebit_masks[mask_idx]);
++}
++
++static really_inline
++m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
++#if defined(HAVE_AVX2)
++    return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
++#else
++    m256 rv;
++    rv.hi = set64x2(hi_1, hi_0);
++    rv.lo = set64x2(lo_1, lo_0);
++    return rv;
++#endif
++}
++
++#if !defined(HAVE_AVX2)
++// switches on bit N in the given vector.
++static really_inline
++void setbit256(m256 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo;
++    } else {
++        sub = &ptr->hi;
++        n -= 128;
++    }
++    setbit128(sub, n);
++}
++
++// switches off bit N in the given vector.
++static really_inline
++void clearbit256(m256 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo;
++    } else {
++        sub = &ptr->hi;
++        n -= 128;
++    }
++    clearbit128(sub, n);
++}
++
++// tests bit N in the given vector.
++static really_inline
++char testbit256(m256 val, unsigned int n) {
++    assert(n < sizeof(val) * 8);
++    m128 sub;
++    if (n < 128) {
++        sub = val.lo;
++    } else {
++        sub = val.hi;
++        n -= 128;
++    }
++    return testbit128(sub, n);
++}
++
++static really_really_inline
++m128 movdq_hi(m256 x) {
++    return x.hi;
++}
++
++static really_really_inline
++m128 movdq_lo(m256 x) {
++    return x.lo;
++}
++
++static really_inline
++m256 combine2x128(m128 hi, m128 lo) {
++    m256 rv = {lo, hi};
++    return rv;
++}
++
++#else // AVX2
++
++// switches on bit N in the given vector.
++static really_inline
++void setbit256(m256 *ptr, unsigned int n) {
++    *ptr = or256(mask1bit256(n), *ptr);
++}
++
++static really_inline
++void clearbit256(m256 *ptr, unsigned int n) {
++    *ptr = andnot256(mask1bit256(n), *ptr);
++}
++
++// tests bit N in the given vector.
++static really_inline
++char testbit256(m256 val, unsigned int n) {
++    const m256 mask = mask1bit256(n);
++    return !_mm256_testz_si256(mask, val);
++}
++
++static really_really_inline
++m128 movdq_hi(m256 x) {
++    return _mm256_extracti128_si256(x, 1);
++}
++
++static really_really_inline
++m128 movdq_lo(m256 x) {
++    return _mm256_extracti128_si256(x, 0);
++}
++
++#define cast256to128(a) _mm256_castsi256_si128(a)
++#define cast128to256(a) _mm256_castsi128_si256(a)
++#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
++#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
++#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
++#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
++#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
++#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
++#define extractlow64from256(a) movq(cast256to128(a))
++#define extractlow32from256(a) movd(cast256to128(a))
++#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
++#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
++#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset)
++
++static really_inline
++m256 combine2x128(m128 hi, m128 lo) {
++#if defined(_mm256_set_m128i)
++    return _mm256_set_m128i(hi, lo);
++#else
++    return insert128to256(cast128to256(lo), hi, 1);
++#endif
++}
++#endif //AVX2
++
++#if defined(HAVE_AVX512)
++#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
++#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b)
++#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b)
++#define set2x256(a) _mm512_broadcast_i64x4(a)
++#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a)
++#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a)
++#endif
++
++/****
++ **** 384-bit Primitives
++ ****/
++
++static really_inline m384 and384(m384 a, m384 b) {
++    m384 rv;
++    rv.lo = and128(a.lo, b.lo);
++    rv.mid = and128(a.mid, b.mid);
++    rv.hi = and128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m384 or384(m384 a, m384 b) {
++    m384 rv;
++    rv.lo = or128(a.lo, b.lo);
++    rv.mid = or128(a.mid, b.mid);
++    rv.hi = or128(a.hi, b.hi);
++    return rv;
++}
++
++static really_inline m384 xor384(m384 a, m384 b) {
++    m384 rv;
++    rv.lo = xor128(a.lo, b.lo);
++    rv.mid = xor128(a.mid, b.mid);
++    rv.hi = xor128(a.hi, b.hi);
++    return rv;
++}
++static really_inline m384 not384(m384 a) {
++    m384 rv;
++    rv.lo = not128(a.lo);
++    rv.mid = not128(a.mid);
++    rv.hi = not128(a.hi);
++    return rv;
++}
++static really_inline m384 andnot384(m384 a, m384 b) {
++    m384 rv;
++    rv.lo = andnot128(a.lo, b.lo);
++    rv.mid = andnot128(a.mid, b.mid);
++    rv.hi = andnot128(a.hi, b.hi);
++    return rv;
++}
++
++static really_really_inline
++m384 lshift64_m384(m384 a, unsigned b) {
++    m384 rv;
++    rv.lo = lshift64_m128(a.lo, b);
++    rv.mid = lshift64_m128(a.mid, b);
++    rv.hi = lshift64_m128(a.hi, b);
++    return rv;
++}
++
++static really_inline m384 zeroes384(void) {
++    m384 rv = {zeroes128(), zeroes128(), zeroes128()};
++    return rv;
++}
++
++static really_inline m384 ones384(void) {
++    m384 rv = {ones128(), ones128(), ones128()};
++    return rv;
++}
++
++static really_inline int diff384(m384 a, m384 b) {
++    return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
++}
++
++static really_inline int isnonzero384(m384 a) {
++    return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
++}
++
++/**
++ * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline u32 diffrich384(m384 a, m384 b) {
++    m128 z = zeroes128();
++    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
++    a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
++    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
++    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
++                                  _mm_packs_epi32(a.hi, z));
++    return ~(_mm_movemask_epi8(packed)) & 0xfff;
++}
++
++/**
++ * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
++ * returns a 12-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline u32 diffrich64_384(m384 a, m384 b) {
++    u32 d = diffrich384(a, b);
++    return (d | (d >> 1)) & 0x55555555;
++}
++
++// aligned load
++static really_inline m384 load384(const void *ptr) {
++    assert(ISALIGNED_16(ptr));
++    m384 rv = { load128(ptr), load128((const char *)ptr + 16),
++                load128((const char *)ptr + 32) };
++    return rv;
++}
++
++// aligned store
++static really_inline void store384(void *ptr, m384 a) {
++    assert(ISALIGNED_16(ptr));
++    ptr = assume_aligned(ptr, 16);
++    *(m384 *)ptr = a;
++}
++
++// unaligned load
++static really_inline m384 loadu384(const void *ptr) {
++    m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16),
++                loadu128((const char *)ptr + 32)};
++    return rv;
++}
++
++// packed unaligned store of first N bytes
++static really_inline
++void storebytes384(void *ptr, m384 a, unsigned int n) {
++    assert(n <= sizeof(a));
++    memcpy(ptr, &a, n);
++}
++
++// packed unaligned load of first N bytes, pad with zero
++static really_inline
++m384 loadbytes384(const void *ptr, unsigned int n) {
++    m384 a = zeroes384();
++    assert(n <= sizeof(a));
++    memcpy(&a, ptr, n);
++    return a;
++}
++
++// switches on bit N in the given vector.
++static really_inline
++void setbit384(m384 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo;
++    } else if (n < 256) {
++        sub = &ptr->mid;
++    } else {
++        sub = &ptr->hi;
++    }
++    setbit128(sub, n % 128);
++}
++
++// switches off bit N in the given vector.
++static really_inline
++void clearbit384(m384 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo;
++    } else if (n < 256) {
++        sub = &ptr->mid;
++    } else {
++        sub = &ptr->hi;
++    }
++    clearbit128(sub, n % 128);
++}
++
++// tests bit N in the given vector.
++static really_inline
++char testbit384(m384 val, unsigned int n) {
++    assert(n < sizeof(val) * 8);
++    m128 sub;
++    if (n < 128) {
++        sub = val.lo;
++    } else if (n < 256) {
++        sub = val.mid;
++    } else {
++        sub = val.hi;
++    }
++    return testbit128(sub, n % 128);
++}
++
++/****
++ **** 512-bit Primitives
++ ****/
++
++#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
++#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))
++
++static really_inline
++m512 zeroes512(void) {
++#if defined(HAVE_AVX512)
++    return _mm512_setzero_si512();
++#else
++    m512 rv = {zeroes256(), zeroes256()};
++    return rv;
++#endif
++}
++
++static really_inline
++m512 ones512(void) {
++#if defined(HAVE_AVX512)
++    return _mm512_set1_epi8(0xFF);
++    //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
++#else
++    m512 rv = {ones256(), ones256()};
++    return rv;
++#endif
++}
++
++#if defined(HAVE_AVX512)
++static really_inline
++m512 set64x8(u8 a) {
++    return _mm512_set1_epi8(a);
++}
++
++static really_inline
++m512 set8x64(u64a a) {
++    return _mm512_set1_epi64(a);
++}
++
++static really_inline
++m512 set16x32(u32 a) {
++    return _mm512_set1_epi32(a);
++}
++
++static really_inline
++m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
++               u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
++    return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
++                            lo_3, lo_2, lo_1, lo_0);
++}
++
++static really_inline
++m512 swap256in512(m512 a) {
++    m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
++    return vpermq512(idx, a);
++}
++
++static really_inline
++m512 set4x128(m128 a) {
++    return _mm512_broadcast_i32x4(a);
++}
++
++static really_inline
++m512 sadd_u8_m512(m512 a, m512 b) {
++    return _mm512_adds_epu8(a, b);
++}
++
++static really_inline
++m512 max_u8_m512(m512 a, m512 b) {
++    return _mm512_max_epu8(a, b);
++}
++
++static really_inline
++m512 min_u8_m512(m512 a, m512 b) {
++    return _mm512_min_epu8(a, b);
++}
++
++static really_inline
++m512 sub_u8_m512(m512 a, m512 b) {
++    return _mm512_sub_epi8(a, b);
++}
++#endif
++
++static really_inline
++m512 and512(m512 a, m512 b) {
++#if defined(HAVE_AVX512)
++    return _mm512_and_si512(a, b);
++#else
++    m512 rv;
++    rv.lo = and256(a.lo, b.lo);
++    rv.hi = and256(a.hi, b.hi);
++    return rv;
++#endif
++}
++
++static really_inline
++m512 or512(m512 a, m512 b) {
++#if defined(HAVE_AVX512)
++    return _mm512_or_si512(a, b);
++#else
++    m512 rv;
++    rv.lo = or256(a.lo, b.lo);
++    rv.hi = or256(a.hi, b.hi);
++    return rv;
++#endif
++}
++
++static really_inline
++m512 xor512(m512 a, m512 b) {
++#if defined(HAVE_AVX512)
++    return _mm512_xor_si512(a, b);
++#else
++    m512 rv;
++    rv.lo = xor256(a.lo, b.lo);
++    rv.hi = xor256(a.hi, b.hi);
++    return rv;
++#endif
++}
++
++static really_inline
++m512 not512(m512 a) {
++#if defined(HAVE_AVX512)
++    return _mm512_xor_si512(a, ones512());
++#else
++    m512 rv;
++    rv.lo = not256(a.lo);
++    rv.hi = not256(a.hi);
++    return rv;
++#endif
++}
++
++static really_inline
++m512 andnot512(m512 a, m512 b) {
++#if defined(HAVE_AVX512)
++    return _mm512_andnot_si512(a, b);
++#else
++    m512 rv;
++    rv.lo = andnot256(a.lo, b.lo);
++    rv.hi = andnot256(a.hi, b.hi);
++    return rv;
++#endif
++}
++
++#if defined(HAVE_AVX512)
++static really_really_inline
++m512 lshift64_m512(m512 a, unsigned b) {
++#if defined(HAVE__BUILTIN_CONSTANT_P)
++    if (__builtin_constant_p(b)) {
++        return _mm512_slli_epi64(a, b);
++    }
++#endif
++    m128 x = _mm_cvtsi32_si128(b);
++    return _mm512_sll_epi64(a, x);
++}
++#else
++static really_really_inline
++m512 lshift64_m512(m512 a, unsigned b) {
++    m512 rv;
++    rv.lo = lshift64_m256(a.lo, b);
++    rv.hi = lshift64_m256(a.hi, b);
++    return rv;
++}
++#endif
++
++#if defined(HAVE_AVX512)
++#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
++#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
++#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
++#endif
++
++#if !defined(_MM_CMPINT_NE)
++#define _MM_CMPINT_NE 0x4
++#endif
++
++static really_inline
++int diff512(m512 a, m512 b) {
++#if defined(HAVE_AVX512)
++    return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
++#else
++    return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
++#endif
++}
++
++static really_inline
++int isnonzero512(m512 a) {
++#if defined(HAVE_AVX512)
++    return diff512(a, zeroes512());
++#elif defined(HAVE_AVX2)
++    m256 x = or256(a.lo, a.hi);
++    return !!diff256(x, zeroes256());
++#else
++    m128 x = or128(a.lo.lo, a.lo.hi);
++    m128 y = or128(a.hi.lo, a.hi.hi);
++    return isnonzero128(or128(x, y));
++#endif
++}
++
++/**
++ * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
++ * mask indicating which 32-bit words contain differences.
++ */
++static really_inline
++u32 diffrich512(m512 a, m512 b) {
++#if defined(HAVE_AVX512)
++    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
++#elif defined(HAVE_AVX2)
++    return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
++#else
++    a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
++    a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi);
++    a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo);
++    a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi);
++    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi),
++                                  _mm_packs_epi32(a.hi.lo, a.hi.hi));
++    return ~(_mm_movemask_epi8(packed)) & 0xffff;
++#endif
++}
++
++/**
++ * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
++ * returns a 16-bit mask indicating which 64-bit words contain differences.
++ */
++static really_inline
++u32 diffrich64_512(m512 a, m512 b) {
++    //TODO: cmp_epi64?
++    u32 d = diffrich512(a, b);
++    return (d | (d >> 1)) & 0x55555555;
++}
++
++// aligned load
++static really_inline
++m512 load512(const void *ptr) {
++#if defined(HAVE_AVX512)
++    return _mm512_load_si512(ptr);
++#else
++    assert(ISALIGNED_N(ptr, alignof(m256)));
++    m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
++    return rv;
++#endif
++}
++
++// aligned store
++static really_inline
++void store512(void *ptr, m512 a) {
++    assert(ISALIGNED_N(ptr, alignof(m512)));
++#if defined(HAVE_AVX512)
++    return _mm512_store_si512(ptr, a);
++#elif defined(HAVE_AVX2)
++    m512 *x = (m512 *)ptr;
++    store256(&x->lo, a.lo);
++    store256(&x->hi, a.hi);
++#else
++    ptr = assume_aligned(ptr, 16);
++    *(m512 *)ptr = a;
++#endif
++}
++
++// unaligned load
++static really_inline
++m512 loadu512(const void *ptr) {
++#if defined(HAVE_AVX512)
++    return _mm512_loadu_si512(ptr);
++#else
++    m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
++    return rv;
++#endif
++}
++
++// unaligned store
++static really_inline
++void storeu512(void *ptr, m512 a) {
++#if defined(HAVE_AVX512)
++    _mm512_storeu_si512((m512 *)ptr, a);
++#elif defined(HAVE_AVX2)
++    storeu256(ptr, a.lo);
++    storeu256((char *)ptr + 32, a.hi);
++#else
++    storeu128(ptr, a.lo.lo);
++    storeu128((char *)ptr + 16, a.lo.hi);
++    storeu128((char *)ptr + 32, a.hi.lo);
++    storeu128((char *)ptr + 48, a.hi.hi);
++#endif
++}
++
++#if defined(HAVE_AVX512)
++static really_inline
++m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
++    return _mm512_maskz_loadu_epi8(k, ptr);
++}
++
++static really_inline
++m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
++    return _mm512_mask_loadu_epi8(src, k, ptr);
++}
++
++static really_inline
++void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) {
++    _mm512_mask_storeu_epi8(ptr, k, a);
++}
++
++static really_inline
++m512 set_mask_m512(__mmask64 k) {
++    return _mm512_movm_epi8(k);
++}
++
++static really_inline
++m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
++    return _mm256_maskz_loadu_epi8(k, ptr);
++}
++#endif
++
++// packed unaligned store of first N bytes
++static really_inline
++void storebytes512(void *ptr, m512 a, unsigned int n) {
++    assert(n <= sizeof(a));
++    memcpy(ptr, &a, n);
++}
++
++// packed unaligned load of first N bytes, pad with zero
++static really_inline
++m512 loadbytes512(const void *ptr, unsigned int n) {
++    m512 a = zeroes512();
++    assert(n <= sizeof(a));
++    memcpy(&a, ptr, n);
++    return a;
++}
++
++static really_inline
++m512 mask1bit512(unsigned int n) {
++    assert(n < sizeof(m512) * 8);
++    u32 mask_idx = ((n % 8) * 64) + 95;
++    mask_idx -= n / 8;
++    return loadu512(&simd_onebit_masks[mask_idx]);
++}
++
++// switches on bit N in the given vector.
++static really_inline
++void setbit512(m512 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++#if !defined(HAVE_AVX2)
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo.lo;
++    } else if (n < 256) {
++        sub = &ptr->lo.hi;
++    } else if (n < 384) {
++        sub = &ptr->hi.lo;
++    } else {
++        sub = &ptr->hi.hi;
++    }
++    setbit128(sub, n % 128);
++#elif defined(HAVE_AVX512)
++    *ptr = or512(mask1bit512(n), *ptr);
++#else
++    m256 *sub;
++    if (n < 256) {
++        sub = &ptr->lo;
++    } else {
++        sub = &ptr->hi;
++        n -= 256;
++    }
++    setbit256(sub, n);
++#endif
++}
++
++// switches off bit N in the given vector.
++static really_inline
++void clearbit512(m512 *ptr, unsigned int n) {
++    assert(n < sizeof(*ptr) * 8);
++#if !defined(HAVE_AVX2)
++    m128 *sub;
++    if (n < 128) {
++        sub = &ptr->lo.lo;
++    } else if (n < 256) {
++        sub = &ptr->lo.hi;
++    } else if (n < 384) {
++        sub = &ptr->hi.lo;
++    } else {
++        sub = &ptr->hi.hi;
++    }
++    clearbit128(sub, n % 128);
++#elif defined(HAVE_AVX512)
++    *ptr = andnot512(mask1bit512(n), *ptr);
++#else
++    m256 *sub;
++    if (n < 256) {
++        sub = &ptr->lo;
++    } else {
++        sub = &ptr->hi;
++        n -= 256;
++    }
++    clearbit256(sub, n);
++#endif
++}
++
++// tests bit N in the given vector.
++static really_inline
++char testbit512(m512 val, unsigned int n) {
++    assert(n < sizeof(val) * 8);
++#if !defined(HAVE_AVX2)
++    m128 sub;
++    if (n < 128) {
++        sub = val.lo.lo;
++    } else if (n < 256) {
++        sub = val.lo.hi;
++    } else if (n < 384) {
++        sub = val.hi.lo;
++    } else {
++        sub = val.hi.hi;
++    }
++    return testbit128(sub, n % 128);
++#elif defined(HAVE_AVX512)
++    const m512 mask = mask1bit512(n);
++    return !!_mm512_test_epi8_mask(mask, val);
++#else
++    m256 sub;
++    if (n < 256) {
++        sub = val.lo;
++    } else {
++        sub = val.hi;
++        n -= 256;
++    }
++    return testbit256(sub, n);
++#endif
++}
++
++#endif
+diff --git a/src/util/state_compress.c b/src/util/state_compress.c
+index 7238849..d71f543 100644
+--- a/src/util/state_compress.c
++++ b/src/util/state_compress.c
+@@ -150,7 +150,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
+     u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]),
+                  expand32(v[2], m[2]), expand32(v[3], m[3]) };
+ 
+-    return _mm_set_epi32(x[3], x[2], x[1], x[0]);
++    return __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0);
+ }
+ #endif
+ 
+@@ -158,7 +158,7 @@ m128 loadcompressed128_32bit(const void *ptr, m128 mvec) {
+ static really_inline
+ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
+     // First, decompose our vectors into 64-bit chunks.
+-    u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) };
++    u64a m[2] = { movq(mvec), movq(__lsx_vsrli_h(mvec, 8)) };
+ 
+     u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) };
+     u64a v[2];
+@@ -167,7 +167,7 @@ m128 loadcompressed128_64bit(const void *ptr, m128 mvec) {
+ 
+     u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) };
+ 
+-    return _mm_set_epi64x(x[1], x[0]);
++    return __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0);
+ }
+ #endif
+ 
+@@ -264,8 +264,8 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) {
+                  expand32(v[6], m[6]), expand32(v[7], m[7]) };
+ 
+ #if !defined(HAVE_AVX2)
+-    m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
+-                  .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) };
++    m256 xvec = { .lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0),
++                  .hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[7]),x[6],2),x[5],1),x[4],0) };
+ #else
+     m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4],
+                                  x[3], x[2], x[1], x[0]);
+@@ -291,8 +291,8 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) {
+                   expand64(v[2], m[2]), expand64(v[3], m[3]) };
+ 
+ #if !defined(HAVE_AVX2)
+-    m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
+-                  .hi = _mm_set_epi64x(x[3], x[2]) };
++    m256 xvec = { .lo = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0),
++                  .hi = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0) };
+ #else
+     m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]);
+ #endif
+@@ -402,9 +402,9 @@ m384 loadcompressed384_32bit(const void *ptr, m384 mvec) {
+                   expand32(v[8], m[8]), expand32(v[9], m[9]),
+                   expand32(v[10], m[10]), expand32(v[11], m[11]) };
+ 
+-    m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]),
+-                  .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]),
+-                  .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) };
++    m384 xvec = { .lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0),
++                  .mid = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[7]),x[6],2),x[5],1),x[4],0),
++                  .hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[11]),x[10],2),x[9],1),x[8],0) };
+     return xvec;
+ }
+ #endif
+@@ -427,9 +427,9 @@ m384 loadcompressed384_64bit(const void *ptr, m384 mvec) {
+                   expand64(v[2], m[2]), expand64(v[3], m[3]),
+                   expand64(v[4], m[4]), expand64(v[5], m[5]) };
+ 
+-    m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]),
+-                  .mid = _mm_set_epi64x(x[3], x[2]),
+-                  .hi = _mm_set_epi64x(x[5], x[4]) };
++    m384 xvec = { .lo = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0),
++                  .mid = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0),
++                  .hi = __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[5]),x[4],0) };
+     return xvec;
+ }
+ #endif
+@@ -558,10 +558,10 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
+     xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
+                                x[11], x[10], x[9], x[8]);
+ #else
+-    xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
+-    xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
+-    xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
+-    xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
++    xvec.lo.lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[3]),x[2],2),x[1],1),x[0],0);
++    xvec.lo.hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[7]),x[6],2),x[5],1),x[4],0);
++    xvec.hi.lo = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[11]),x[10],2),x[9],1),x[8],0);
++    xvec.hi.hi = __lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(__lsx_vinsgr2vr_w(_lsx_vreplgr2vr_w(x[15]),x[14],2),x[13],1),x[12],0);
+ #endif
+     return xvec;
+ }
+@@ -594,10 +594,10 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
+     m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
+                   .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
+ #else
+-    m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
+-                          _mm_set_epi64x(x[3], x[2]) },
+-                  .hi = { _mm_set_epi64x(x[5], x[4]),
+-                          _mm_set_epi64x(x[7], x[6]) } };
++    m512 xvec = { .lo = { __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[1]),x[0],0),
++                          __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[3]),x[2],0) },
++                  .hi = { __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[5]),x[4],0),
++                          __lsx_vinsgr2vr_d(__lsx_vreplgr2vr_d(x[7]),x[6],0) } };
+ #endif
+     return xvec;
+ }
+-- 
+2.27.0
+
diff --git a/thirdparty/patches/add-the-parameter-mlsx.patch b/thirdparty/patches/add-the-parameter-mlsx.patch
new file mode 100644
index 0000000000..ac1c5d3fc6
--- /dev/null
+++ b/thirdparty/patches/add-the-parameter-mlsx.patch
@@ -0,0 +1,28 @@
+From 60308fe6711842de16742bee5e602c4a3454f461 Mon Sep 17 00:00:00 2001
+From: Jingyun Hua <huajingyun@loongson.cn>
+Date: Thu, 3 Aug 2023 09:32:46 +0800
+Subject: [PATCH] add the parameter -mlsx
+
+Change-Id: I3f11b0d38e71357b589ae9ae0c5b3791a00bf0ec
+---
+ CMakeLists.txt | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 4289817..d8f5279 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -259,6 +259,10 @@ else()
+         set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}")
+     endif()
+ 
++    if (ARCH_LOONGARCH64)
++        set(ARCH_CXX_FLAGS "-mlsx")
++    endif()
++
+     if(CMAKE_COMPILER_IS_GNUCC)
+         # spurious warnings?
+         set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -Wno-array-bounds -Wno-maybe-uninitialized")
+-- 
+2.27.0
+
diff --git a/thirdparty/patches/config.guess b/thirdparty/patches/config.guess
new file mode 100644
index 0000000000..48a684601b
--- /dev/null
+++ b/thirdparty/patches/config.guess
@@ -0,0 +1,1815 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#   Copyright 1992-2024 Free Software Foundation, Inc.
+
+# shellcheck disable=SC2006,SC2268 # see below for rationale
+
+timestamp='2024-07-27'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <https://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
+#
+# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
+#
+# You can get the latest version of this script from:
+# https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
+#
+# Please send patches to <config-patches@gnu.org>.
+
+
+# The "shellcheck disable" line above the timestamp inhibits complaints
+# about features and limitations of the classic Bourne shell that were
+# superseded or lifted in POSIX.  However, this script identifies a wide
+# variety of pre-POSIX systems that do not have POSIX shells at all, and
+# even some reasonably current systems (Solaris 10 as case-in-point) still
+# have a pre-POSIX /bin/sh.
+
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION]
+
+Output the configuration name of the system '$me' is run on.
+
+Options:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.guess ($timestamp)
+
+Originally written by Per Bothner.
+Copyright 1992-2024 Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try '$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )
+       break ;;
+  esac
+done
+
+if test $# != 0; then
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+# Just in case it came from the environment.
+GUESS=
+
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+
+# Historically, 'CC_FOR_BUILD' used to be named 'HOST_CC'. We still
+# use 'HOST_CC' if defined, but it is deprecated.
+
+# Portable tmp directory creation inspired by the Autoconf team.
+
+tmp=
+# shellcheck disable=SC2172
+trap 'test -z "$tmp" || rm -fr "$tmp"' 0 1 2 13 15
+
+set_cc_for_build() {
+    # prevent multiple calls if $tmp is already set
+    test "$tmp" && return 0
+    : "${TMPDIR=/tmp}"
+    # shellcheck disable=SC2039,SC3028
+    { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+	{ test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir "$tmp" 2>/dev/null) ; } ||
+	{ tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir "$tmp" 2>/dev/null) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+	{ echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; }
+    dummy=$tmp/dummy
+    case ${CC_FOR_BUILD-},${HOST_CC-},${CC-} in
+	,,)    echo "int x;" > "$dummy.c"
+	       for driver in cc gcc c17 c99 c89 ; do
+		   if ($driver -c -o "$dummy.o" "$dummy.c") >/dev/null 2>&1 ; then
+		       CC_FOR_BUILD=$driver
+		       break
+		   fi
+	       done
+	       if test x"$CC_FOR_BUILD" = x ; then
+		   CC_FOR_BUILD=no_compiler_found
+	       fi
+	       ;;
+	,,*)   CC_FOR_BUILD=$CC ;;
+	,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+    esac
+}
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 1994-08-24)
+if test -f /.attbin/uname ; then
+	PATH=$PATH:/.attbin ; export PATH
+fi
+
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+case $UNAME_SYSTEM in
+Linux|GNU|GNU/*)
+	LIBC=unknown
+
+	set_cc_for_build
+	cat <<-EOF > "$dummy.c"
+	#if defined(__ANDROID__)
+	LIBC=android
+	#else
+	#include <features.h>
+	#if defined(__UCLIBC__)
+	LIBC=uclibc
+	#elif defined(__dietlibc__)
+	LIBC=dietlibc
+	#elif defined(__GLIBC__)
+	LIBC=gnu
+	#elif defined(__LLVM_LIBC__)
+	LIBC=llvm
+	#else
+	#include <stdarg.h>
+	/* First heuristic to detect musl libc.  */
+	#ifdef __DEFINED_va_list
+	LIBC=musl
+	#endif
+	#endif
+	#endif
+	EOF
+	cc_set_libc=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
+	eval "$cc_set_libc"
+
+	# Second heuristic to detect musl libc.
+	if [ "$LIBC" = unknown ] &&
+	   command -v ldd >/dev/null &&
+	   ldd --version 2>&1 | grep -q ^musl; then
+		LIBC=musl
+	fi
+
+	# If the system lacks a compiler, then just pick glibc.
+	# We could probably try harder.
+	if [ "$LIBC" = unknown ]; then
+		LIBC=gnu
+	fi
+	;;
+esac
+
+# Note: order is significant - the case branches are not exclusive.
+
+case $UNAME_MACHINE:$UNAME_SYSTEM:$UNAME_RELEASE:$UNAME_VERSION in
+    *:NetBSD:*:*)
+	# NetBSD (nbsd) targets should (where applicable) match one or
+	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+	# switched to ELF, *-*-netbsd* would select the old
+	# object file format.  This provides both forward
+	# compatibility and a consistent mechanism for selecting the
+	# object file format.
+	#
+	# Note: NetBSD doesn't particularly care about the vendor
+	# portion of the name.  We always set it to "unknown".
+	UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
+	    /sbin/sysctl -n hw.machine_arch 2>/dev/null || \
+	    /usr/sbin/sysctl -n hw.machine_arch 2>/dev/null || \
+	    echo unknown)`
+	case $UNAME_MACHINE_ARCH in
+	    aarch64eb) machine=aarch64_be-unknown ;;
+	    armeb) machine=armeb-unknown ;;
+	    arm*) machine=arm-unknown ;;
+	    sh3el) machine=shl-unknown ;;
+	    sh3eb) machine=sh-unknown ;;
+	    sh5el) machine=sh5le-unknown ;;
+	    earmv*)
+		arch=`echo "$UNAME_MACHINE_ARCH" | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
+		endian=`echo "$UNAME_MACHINE_ARCH" | sed -ne 's,^.*\(eb\)$,\1,p'`
+		machine=${arch}${endian}-unknown
+		;;
+	    *) machine=$UNAME_MACHINE_ARCH-unknown ;;
+	esac
+	# The Operating System including object format, if it has switched
+	# to ELF recently (or will in the future) and ABI.
+	case $UNAME_MACHINE_ARCH in
+	    earm*)
+		os=netbsdelf
+		;;
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+		set_cc_for_build
+		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+			| grep -q __ELF__
+		then
+		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+		    # Return netbsd for either.  FIX?
+		    os=netbsd
+		else
+		    os=netbsdelf
+		fi
+		;;
+	    *)
+		os=netbsd
+		;;
+	esac
+	# Determine ABI tags.
+	case $UNAME_MACHINE_ARCH in
+	    earm*)
+		expr='s/^earmv[0-9]/-eabi/;s/eb$//'
+		abi=`echo "$UNAME_MACHINE_ARCH" | sed -e "$expr"`
+		;;
+	esac
+	# The OS release
+	# Debian GNU/NetBSD machines have a different userland, and
+	# thus, need a distinct triplet. However, they do not need
+	# kernel version information, so it can be replaced with a
+	# suitable tag, in the style of linux-gnu.
+	case $UNAME_VERSION in
+	    Debian*)
+		release='-gnu'
+		;;
+	    *)
+		release=`echo "$UNAME_RELEASE" | sed -e 's/[-_].*//' | cut -d. -f1,2`
+		;;
+	esac
+	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+	# contains redundant information, the shorter form:
+	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+	GUESS=$machine-${os}${release}${abi-}
+	;;
+    *:Bitrig:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
+	GUESS=$UNAME_MACHINE_ARCH-unknown-bitrig$UNAME_RELEASE
+	;;
+    *:OpenBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+	GUESS=$UNAME_MACHINE_ARCH-unknown-openbsd$UNAME_RELEASE
+	;;
+    *:SecBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/SecBSD.//'`
+	GUESS=$UNAME_MACHINE_ARCH-unknown-secbsd$UNAME_RELEASE
+	;;
+    *:LibertyBSD:*:*)
+	UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
+	GUESS=$UNAME_MACHINE_ARCH-unknown-libertybsd$UNAME_RELEASE
+	;;
+    *:MidnightBSD:*:*)
+	GUESS=$UNAME_MACHINE-unknown-midnightbsd$UNAME_RELEASE
+	;;
+    *:ekkoBSD:*:*)
+	GUESS=$UNAME_MACHINE-unknown-ekkobsd$UNAME_RELEASE
+	;;
+    *:SolidBSD:*:*)
+	GUESS=$UNAME_MACHINE-unknown-solidbsd$UNAME_RELEASE
+	;;
+    *:OS108:*:*)
+	GUESS=$UNAME_MACHINE-unknown-os108_$UNAME_RELEASE
+	;;
+    macppc:MirBSD:*:*)
+	GUESS=powerpc-unknown-mirbsd$UNAME_RELEASE
+	;;
+    *:MirBSD:*:*)
+	GUESS=$UNAME_MACHINE-unknown-mirbsd$UNAME_RELEASE
+	;;
+    *:Sortix:*:*)
+	GUESS=$UNAME_MACHINE-unknown-sortix
+	;;
+    *:Twizzler:*:*)
+	GUESS=$UNAME_MACHINE-unknown-twizzler
+	;;
+    *:Redox:*:*)
+	GUESS=$UNAME_MACHINE-unknown-redox
+	;;
+    mips:OSF1:*.*)
+	GUESS=mips-dec-osf1
+	;;
+    alpha:OSF1:*:*)
+	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
+	trap '' 0
+	case $UNAME_RELEASE in
+	*4.0)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+		;;
+	*5.*)
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+		;;
+	esac
+	# According to Compaq, /usr/sbin/psrinfo has been available on
+	# OSF/1 and Tru64 systems produced since 1995.  I hope that
+	# covers most systems running today.  This code pipes the CPU
+	# types through head -n 1, so we only detect the type of CPU 0.
+	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+	case $ALPHA_CPU_TYPE in
+	    "EV4 (21064)")
+		UNAME_MACHINE=alpha ;;
+	    "EV4.5 (21064)")
+		UNAME_MACHINE=alpha ;;
+	    "LCA4 (21066/21068)")
+		UNAME_MACHINE=alpha ;;
+	    "EV5 (21164)")
+		UNAME_MACHINE=alphaev5 ;;
+	    "EV5.6 (21164A)")
+		UNAME_MACHINE=alphaev56 ;;
+	    "EV5.6 (21164PC)")
+		UNAME_MACHINE=alphapca56 ;;
+	    "EV5.7 (21164PC)")
+		UNAME_MACHINE=alphapca57 ;;
+	    "EV6 (21264)")
+		UNAME_MACHINE=alphaev6 ;;
+	    "EV6.7 (21264A)")
+		UNAME_MACHINE=alphaev67 ;;
+	    "EV6.8CB (21264C)")
+		UNAME_MACHINE=alphaev68 ;;
+	    "EV6.8AL (21264B)")
+		UNAME_MACHINE=alphaev68 ;;
+	    "EV6.8CX (21264D)")
+		UNAME_MACHINE=alphaev68 ;;
+	    "EV6.9A (21264/EV69A)")
+		UNAME_MACHINE=alphaev69 ;;
+	    "EV7 (21364)")
+		UNAME_MACHINE=alphaev7 ;;
+	    "EV7.9 (21364A)")
+		UNAME_MACHINE=alphaev79 ;;
+	esac
+	# A Pn.n version is a patched version.
+	# A Vn.n version is a released version.
+	# A Tn.n version is a released field test version.
+	# A Xn.n version is an unreleased experimental baselevel.
+	# 1.2 uses "1.2" for uname -r.
+	OSF_REL=`echo "$UNAME_RELEASE" | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+	GUESS=$UNAME_MACHINE-dec-osf$OSF_REL
+	;;
+    Amiga*:UNIX_System_V:4.0:*)
+	GUESS=m68k-unknown-sysv4
+	;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+	GUESS=$UNAME_MACHINE-unknown-amigaos
+	;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+	GUESS=$UNAME_MACHINE-unknown-morphos
+	;;
+    *:OS/390:*:*)
+	GUESS=i370-ibm-openedition
+	;;
+    *:z/VM:*:*)
+	GUESS=s390-ibm-zvmoe
+	;;
+    *:OS400:*:*)
+	GUESS=powerpc-ibm-os400
+	;;
+    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+	GUESS=arm-acorn-riscix$UNAME_RELEASE
+	;;
+    arm*:riscos:*:*|arm*:RISCOS:*:*)
+	GUESS=arm-unknown-riscos
+	;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
+	GUESS=hppa1.1-hitachi-hiuxmpp
+	;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+	# akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+	case `(/bin/universe) 2>/dev/null` in
+	    att) GUESS=pyramid-pyramid-sysv3 ;;
+	    *)   GUESS=pyramid-pyramid-bsd   ;;
+	esac
+	;;
+    NILE*:*:*:dcosx)
+	GUESS=pyramid-pyramid-svr4
+	;;
+    DRS?6000:unix:4.0:6*)
+	GUESS=sparc-icl-nx6
+	;;
+    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
+	case `/usr/bin/uname -p` in
+	    sparc) GUESS=sparc-icl-nx7 ;;
+	esac
+	;;
+    s390x:SunOS:*:*)
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=$UNAME_MACHINE-ibm-solaris2$SUN_REL
+	;;
+    sun4H:SunOS:5.*:*)
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=sparc-hal-solaris2$SUN_REL
+	;;
+    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=sparc-sun-solaris2$SUN_REL
+	;;
+    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
+	GUESS=i386-pc-auroraux$UNAME_RELEASE
+	;;
+    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
+	set_cc_for_build
+	SUN_ARCH=i386
+	# If there is a compiler, see if it is configured for 64-bit objects.
+	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
+	# This test works for both compilers.
+	if test "$CC_FOR_BUILD" != no_compiler_found; then
+	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
+		(CCOPTS="" $CC_FOR_BUILD -m64 -E - 2>/dev/null) | \
+		grep IS_64BIT_ARCH >/dev/null
+	    then
+		SUN_ARCH=x86_64
+	    fi
+	fi
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=$SUN_ARCH-pc-solaris2$SUN_REL
+	;;
+    sun4*:SunOS:6*:*)
+	# According to config.sub, this is the proper way to canonicalize
+	# SunOS6.  Hard to guess exactly what SunOS6 will be like, but
+	# it's likely to be more like Solaris than SunOS4.
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=sparc-sun-solaris3$SUN_REL
+	;;
+    sun4*:SunOS:*:*)
+	case `/usr/bin/arch -k` in
+	    Series*|S4*)
+		UNAME_RELEASE=`uname -v`
+		;;
+	esac
+	# Japanese Language versions have a version number like '4.1.3-JL'.
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/'`
+	GUESS=sparc-sun-sunos$SUN_REL
+	;;
+    sun3*:SunOS:*:*)
+	GUESS=m68k-sun-sunos$UNAME_RELEASE
+	;;
+    sun*:*:4.2BSD:*)
+	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+	test "x$UNAME_RELEASE" = x && UNAME_RELEASE=3
+	case `/bin/arch` in
+	    sun3)
+		GUESS=m68k-sun-sunos$UNAME_RELEASE
+		;;
+	    sun4)
+		GUESS=sparc-sun-sunos$UNAME_RELEASE
+		;;
+	esac
+	;;
+    aushp:SunOS:*:*)
+	GUESS=sparc-auspex-sunos$UNAME_RELEASE
+	;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+	GUESS=m68k-atari-mint$UNAME_RELEASE
+	;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+	GUESS=m68k-atari-mint$UNAME_RELEASE
+	;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+	GUESS=m68k-atari-mint$UNAME_RELEASE
+	;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+	GUESS=m68k-milan-mint$UNAME_RELEASE
+	;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+	GUESS=m68k-hades-mint$UNAME_RELEASE
+	;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+	GUESS=m68k-unknown-mint$UNAME_RELEASE
+	;;
+    m68k:machten:*:*)
+	GUESS=m68k-apple-machten$UNAME_RELEASE
+	;;
+    powerpc:machten:*:*)
+	GUESS=powerpc-apple-machten$UNAME_RELEASE
+	;;
+    RISC*:Mach:*:*)
+	GUESS=mips-dec-mach_bsd4.3
+	;;
+    RISC*:ULTRIX:*:*)
+	GUESS=mips-dec-ultrix$UNAME_RELEASE
+	;;
+    VAX*:ULTRIX*:*:*)
+	GUESS=vax-dec-ultrix$UNAME_RELEASE
+	;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
+	GUESS=clipper-intergraph-clix$UNAME_RELEASE
+	;;
+    mips:*:*:UMIPS | mips:*:*:RISCos)
+	set_cc_for_build
+	sed 's/^	//' << EOF > "$dummy.c"
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+	int main (int argc, char *argv[]) {
+#else
+	int main (argc, argv) int argc; char *argv[]; {
+#endif
+	#if defined (host_mips) && defined (MIPSEB)
+	#if defined (SYSTYPE_SYSV)
+	  printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_SVR4)
+	  printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0);
+	#endif
+	#if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+	  printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0);
+	#endif
+	#endif
+	  exit (-1);
+	}
+EOF
+	$CC_FOR_BUILD -o "$dummy" "$dummy.c" &&
+	  dummyarg=`echo "$UNAME_RELEASE" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+	  SYSTEM_NAME=`"$dummy" "$dummyarg"` &&
+	    { echo "$SYSTEM_NAME"; exit; }
+	GUESS=mips-mips-riscos$UNAME_RELEASE
+	;;
+    Motorola:PowerMAX_OS:*:*)
+	GUESS=powerpc-motorola-powermax
+	;;
+    Motorola:*:4.3:PL8-*)
+	GUESS=powerpc-harris-powermax
+	;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+	GUESS=powerpc-harris-powermax
+	;;
+    Night_Hawk:Power_UNIX:*:*)
+	GUESS=powerpc-harris-powerunix
+	;;
+    m88k:CX/UX:7*:*)
+	GUESS=m88k-harris-cxux7
+	;;
+    m88k:*:4*:R4*)
+	GUESS=m88k-motorola-sysv4
+	;;
+    m88k:*:3*:R3*)
+	GUESS=m88k-motorola-sysv3
+	;;
+    AViiON:dgux:*:*)
+	# DG/UX returns AViiON for all architectures
+	UNAME_PROCESSOR=`/usr/bin/uname -p`
+	if test "$UNAME_PROCESSOR" = mc88100 || test "$UNAME_PROCESSOR" = mc88110
+	then
+	    if test "$TARGET_BINARY_INTERFACE"x = m88kdguxelfx || \
+	       test "$TARGET_BINARY_INTERFACE"x = x
+	    then
+		GUESS=m88k-dg-dgux$UNAME_RELEASE
+	    else
+		GUESS=m88k-dg-dguxbcs$UNAME_RELEASE
+	    fi
+	else
+	    GUESS=i586-dg-dgux$UNAME_RELEASE
+	fi
+	;;
+    M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
+	GUESS=m88k-dolphin-sysv3
+	;;
+    M88*:*:R3*:*)
+	# Delta 88k system running SVR3
+	GUESS=m88k-motorola-sysv3
+	;;
+    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+	GUESS=m88k-tektronix-sysv3
+	;;
+    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+	GUESS=m68k-tektronix-bsd
+	;;
+    *:IRIX*:*:*)
+	IRIX_REL=`echo "$UNAME_RELEASE" | sed -e 's/-/_/g'`
+	GUESS=mips-sgi-irix$IRIX_REL
+	;;
+    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+	GUESS=romp-ibm-aix    # uname -m gives an 8 hex-code CPU id
+	;;                    # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
+	GUESS=i386-ibm-aix
+	;;
+    ia64:AIX:*:*)
+	if test -x /usr/bin/oslevel ; then
+		IBM_REV=`/usr/bin/oslevel`
+	else
+		IBM_REV=$UNAME_VERSION.$UNAME_RELEASE
+	fi
+	GUESS=$UNAME_MACHINE-ibm-aix$IBM_REV
+	;;
+    *:AIX:2:3)
+	if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+		set_cc_for_build
+		sed 's/^		//' << EOF > "$dummy.c"
+		#include <sys/systemcfg.h>
+
+		int
+		main ()
+			{
+			if (!__power_pc())
+				exit(1);
+			puts("powerpc-ibm-aix3.2.5");
+			exit(0);
+			}
+EOF
+		if $CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"`
+		then
+			GUESS=$SYSTEM_NAME
+		else
+			GUESS=rs6000-ibm-aix3.2.5
+		fi
+	elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+		GUESS=rs6000-ibm-aix3.2.4
+	else
+		GUESS=rs6000-ibm-aix3.2
+	fi
+	;;
+    *:AIX:*:[4567])
+	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+	if /usr/sbin/lsattr -El "$IBM_CPU_ID" | grep ' POWER' >/dev/null 2>&1; then
+		IBM_ARCH=rs6000
+	else
+		IBM_ARCH=powerpc
+	fi
+	if test -x /usr/bin/lslpp ; then
+		IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc | \
+			   awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
+	else
+		IBM_REV=$UNAME_VERSION.$UNAME_RELEASE
+	fi
+	GUESS=$IBM_ARCH-ibm-aix$IBM_REV
+	;;
+    *:AIX:*:*)
+	GUESS=rs6000-ibm-aix
+	;;
+    ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*)
+	GUESS=romp-ibm-bsd4.4
+	;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
+	GUESS=romp-ibm-bsd$UNAME_RELEASE    # 4.3 with uname added to
+	;;                                  # report: romp-ibm BSD 4.3
+    *:BOSX:*:*)
+	GUESS=rs6000-bull-bosx
+	;;
+    DPX/2?00:B.O.S.:*:*)
+	GUESS=m68k-bull-sysv3
+	;;
+    9000/[34]??:4.3bsd:1.*:*)
+	GUESS=m68k-hp-bsd
+	;;
+    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+	GUESS=m68k-hp-bsd4.4
+	;;
+    9000/[34678]??:HP-UX:*:*)
+	HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'`
+	case $UNAME_MACHINE in
+	    9000/31?)            HP_ARCH=m68000 ;;
+	    9000/[34]??)         HP_ARCH=m68k ;;
+	    9000/[678][0-9][0-9])
+		if test -x /usr/bin/getconf; then
+		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+		    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+		    case $sc_cpu_version in
+		      523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
+		      528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
+		      532)                      # CPU_PA_RISC2_0
+			case $sc_kernel_bits in
+			  32) HP_ARCH=hppa2.0n ;;
+			  64) HP_ARCH=hppa2.0w ;;
+			  '') HP_ARCH=hppa2.0 ;;   # HP-UX 10.20
+			esac ;;
+		    esac
+		fi
+		if test "$HP_ARCH" = ""; then
+		    set_cc_for_build
+		    sed 's/^		//' << EOF > "$dummy.c"
+
+		#define _HPUX_SOURCE
+		#include <stdlib.h>
+		#include <unistd.h>
+
+		int
+		main ()
+		{
+		#if defined(_SC_KERNEL_BITS)
+		    long bits = sysconf(_SC_KERNEL_BITS);
+		#endif
+		    long cpu  = sysconf (_SC_CPU_VERSION);
+
+		    switch (cpu)
+			{
+			case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+			case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+			case CPU_PA_RISC2_0:
+		#if defined(_SC_KERNEL_BITS)
+			    switch (bits)
+				{
+				case 64: puts ("hppa2.0w"); break;
+				case 32: puts ("hppa2.0n"); break;
+				default: puts ("hppa2.0"); break;
+				} break;
+		#else  /* !defined(_SC_KERNEL_BITS) */
+			    puts ("hppa2.0"); break;
+		#endif
+			default: puts ("hppa1.0"); break;
+			}
+		    exit (0);
+		}
+EOF
+		    (CCOPTS="" $CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null) && HP_ARCH=`"$dummy"`
+		    test -z "$HP_ARCH" && HP_ARCH=hppa
+		fi ;;
+	esac
+	if test "$HP_ARCH" = hppa2.0w
+	then
+	    set_cc_for_build
+
+	    # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+	    # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
+	    # generating 64-bit code.  GNU and HP use different nomenclature:
+	    #
+	    # $ CC_FOR_BUILD=cc ./config.guess
+	    # => hppa2.0w-hp-hpux11.23
+	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+	    # => hppa64-hp-hpux11.23
+
+	    if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
+		grep -q __LP64__
+	    then
+		HP_ARCH=hppa2.0w
+	    else
+		HP_ARCH=hppa64
+	    fi
+	fi
+	GUESS=$HP_ARCH-hp-hpux$HPUX_REV
+	;;
+    ia64:HP-UX:*:*)
+	HPUX_REV=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*.[0B]*//'`
+	GUESS=ia64-hp-hpux$HPUX_REV
+	;;
+    3050*:HI-UX:*:*)
+	set_cc_for_build
+	sed 's/^	//' << EOF > "$dummy.c"
+	#include <unistd.h>
+	int
+	main ()
+	{
+	  long cpu = sysconf (_SC_CPU_VERSION);
+	  /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+	     true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
+	     results, however.  */
+	  if (CPU_IS_PA_RISC (cpu))
+	    {
+	      switch (cpu)
+		{
+		  case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+		  case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+		  default: puts ("hppa-hitachi-hiuxwe2"); break;
+		}
+	    }
+	  else if (CPU_IS_HP_MC68K (cpu))
+	    puts ("m68k-hitachi-hiuxwe2");
+	  else puts ("unknown-hitachi-hiuxwe2");
+	  exit (0);
+	}
+EOF
+	$CC_FOR_BUILD -o "$dummy" "$dummy.c" && SYSTEM_NAME=`"$dummy"` &&
+		{ echo "$SYSTEM_NAME"; exit; }
+	GUESS=unknown-hitachi-hiuxwe2
+	;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*)
+	GUESS=hppa1.1-hp-bsd
+	;;
+    9000/8??:4.3bsd:*:*)
+	GUESS=hppa1.0-hp-bsd
+	;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+	GUESS=hppa1.0-hp-mpeix
+	;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*)
+	GUESS=hppa1.1-hp-osf
+	;;
+    hp8??:OSF1:*:*)
+	GUESS=hppa1.0-hp-osf
+	;;
+    i*86:OSF1:*:*)
+	if test -x /usr/sbin/sysversion ; then
+	    GUESS=$UNAME_MACHINE-unknown-osf1mk
+	else
+	    GUESS=$UNAME_MACHINE-unknown-osf1
+	fi
+	;;
+    parisc*:Lites*:*:*)
+	GUESS=hppa1.1-hp-lites
+	;;
+    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+	GUESS=c1-convex-bsd
+	;;
+    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+	GUESS=c34-convex-bsd
+	;;
+    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+	GUESS=c38-convex-bsd
+	;;
+    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+	GUESS=c4-convex-bsd
+	;;
+    CRAY*Y-MP:*:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=ymp-cray-unicos$CRAY_REL
+	;;
+    CRAY*[A-Z]90:*:*:*)
+	echo "$UNAME_MACHINE"-cray-unicos"$UNAME_RELEASE" \
+	| sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+	      -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+	      -e 's/\.[^.]*$/.X/'
+	exit ;;
+    CRAY*TS:*:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=t90-cray-unicos$CRAY_REL
+	;;
+    CRAY*T3E:*:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=alphaev5-cray-unicosmk$CRAY_REL
+	;;
+    CRAY*SV1:*:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=sv1-cray-unicos$CRAY_REL
+	;;
+    *:UNICOS/mp:*:*)
+	CRAY_REL=`echo "$UNAME_RELEASE" | sed -e 's/\.[^.]*$/.X/'`
+	GUESS=craynv-cray-unicosmp$CRAY_REL
+	;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+	FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+	FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
+	FUJITSU_REL=`echo "$UNAME_RELEASE" | sed -e 's/ /_/'`
+	GUESS=${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}
+	;;
+    5000:UNIX_System_V:4.*:*)
+	FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
+	FUJITSU_REL=`echo "$UNAME_RELEASE" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
+	GUESS=sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}
+	;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+	GUESS=$UNAME_MACHINE-pc-bsdi$UNAME_RELEASE
+	;;
+    sparc*:BSD/OS:*:*)
+	GUESS=sparc-unknown-bsdi$UNAME_RELEASE
+	;;
+    *:BSD/OS:*:*)
+	GUESS=$UNAME_MACHINE-unknown-bsdi$UNAME_RELEASE
+	;;
+    arm:FreeBSD:*:*)
+	UNAME_PROCESSOR=`uname -p`
+	set_cc_for_build
+	if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+	    | grep -q __ARM_PCS_VFP
+	then
+	    FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	    GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabi
+	else
+	    FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	    GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL-gnueabihf
+	fi
+	;;
+    *:FreeBSD:*:*)
+	UNAME_PROCESSOR=`uname -p`
+	case $UNAME_PROCESSOR in
+	    amd64)
+		UNAME_PROCESSOR=x86_64 ;;
+	    i386)
+		UNAME_PROCESSOR=i586 ;;
+	esac
+	FREEBSD_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	GUESS=$UNAME_PROCESSOR-unknown-freebsd$FREEBSD_REL
+	;;
+    i*:CYGWIN*:*)
+	GUESS=$UNAME_MACHINE-pc-cygwin
+	;;
+    *:MINGW64*:*)
+	GUESS=$UNAME_MACHINE-pc-mingw64
+	;;
+    *:MINGW*:*)
+	GUESS=$UNAME_MACHINE-pc-mingw32
+	;;
+    *:MSYS*:*)
+	GUESS=$UNAME_MACHINE-pc-msys
+	;;
+    i*:PW*:*)
+	GUESS=$UNAME_MACHINE-pc-pw32
+	;;
+    *:SerenityOS:*:*)
+        GUESS=$UNAME_MACHINE-pc-serenity
+        ;;
+    *:Interix*:*)
+	case $UNAME_MACHINE in
+	    x86)
+		GUESS=i586-pc-interix$UNAME_RELEASE
+		;;
+	    authenticamd | genuineintel | EM64T)
+		GUESS=x86_64-unknown-interix$UNAME_RELEASE
+		;;
+	    IA64)
+		GUESS=ia64-unknown-interix$UNAME_RELEASE
+		;;
+	esac ;;
+    i*:UWIN*:*)
+	GUESS=$UNAME_MACHINE-pc-uwin
+	;;
+    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+	GUESS=x86_64-pc-cygwin
+	;;
+    prep*:SunOS:5.*:*)
+	SUN_REL=`echo "$UNAME_RELEASE" | sed -e 's/[^.]*//'`
+	GUESS=powerpcle-unknown-solaris2$SUN_REL
+	;;
+    *:GNU:*:*)
+	# the GNU system
+	GNU_ARCH=`echo "$UNAME_MACHINE" | sed -e 's,[-/].*$,,'`
+	GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's,/.*$,,'`
+	GUESS=$GNU_ARCH-unknown-$LIBC$GNU_REL
+	;;
+    *:GNU/*:*:*)
+	# other systems with GNU libc and userland
+	GNU_SYS=`echo "$UNAME_SYSTEM" | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"`
+	GNU_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	GUESS=$UNAME_MACHINE-unknown-$GNU_SYS$GNU_REL-$LIBC
+	;;
+    x86_64:[Mm]anagarm:*:*|i?86:[Mm]anagarm:*:*)
+	GUESS="$UNAME_MACHINE-pc-managarm-mlibc"
+	;;
+    *:[Mm]anagarm:*:*)
+	GUESS="$UNAME_MACHINE-unknown-managarm-mlibc"
+	;;
+    *:Minix:*:*)
+	GUESS=$UNAME_MACHINE-unknown-minix
+	;;
+    aarch64:Linux:*:*)
+	set_cc_for_build
+	CPU=$UNAME_MACHINE
+	LIBCABI=$LIBC
+	if test "$CC_FOR_BUILD" != no_compiler_found; then
+	    ABI=64
+	    sed 's/^	    //' << EOF > "$dummy.c"
+	    #ifdef __ARM_EABI__
+	    #ifdef __ARM_PCS_VFP
+	    ABI=eabihf
+	    #else
+	    ABI=eabi
+	    #endif
+	    #endif
+EOF
+	    cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'`
+	    eval "$cc_set_abi"
+	    case $ABI in
+		eabi | eabihf) CPU=armv8l; LIBCABI=$LIBC$ABI ;;
+	    esac
+	fi
+	GUESS=$CPU-unknown-linux-$LIBCABI
+	;;
+    aarch64_be:Linux:*:*)
+	UNAME_MACHINE=aarch64_be
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' /proc/cpuinfo 2>/dev/null` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+	esac
+	objdump --private-headers /bin/sh | grep -q ld.so.1
+	if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    arc:Linux:*:* | arceb:Linux:*:* | arc32:Linux:*:* | arc64:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    arm*:Linux:*:*)
+	set_cc_for_build
+	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
+	    | grep -q __ARM_EABI__
+	then
+	    GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	else
+	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+		| grep -q __ARM_PCS_VFP
+	    then
+		GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabi
+	    else
+		GUESS=$UNAME_MACHINE-unknown-linux-${LIBC}eabihf
+	    fi
+	fi
+	;;
+    avr32*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    cris:Linux:*:*)
+	GUESS=$UNAME_MACHINE-axis-linux-$LIBC
+	;;
+    crisv32:Linux:*:*)
+	GUESS=$UNAME_MACHINE-axis-linux-$LIBC
+	;;
+    e2k:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    frv:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    hexagon:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    i*86:Linux:*:*)
+	GUESS=$UNAME_MACHINE-pc-linux-$LIBC
+	;;
+    ia64:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    k1om:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    kvx:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    kvx:cos:*:*)
+	GUESS=$UNAME_MACHINE-unknown-cos
+	;;
+    kvx:mbr:*:*)
+	GUESS=$UNAME_MACHINE-unknown-mbr
+	;;
+    loongarch32:Linux:*:* | loongarch64:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    m32r*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    m68*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    mips:Linux:*:* | mips64:Linux:*:*)
+	set_cc_for_build
+	IS_GLIBC=0
+	test x"${LIBC}" = xgnu && IS_GLIBC=1
+	sed 's/^	//' << EOF > "$dummy.c"
+	#undef CPU
+	#undef mips
+	#undef mipsel
+	#undef mips64
+	#undef mips64el
+	#if ${IS_GLIBC} && defined(_ABI64)
+	LIBCABI=gnuabi64
+	#else
+	#if ${IS_GLIBC} && defined(_ABIN32)
+	LIBCABI=gnuabin32
+	#else
+	LIBCABI=${LIBC}
+	#endif
+	#endif
+
+	#if ${IS_GLIBC} && defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
+	CPU=mipsisa64r6
+	#else
+	#if ${IS_GLIBC} && !defined(__mips64) && defined(__mips_isa_rev) && __mips_isa_rev>=6
+	CPU=mipsisa32r6
+	#else
+	#if defined(__mips64)
+	CPU=mips64
+	#else
+	CPU=mips
+	#endif
+	#endif
+	#endif
+
+	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+	MIPS_ENDIAN=el
+	#else
+	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+	MIPS_ENDIAN=
+	#else
+	MIPS_ENDIAN=
+	#endif
+	#endif
+EOF
+	cc_set_vars=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^CPU\|^MIPS_ENDIAN\|^LIBCABI'`
+	eval "$cc_set_vars"
+	test "x$CPU" != x && { echo "$CPU${MIPS_ENDIAN}-unknown-linux-$LIBCABI"; exit; }
+	;;
+    mips64el:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    openrisc*:Linux:*:*)
+	GUESS=or1k-unknown-linux-$LIBC
+	;;
+    or32:Linux:*:* | or1k*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    padre:Linux:*:*)
+	GUESS=sparc-unknown-linux-$LIBC
+	;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	GUESS=hppa64-unknown-linux-$LIBC
+	;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+	# Look for CPU level
+	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+	  PA7*) GUESS=hppa1.1-unknown-linux-$LIBC ;;
+	  PA8*) GUESS=hppa2.0-unknown-linux-$LIBC ;;
+	  *)    GUESS=hppa-unknown-linux-$LIBC ;;
+	esac
+	;;
+    ppc64:Linux:*:*)
+	GUESS=powerpc64-unknown-linux-$LIBC
+	;;
+    ppc:Linux:*:*)
+	GUESS=powerpc-unknown-linux-$LIBC
+	;;
+    ppc64le:Linux:*:*)
+	GUESS=powerpc64le-unknown-linux-$LIBC
+	;;
+    ppcle:Linux:*:*)
+	GUESS=powerpcle-unknown-linux-$LIBC
+	;;
+    riscv32:Linux:*:* | riscv32be:Linux:*:* | riscv64:Linux:*:* | riscv64be:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+	GUESS=$UNAME_MACHINE-ibm-linux-$LIBC
+	;;
+    sh64*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    sh*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    tile*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    vax:Linux:*:*)
+	GUESS=$UNAME_MACHINE-dec-linux-$LIBC
+	;;
+    x86_64:Linux:*:*)
+	set_cc_for_build
+	CPU=$UNAME_MACHINE
+	LIBCABI=$LIBC
+	if test "$CC_FOR_BUILD" != no_compiler_found; then
+	    ABI=64
+	    sed 's/^	    //' << EOF > "$dummy.c"
+	    #ifdef __i386__
+	    ABI=x86
+	    #else
+	    #ifdef __ILP32__
+	    ABI=x32
+	    #endif
+	    #endif
+EOF
+	    cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'`
+	    eval "$cc_set_abi"
+	    case $ABI in
+		x86) CPU=i686 ;;
+		x32) LIBCABI=${LIBC}x32 ;;
+	    esac
+	fi
+	GUESS=$CPU-pc-linux-$LIBCABI
+	;;
+    xtensa*:Linux:*:*)
+	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
+	;;
+    i*86:DYNIX/ptx:4*:*)
+	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+	# earlier versions are messed up and put the nodename in both
+	# sysname and nodename.
+	GUESS=i386-sequent-sysv4
+	;;
+    i*86:UNIX_SV:4.2MP:2.*)
+	# Unixware is an offshoot of SVR4, but it has its own version
+	# number series starting with 2...
+	# I am not positive that other SVR4 systems won't match this,
+	# I just have to hope.  -- rms.
+	# Use sysv4.2uw... so that sysv4* matches it.
+	GUESS=$UNAME_MACHINE-pc-sysv4.2uw$UNAME_VERSION
+	;;
+    i*86:OS/2:*:*)
+	# If we were able to find 'uname', then EMX Unix compatibility
+	# is probably installed.
+	GUESS=$UNAME_MACHINE-pc-os2-emx
+	;;
+    i*86:XTS-300:*:STOP)
+	GUESS=$UNAME_MACHINE-unknown-stop
+	;;
+    i*86:atheos:*:*)
+	GUESS=$UNAME_MACHINE-unknown-atheos
+	;;
+    i*86:syllable:*:*)
+	GUESS=$UNAME_MACHINE-pc-syllable
+	;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
+	GUESS=i386-unknown-lynxos$UNAME_RELEASE
+	;;
+    i*86:*DOS:*:*)
+	GUESS=$UNAME_MACHINE-pc-msdosdjgpp
+	;;
+    i*86:*:4.*:*)
+	UNAME_REL=`echo "$UNAME_RELEASE" | sed 's/\/MP$//'`
+	if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+		GUESS=$UNAME_MACHINE-univel-sysv$UNAME_REL
+	else
+		GUESS=$UNAME_MACHINE-pc-sysv$UNAME_REL
+	fi
+	;;
+    i*86:*:5:[678]*)
+	# UnixWare 7.x, OpenUNIX and OpenServer 6.
+	case `/bin/uname -X | grep "^Machine"` in
+	    *486*)	     UNAME_MACHINE=i486 ;;
+	    *Pentium)	     UNAME_MACHINE=i586 ;;
+	    *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+	esac
+	GUESS=$UNAME_MACHINE-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+	;;
+    i*86:*:3.2:*)
+	if test -f /usr/options/cb.name; then
+		UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+		GUESS=$UNAME_MACHINE-pc-isc$UNAME_REL
+	elif /bin/uname -X 2>/dev/null >/dev/null ; then
+		UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+		(/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+		(/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+			&& UNAME_MACHINE=i586
+		(/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		(/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+			&& UNAME_MACHINE=i686
+		GUESS=$UNAME_MACHINE-pc-sco$UNAME_REL
+	else
+		GUESS=$UNAME_MACHINE-pc-sysv32
+	fi
+	;;
+    pc:*:*:*)
+	# Left here for compatibility:
+	# uname -m prints for DJGPP always 'pc', but it prints nothing about
+	# the processor, so we play safe by assuming i586.
+	# Note: whatever this is, it MUST be the same as what config.sub
+	# prints for the "djgpp" host, or else GDB configure will decide that
+	# this is a cross-build.
+	GUESS=i586-pc-msdosdjgpp
+	;;
+    Intel:Mach:3*:*)
+	GUESS=i386-pc-mach3
+	;;
+    paragon:*:*:*)
+	GUESS=i860-intel-osf1
+	;;
+    i860:*:4.*:*) # i860-SVR4
+	if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+	  GUESS=i860-stardent-sysv$UNAME_RELEASE    # Stardent Vistra i860-SVR4
+	else # Add other i860-SVR4 vendors below as they are discovered.
+	  GUESS=i860-unknown-sysv$UNAME_RELEASE     # Unknown i860-SVR4
+	fi
+	;;
+    mini*:CTIX:SYS*5:*)
+	# "miniframe"
+	GUESS=m68010-convergent-sysv
+	;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+	GUESS=m68k-convergent-sysv
+	;;
+    M680?0:D-NIX:5.3:*)
+	GUESS=m68k-diab-dnix
+	;;
+    M68*:*:R3V[5678]*:*)
+	test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
+	OS_REL=''
+	test -r /etc/.relid \
+	&& OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	  && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
+    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4; exit; } ;;
+    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
+	OS_REL='.3'
+	test -r /etc/.relid \
+	    && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	    && { echo i486-ncr-sysv4.3"$OS_REL"; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+	    && { echo i586-ncr-sysv4.3"$OS_REL"; exit; }
+	/bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
+	    && { echo i586-ncr-sysv4.3"$OS_REL"; exit; } ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
+	GUESS=m68k-unknown-lynxos$UNAME_RELEASE
+	;;
+    mc68030:UNIX_System_V:4.*:*)
+	GUESS=m68k-atari-sysv4
+	;;
+    TSUNAMI:LynxOS:2.*:*)
+	GUESS=sparc-unknown-lynxos$UNAME_RELEASE
+	;;
+    rs6000:LynxOS:2.*:*)
+	GUESS=rs6000-unknown-lynxos$UNAME_RELEASE
+	;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
+	GUESS=powerpc-unknown-lynxos$UNAME_RELEASE
+	;;
+    SM[BE]S:UNIX_SV:*:*)
+	GUESS=mips-dde-sysv$UNAME_RELEASE
+	;;
+    RM*:ReliantUNIX-*:*:*)
+	GUESS=mips-sni-sysv4
+	;;
+    RM*:SINIX-*:*:*)
+	GUESS=mips-sni-sysv4
+	;;
+    *:SINIX-*:*:*)
+	if uname -p 2>/dev/null >/dev/null ; then
+		UNAME_MACHINE=`(uname -p) 2>/dev/null`
+		GUESS=$UNAME_MACHINE-sni-sysv4
+	else
+		GUESS=ns32k-sni-sysv
+	fi
+	;;
+    PENTIUM:*:4.0*:*)	# Unisys 'ClearPath HMP IX 4000' SVR4/MP effort
+			# says <Richard.M.Bartel@ccMail.Census.GOV>
+	GUESS=i586-unisys-sysv4
+	;;
+    *:UNIX_System_V:4*:FTX*)
+	# From Gerald Hewes <hewes@openmarket.com>.
+	# How about differentiating between stratus architectures? -djm
+	GUESS=hppa1.1-stratus-sysv4
+	;;
+    *:*:*:FTX*)
+	# From seanf@swdc.stratus.com.
+	GUESS=i860-stratus-sysv4
+	;;
+    i*86:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	GUESS=$UNAME_MACHINE-stratus-vos
+	;;
+    *:VOS:*:*)
+	# From Paul.Green@stratus.com.
+	GUESS=hppa1.1-stratus-vos
+	;;
+    mc68*:A/UX:*:*)
+	GUESS=m68k-apple-aux$UNAME_RELEASE
+	;;
+    news*:NEWS-OS:6*:*)
+	GUESS=mips-sony-newsos6
+	;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+	if test -d /usr/nec; then
+		GUESS=mips-nec-sysv$UNAME_RELEASE
+	else
+		GUESS=mips-unknown-sysv$UNAME_RELEASE
+	fi
+	;;
+    BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
+	GUESS=powerpc-be-beos
+	;;
+    BeMac:BeOS:*:*)	# BeOS running on Mac or Mac clone, PPC only.
+	GUESS=powerpc-apple-beos
+	;;
+    BePC:BeOS:*:*)	# BeOS running on Intel PC compatible.
+	GUESS=i586-pc-beos
+	;;
+    BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
+	GUESS=i586-pc-haiku
+	;;
+    ppc:Haiku:*:*)	# Haiku running on Apple PowerPC
+	GUESS=powerpc-apple-haiku
+	;;
+    *:Haiku:*:*)	# Haiku modern gcc (not bound by BeOS compat)
+	GUESS=$UNAME_MACHINE-unknown-haiku
+	;;
+    SX-4:SUPER-UX:*:*)
+	GUESS=sx4-nec-superux$UNAME_RELEASE
+	;;
+    SX-5:SUPER-UX:*:*)
+	GUESS=sx5-nec-superux$UNAME_RELEASE
+	;;
+    SX-6:SUPER-UX:*:*)
+	GUESS=sx6-nec-superux$UNAME_RELEASE
+	;;
+    SX-7:SUPER-UX:*:*)
+	GUESS=sx7-nec-superux$UNAME_RELEASE
+	;;
+    SX-8:SUPER-UX:*:*)
+	GUESS=sx8-nec-superux$UNAME_RELEASE
+	;;
+    SX-8R:SUPER-UX:*:*)
+	GUESS=sx8r-nec-superux$UNAME_RELEASE
+	;;
+    SX-ACE:SUPER-UX:*:*)
+	GUESS=sxace-nec-superux$UNAME_RELEASE
+	;;
+    Power*:Rhapsody:*:*)
+	GUESS=powerpc-apple-rhapsody$UNAME_RELEASE
+	;;
+    *:Rhapsody:*:*)
+	GUESS=$UNAME_MACHINE-apple-rhapsody$UNAME_RELEASE
+	;;
+    arm64:Darwin:*:*)
+	GUESS=aarch64-apple-darwin$UNAME_RELEASE
+	;;
+    *:Darwin:*:*)
+	UNAME_PROCESSOR=`uname -p`
+	case $UNAME_PROCESSOR in
+	    unknown) UNAME_PROCESSOR=powerpc ;;
+	esac
+	if command -v xcode-select > /dev/null 2> /dev/null && \
+		! xcode-select --print-path > /dev/null 2> /dev/null ; then
+	    # Avoid executing cc if there is no toolchain installed as
+	    # cc will be a stub that puts up a graphical alert
+	    # prompting the user to install developer tools.
+	    CC_FOR_BUILD=no_compiler_found
+	else
+	    set_cc_for_build
+	fi
+	if test "$CC_FOR_BUILD" != no_compiler_found; then
+	    if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+		   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+		   grep IS_64BIT_ARCH >/dev/null
+	    then
+		case $UNAME_PROCESSOR in
+		    i386) UNAME_PROCESSOR=x86_64 ;;
+		    powerpc) UNAME_PROCESSOR=powerpc64 ;;
+		esac
+	    fi
+	    # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc
+	    if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \
+		   (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+		   grep IS_PPC >/dev/null
+	    then
+		UNAME_PROCESSOR=powerpc
+	    fi
+	elif test "$UNAME_PROCESSOR" = i386 ; then
+	    # uname -m returns i386 or x86_64
+	    UNAME_PROCESSOR=$UNAME_MACHINE
+	fi
+	GUESS=$UNAME_PROCESSOR-apple-darwin$UNAME_RELEASE
+	;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+	UNAME_PROCESSOR=`uname -p`
+	if test "$UNAME_PROCESSOR" = x86; then
+		UNAME_PROCESSOR=i386
+		UNAME_MACHINE=pc
+	fi
+	GUESS=$UNAME_PROCESSOR-$UNAME_MACHINE-nto-qnx$UNAME_RELEASE
+	;;
+    *:QNX:*:4*)
+	GUESS=i386-pc-qnx
+	;;
+    NEO-*:NONSTOP_KERNEL:*:*)
+	GUESS=neo-tandem-nsk$UNAME_RELEASE
+	;;
+    NSE-*:NONSTOP_KERNEL:*:*)
+	GUESS=nse-tandem-nsk$UNAME_RELEASE
+	;;
+    NSR-*:NONSTOP_KERNEL:*:*)
+	GUESS=nsr-tandem-nsk$UNAME_RELEASE
+	;;
+    NSV-*:NONSTOP_KERNEL:*:*)
+	GUESS=nsv-tandem-nsk$UNAME_RELEASE
+	;;
+    NSX-*:NONSTOP_KERNEL:*:*)
+	GUESS=nsx-tandem-nsk$UNAME_RELEASE
+	;;
+    *:NonStop-UX:*:*)
+	GUESS=mips-compaq-nonstopux
+	;;
+    BS2000:POSIX*:*:*)
+	GUESS=bs2000-siemens-sysv
+	;;
+    DS/*:UNIX_System_V:*:*)
+	GUESS=$UNAME_MACHINE-$UNAME_SYSTEM-$UNAME_RELEASE
+	;;
+    *:Plan9:*:*)
+	# "uname -m" is not consistent, so use $cputype instead. 386
+	# is converted to i386 for consistency with other x86
+	# operating systems.
+	if test "${cputype-}" = 386; then
+	    UNAME_MACHINE=i386
+	elif test "x${cputype-}" != x; then
+	    UNAME_MACHINE=$cputype
+	fi
+	GUESS=$UNAME_MACHINE-unknown-plan9
+	;;
+    *:TOPS-10:*:*)
+	GUESS=pdp10-unknown-tops10
+	;;
+    *:TENEX:*:*)
+	GUESS=pdp10-unknown-tenex
+	;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+	GUESS=pdp10-dec-tops20
+	;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+	GUESS=pdp10-xkl-tops20
+	;;
+    *:TOPS-20:*:*)
+	GUESS=pdp10-unknown-tops20
+	;;
+    *:ITS:*:*)
+	GUESS=pdp10-unknown-its
+	;;
+    SEI:*:*:SEIUX)
+	GUESS=mips-sei-seiux$UNAME_RELEASE
+	;;
+    *:DragonFly:*:*)
+	DRAGONFLY_REL=`echo "$UNAME_RELEASE" | sed -e 's/[-(].*//'`
+	GUESS=$UNAME_MACHINE-unknown-dragonfly$DRAGONFLY_REL
+	;;
+    *:*VMS:*:*)
+	UNAME_MACHINE=`(uname -p) 2>/dev/null`
+	case $UNAME_MACHINE in
+	    A*) GUESS=alpha-dec-vms ;;
+	    I*) GUESS=ia64-dec-vms ;;
+	    V*) GUESS=vax-dec-vms ;;
+	esac ;;
+    *:XENIX:*:SysV)
+	GUESS=i386-pc-xenix
+	;;
+    i*86:skyos:*:*)
+	SKYOS_REL=`echo "$UNAME_RELEASE" | sed -e 's/ .*$//'`
+	GUESS=$UNAME_MACHINE-pc-skyos$SKYOS_REL
+	;;
+    i*86:rdos:*:*)
+	GUESS=$UNAME_MACHINE-pc-rdos
+	;;
+    i*86:Fiwix:*:*)
+	GUESS=$UNAME_MACHINE-pc-fiwix
+	;;
+    *:AROS:*:*)
+	GUESS=$UNAME_MACHINE-unknown-aros
+	;;
+    x86_64:VMkernel:*:*)
+	GUESS=$UNAME_MACHINE-unknown-esx
+	;;
+    amd64:Isilon\ OneFS:*:*)
+	GUESS=x86_64-unknown-onefs
+	;;
+    *:Unleashed:*:*)
+	GUESS=$UNAME_MACHINE-unknown-unleashed$UNAME_RELEASE
+	;;
+    *:Ironclad:*:*)
+	GUESS=$UNAME_MACHINE-unknown-ironclad
+	;;
+esac
+
+# Do we have a guess based on uname results?
+if test "x$GUESS" != x; then
+    echo "$GUESS"
+    exit
+fi
+
+# No uname command or uname output not recognized.
+set_cc_for_build
+cat > "$dummy.c" <<EOF
+#ifdef _SEQUENT_
+#include <sys/types.h>
+#include <sys/utsname.h>
+#endif
+#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
+#if defined (vax) || defined (__vax) || defined (__vax__) || defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
+#include <signal.h>
+#if defined(_SIZE_T_) || defined(SIGLOST)
+#include <sys/utsname.h>
+#endif
+#endif
+#endif
+int
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+  "4"
+#else
+  ""
+#endif
+  ); exit (0);
+#endif
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+  struct utsname un;
+
+  uname(&un);
+  if (strncmp(un.version, "V2", 2) == 0) {
+    printf ("i386-sequent-ptx2\n"); exit (0);
+  }
+  if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+    printf ("i386-sequent-ptx1\n"); exit (0);
+  }
+  printf ("i386-sequent-ptx\n"); exit (0);
+#endif
+
+#if defined (vax)
+#if !defined (ultrix)
+#include <sys/param.h>
+#if defined (BSD)
+#if BSD == 43
+  printf ("vax-dec-bsd4.3\n"); exit (0);
+#else
+#if BSD == 199006
+  printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#else
+  printf ("vax-dec-bsd\n"); exit (0);
+#endif
+#endif
+#else
+  printf ("vax-dec-bsd\n"); exit (0);
+#endif
+#else
+#if defined(_SIZE_T_) || defined(SIGLOST)
+  struct utsname un;
+  uname (&un);
+  printf ("vax-dec-ultrix%s\n", un.release); exit (0);
+#else
+  printf ("vax-dec-ultrix\n"); exit (0);
+#endif
+#endif
+#endif
+#if defined(ultrix) || defined(_ultrix) || defined(__ultrix) || defined(__ultrix__)
+#if defined(mips) || defined(__mips) || defined(__mips__) || defined(MIPS) || defined(__MIPS__)
+#if defined(_SIZE_T_) || defined(SIGLOST)
+  struct utsname *un;
+  uname (&un);
+  printf ("mips-dec-ultrix%s\n", un.release); exit (0);
+#else
+  printf ("mips-dec-ultrix\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o "$dummy" "$dummy.c" 2>/dev/null && SYSTEM_NAME=`"$dummy"` &&
+	{ echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+test -d /usr/apollo && { echo "$ISP-apollo-$SYSTYPE"; exit; }
+
+echo "$0: unable to guess system type" >&2
+
+case $UNAME_MACHINE:$UNAME_SYSTEM in
+    mips:Linux | mips64:Linux)
+	# If we got here on MIPS GNU/Linux, output extra information.
+	cat >&2 <<EOF
+
+NOTE: MIPS GNU/Linux systems require a C compiler to fully recognize
+the system type. Please install a C compiler and try again.
+EOF
+	;;
+esac
+
+cat >&2 <<EOF
+
+This script (version $timestamp), has failed to recognize the
+operating system you are using. If your script is old, overwrite *all*
+copies of config.guess and config.sub with the latest versions from:
+
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
+and
+  https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
+EOF
+
+our_year=`echo $timestamp | sed 's,-.*,,'`
+thisyear=`date +%Y`
+# shellcheck disable=SC2003
+script_age=`expr "$thisyear" - "$our_year"`
+if test "$script_age" -lt 3 ; then
+   cat >&2 <<EOF
+
+If $0 has already been updated, send the following data and any
+information you think might be pertinent to config-patches@gnu.org to
+provide the necessary information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = "$UNAME_MACHINE"
+UNAME_RELEASE = "$UNAME_RELEASE"
+UNAME_SYSTEM  = "$UNAME_SYSTEM"
+UNAME_VERSION = "$UNAME_VERSION"
+EOF
+fi
+
+exit 1
+
+# Local variables:
+# eval: (add-hook 'before-save-hook 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/thirdparty/patches/config.sub b/thirdparty/patches/config.sub
new file mode 100644
index 0000000000..4aaae46f6f
--- /dev/null
+++ b/thirdparty/patches/config.sub
@@ -0,0 +1,2354 @@
+#! /bin/sh
+# Configuration validation subroutine script.
+#   Copyright 1992-2024 Free Software Foundation, Inc.
+
+# shellcheck disable=SC2006,SC2268,SC2162 # see below for rationale
+
+timestamp='2024-05-27'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <https://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
+
+
+# Please send patches to <config-patches@gnu.org>.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# You can get the latest version of this script from:
+# https://git.savannah.gnu.org/cgit/config.git/plain/config.sub
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support.  The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+# The "shellcheck disable" line above the timestamp inhibits complaints
+# about features and limitations of the classic Bourne shell that were
+# superseded or lifted in POSIX.  However, this script identifies a wide
+# variety of pre-POSIX systems that do not have POSIX shells at all, and
+# even some reasonably current systems (Solaris 10 as case-in-point) still
+# have a pre-POSIX /bin/sh.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
+
+Canonicalize a configuration name.
+
+Options:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.sub ($timestamp)
+
+Copyright 1992-2024 Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try '$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )	# Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+
+    *local*)
+       # First pass through any local machine types.
+       echo "$1"
+       exit ;;
+
+    * )
+       break ;;
+  esac
+done
+
+case $# in
+ 0) echo "$me: missing argument$help" >&2
+    exit 1;;
+ 1) ;;
+ *) echo "$me: too many arguments$help" >&2
+    exit 1;;
+esac
+
+# Split fields of configuration type
+saved_IFS=$IFS
+IFS="-" read field1 field2 field3 field4 <<EOF
+$1
+EOF
+IFS=$saved_IFS
+
+# Separate into logical components for further validation
+case $1 in
+	*-*-*-*-*)
+		echo "Invalid configuration '$1': more than four components" >&2
+		exit 1
+		;;
+	*-*-*-*)
+		basic_machine=$field1-$field2
+		basic_os=$field3-$field4
+		;;
+	*-*-*)
+		# Ambiguous whether COMPANY is present, or skipped and KERNEL-OS is two
+		# parts
+		maybe_os=$field2-$field3
+		case $maybe_os in
+			  cloudabi*-eabi* \
+			| kfreebsd*-gnu* \
+			| knetbsd*-gnu* \
+			| kopensolaris*-gnu* \
+			| linux-* \
+			| managarm-* \
+			| netbsd*-eabi* \
+			| netbsd*-gnu* \
+			| nto-qnx* \
+			| os2-emx* \
+			| rtmk-nova* \
+			| storm-chaos* \
+			| uclinux-gnu* \
+			| uclinux-uclibc* \
+			| windows-* )
+				basic_machine=$field1
+				basic_os=$maybe_os
+				;;
+			android-linux)
+				basic_machine=$field1-unknown
+				basic_os=linux-android
+				;;
+			*)
+				basic_machine=$field1-$field2
+				basic_os=$field3
+				;;
+		esac
+		;;
+	*-*)
+		case $field1-$field2 in
+			# Shorthands that happen to contain a single dash
+			convex-c[12] | convex-c3[248])
+				basic_machine=$field2-convex
+				basic_os=
+				;;
+			decstation-3100)
+				basic_machine=mips-dec
+				basic_os=
+				;;
+			*-*)
+				# Second component is usually, but not always the OS
+				case $field2 in
+					# Do not treat sunos as a manufacturer
+					sun*os*)
+						basic_machine=$field1
+						basic_os=$field2
+						;;
+					# Manufacturers
+					  3100* \
+					| 32* \
+					| 3300* \
+					| 3600* \
+					| 7300* \
+					| acorn \
+					| altos* \
+					| apollo \
+					| apple \
+					| atari \
+					| att* \
+					| axis \
+					| be \
+					| bull \
+					| cbm \
+					| ccur \
+					| cisco \
+					| commodore \
+					| convergent* \
+					| convex* \
+					| cray \
+					| crds \
+					| dec* \
+					| delta* \
+					| dg \
+					| digital \
+					| dolphin \
+					| encore* \
+					| gould \
+					| harris \
+					| highlevel \
+					| hitachi* \
+					| hp \
+					| ibm* \
+					| intergraph \
+					| isi* \
+					| knuth \
+					| masscomp \
+					| microblaze* \
+					| mips* \
+					| motorola* \
+					| ncr* \
+					| news \
+					| next \
+					| ns \
+					| oki \
+					| omron* \
+					| pc533* \
+					| rebel \
+					| rom68k \
+					| rombug \
+					| semi \
+					| sequent* \
+					| siemens \
+					| sgi* \
+					| siemens \
+					| sim \
+					| sni \
+					| sony* \
+					| stratus \
+					| sun \
+					| sun[234]* \
+					| tektronix \
+					| tti* \
+					| ultra \
+					| unicom* \
+					| wec \
+					| winbond \
+					| wrs)
+						basic_machine=$field1-$field2
+						basic_os=
+						;;
+					zephyr*)
+						basic_machine=$field1-unknown
+						basic_os=$field2
+						;;
+					*)
+						basic_machine=$field1
+						basic_os=$field2
+						;;
+				esac
+			;;
+		esac
+		;;
+	*)
+		# Convert single-component short-hands not valid as part of
+		# multi-component configurations.
+		case $field1 in
+			386bsd)
+				basic_machine=i386-pc
+				basic_os=bsd
+				;;
+			a29khif)
+				basic_machine=a29k-amd
+				basic_os=udi
+				;;
+			adobe68k)
+				basic_machine=m68010-adobe
+				basic_os=scout
+				;;
+			alliant)
+				basic_machine=fx80-alliant
+				basic_os=
+				;;
+			altos | altos3068)
+				basic_machine=m68k-altos
+				basic_os=
+				;;
+			am29k)
+				basic_machine=a29k-none
+				basic_os=bsd
+				;;
+			amdahl)
+				basic_machine=580-amdahl
+				basic_os=sysv
+				;;
+			amiga)
+				basic_machine=m68k-unknown
+				basic_os=
+				;;
+			amigaos | amigados)
+				basic_machine=m68k-unknown
+				basic_os=amigaos
+				;;
+			amigaunix | amix)
+				basic_machine=m68k-unknown
+				basic_os=sysv4
+				;;
+			apollo68)
+				basic_machine=m68k-apollo
+				basic_os=sysv
+				;;
+			apollo68bsd)
+				basic_machine=m68k-apollo
+				basic_os=bsd
+				;;
+			aros)
+				basic_machine=i386-pc
+				basic_os=aros
+				;;
+			aux)
+				basic_machine=m68k-apple
+				basic_os=aux
+				;;
+			balance)
+				basic_machine=ns32k-sequent
+				basic_os=dynix
+				;;
+			blackfin)
+				basic_machine=bfin-unknown
+				basic_os=linux
+				;;
+			cegcc)
+				basic_machine=arm-unknown
+				basic_os=cegcc
+				;;
+			cray)
+				basic_machine=j90-cray
+				basic_os=unicos
+				;;
+			crds | unos)
+				basic_machine=m68k-crds
+				basic_os=
+				;;
+			da30)
+				basic_machine=m68k-da30
+				basic_os=
+				;;
+			decstation | pmax | pmin | dec3100 | decstatn)
+				basic_machine=mips-dec
+				basic_os=
+				;;
+			delta88)
+				basic_machine=m88k-motorola
+				basic_os=sysv3
+				;;
+			dicos)
+				basic_machine=i686-pc
+				basic_os=dicos
+				;;
+			djgpp)
+				basic_machine=i586-pc
+				basic_os=msdosdjgpp
+				;;
+			ebmon29k)
+				basic_machine=a29k-amd
+				basic_os=ebmon
+				;;
+			es1800 | OSE68k | ose68k | ose | OSE)
+				basic_machine=m68k-ericsson
+				basic_os=ose
+				;;
+			gmicro)
+				basic_machine=tron-gmicro
+				basic_os=sysv
+				;;
+			go32)
+				basic_machine=i386-pc
+				basic_os=go32
+				;;
+			h8300hms)
+				basic_machine=h8300-hitachi
+				basic_os=hms
+				;;
+			h8300xray)
+				basic_machine=h8300-hitachi
+				basic_os=xray
+				;;
+			h8500hms)
+				basic_machine=h8500-hitachi
+				basic_os=hms
+				;;
+			harris)
+				basic_machine=m88k-harris
+				basic_os=sysv3
+				;;
+			hp300 | hp300hpux)
+				basic_machine=m68k-hp
+				basic_os=hpux
+				;;
+			hp300bsd)
+				basic_machine=m68k-hp
+				basic_os=bsd
+				;;
+			hppaosf)
+				basic_machine=hppa1.1-hp
+				basic_os=osf
+				;;
+			hppro)
+				basic_machine=hppa1.1-hp
+				basic_os=proelf
+				;;
+			i386mach)
+				basic_machine=i386-mach
+				basic_os=mach
+				;;
+			isi68 | isi)
+				basic_machine=m68k-isi
+				basic_os=sysv
+				;;
+			m68knommu)
+				basic_machine=m68k-unknown
+				basic_os=linux
+				;;
+			magnum | m3230)
+				basic_machine=mips-mips
+				basic_os=sysv
+				;;
+			merlin)
+				basic_machine=ns32k-utek
+				basic_os=sysv
+				;;
+			mingw64)
+				basic_machine=x86_64-pc
+				basic_os=mingw64
+				;;
+			mingw32)
+				basic_machine=i686-pc
+				basic_os=mingw32
+				;;
+			mingw32ce)
+				basic_machine=arm-unknown
+				basic_os=mingw32ce
+				;;
+			monitor)
+				basic_machine=m68k-rom68k
+				basic_os=coff
+				;;
+			morphos)
+				basic_machine=powerpc-unknown
+				basic_os=morphos
+				;;
+			moxiebox)
+				basic_machine=moxie-unknown
+				basic_os=moxiebox
+				;;
+			msdos)
+				basic_machine=i386-pc
+				basic_os=msdos
+				;;
+			msys)
+				basic_machine=i686-pc
+				basic_os=msys
+				;;
+			mvs)
+				basic_machine=i370-ibm
+				basic_os=mvs
+				;;
+			nacl)
+				basic_machine=le32-unknown
+				basic_os=nacl
+				;;
+			ncr3000)
+				basic_machine=i486-ncr
+				basic_os=sysv4
+				;;
+			netbsd386)
+				basic_machine=i386-pc
+				basic_os=netbsd
+				;;
+			netwinder)
+				basic_machine=armv4l-rebel
+				basic_os=linux
+				;;
+			news | news700 | news800 | news900)
+				basic_machine=m68k-sony
+				basic_os=newsos
+				;;
+			news1000)
+				basic_machine=m68030-sony
+				basic_os=newsos
+				;;
+			necv70)
+				basic_machine=v70-nec
+				basic_os=sysv
+				;;
+			nh3000)
+				basic_machine=m68k-harris
+				basic_os=cxux
+				;;
+			nh[45]000)
+				basic_machine=m88k-harris
+				basic_os=cxux
+				;;
+			nindy960)
+				basic_machine=i960-intel
+				basic_os=nindy
+				;;
+			mon960)
+				basic_machine=i960-intel
+				basic_os=mon960
+				;;
+			nonstopux)
+				basic_machine=mips-compaq
+				basic_os=nonstopux
+				;;
+			os400)
+				basic_machine=powerpc-ibm
+				basic_os=os400
+				;;
+			OSE68000 | ose68000)
+				basic_machine=m68000-ericsson
+				basic_os=ose
+				;;
+			os68k)
+				basic_machine=m68k-none
+				basic_os=os68k
+				;;
+			paragon)
+				basic_machine=i860-intel
+				basic_os=osf
+				;;
+			parisc)
+				basic_machine=hppa-unknown
+				basic_os=linux
+				;;
+			psp)
+				basic_machine=mipsallegrexel-sony
+				basic_os=psp
+				;;
+			pw32)
+				basic_machine=i586-unknown
+				basic_os=pw32
+				;;
+			rdos | rdos64)
+				basic_machine=x86_64-pc
+				basic_os=rdos
+				;;
+			rdos32)
+				basic_machine=i386-pc
+				basic_os=rdos
+				;;
+			rom68k)
+				basic_machine=m68k-rom68k
+				basic_os=coff
+				;;
+			sa29200)
+				basic_machine=a29k-amd
+				basic_os=udi
+				;;
+			sei)
+				basic_machine=mips-sei
+				basic_os=seiux
+				;;
+			sequent)
+				basic_machine=i386-sequent
+				basic_os=
+				;;
+			sps7)
+				basic_machine=m68k-bull
+				basic_os=sysv2
+				;;
+			st2000)
+				basic_machine=m68k-tandem
+				basic_os=
+				;;
+			stratus)
+				basic_machine=i860-stratus
+				basic_os=sysv4
+				;;
+			sun2)
+				basic_machine=m68000-sun
+				basic_os=
+				;;
+			sun2os3)
+				basic_machine=m68000-sun
+				basic_os=sunos3
+				;;
+			sun2os4)
+				basic_machine=m68000-sun
+				basic_os=sunos4
+				;;
+			sun3)
+				basic_machine=m68k-sun
+				basic_os=
+				;;
+			sun3os3)
+				basic_machine=m68k-sun
+				basic_os=sunos3
+				;;
+			sun3os4)
+				basic_machine=m68k-sun
+				basic_os=sunos4
+				;;
+			sun4)
+				basic_machine=sparc-sun
+				basic_os=
+				;;
+			sun4os3)
+				basic_machine=sparc-sun
+				basic_os=sunos3
+				;;
+			sun4os4)
+				basic_machine=sparc-sun
+				basic_os=sunos4
+				;;
+			sun4sol2)
+				basic_machine=sparc-sun
+				basic_os=solaris2
+				;;
+			sun386 | sun386i | roadrunner)
+				basic_machine=i386-sun
+				basic_os=
+				;;
+			sv1)
+				basic_machine=sv1-cray
+				basic_os=unicos
+				;;
+			symmetry)
+				basic_machine=i386-sequent
+				basic_os=dynix
+				;;
+			t3e)
+				basic_machine=alphaev5-cray
+				basic_os=unicos
+				;;
+			t90)
+				basic_machine=t90-cray
+				basic_os=unicos
+				;;
+			toad1)
+				basic_machine=pdp10-xkl
+				basic_os=tops20
+				;;
+			tpf)
+				basic_machine=s390x-ibm
+				basic_os=tpf
+				;;
+			udi29k)
+				basic_machine=a29k-amd
+				basic_os=udi
+				;;
+			ultra3)
+				basic_machine=a29k-nyu
+				basic_os=sym1
+				;;
+			v810 | necv810)
+				basic_machine=v810-nec
+				basic_os=none
+				;;
+			vaxv)
+				basic_machine=vax-dec
+				basic_os=sysv
+				;;
+			vms)
+				basic_machine=vax-dec
+				basic_os=vms
+				;;
+			vsta)
+				basic_machine=i386-pc
+				basic_os=vsta
+				;;
+			vxworks960)
+				basic_machine=i960-wrs
+				basic_os=vxworks
+				;;
+			vxworks68)
+				basic_machine=m68k-wrs
+				basic_os=vxworks
+				;;
+			vxworks29k)
+				basic_machine=a29k-wrs
+				basic_os=vxworks
+				;;
+			xbox)
+				basic_machine=i686-pc
+				basic_os=mingw32
+				;;
+			ymp)
+				basic_machine=ymp-cray
+				basic_os=unicos
+				;;
+			*)
+				basic_machine=$1
+				basic_os=
+				;;
+		esac
+		;;
+esac
+
+# Decode 1-component or ad-hoc basic machines
+case $basic_machine in
+	# Here we handle the default manufacturer of certain CPU types.  It is in
+	# some cases the only manufacturer, in others, it is the most popular.
+	w89k)
+		cpu=hppa1.1
+		vendor=winbond
+		;;
+	op50n)
+		cpu=hppa1.1
+		vendor=oki
+		;;
+	op60c)
+		cpu=hppa1.1
+		vendor=oki
+		;;
+	ibm*)
+		cpu=i370
+		vendor=ibm
+		;;
+	orion105)
+		cpu=clipper
+		vendor=highlevel
+		;;
+	mac | mpw | mac-mpw)
+		cpu=m68k
+		vendor=apple
+		;;
+	pmac | pmac-mpw)
+		cpu=powerpc
+		vendor=apple
+		;;
+
+	# Recognize the various machine names and aliases which stand
+	# for a CPU type and a company and sometimes even an OS.
+	3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+		cpu=m68000
+		vendor=att
+		;;
+	3b*)
+		cpu=we32k
+		vendor=att
+		;;
+	bluegene*)
+		cpu=powerpc
+		vendor=ibm
+		basic_os=cnk
+		;;
+	decsystem10* | dec10*)
+		cpu=pdp10
+		vendor=dec
+		basic_os=tops10
+		;;
+	decsystem20* | dec20*)
+		cpu=pdp10
+		vendor=dec
+		basic_os=tops20
+		;;
+	delta | 3300 | delta-motorola | 3300-motorola | motorola-delta | motorola-3300)
+		cpu=m68k
+		vendor=motorola
+		;;
+	# This used to be dpx2*, but that gets the RS6000-based
+	# DPX/20 and the x86-based DPX/2-100 wrong.  See
+	# https://oldskool.silicium.org/stations/bull_dpx20.htm
+	# https://www.feb-patrimoine.com/english/bull_dpx2.htm
+	# https://www.feb-patrimoine.com/english/unix_and_bull.htm
+	dpx2 | dpx2[23]00 | dpx2[23]xx)
+		cpu=m68k
+		vendor=bull
+		;;
+	dpx2100 | dpx21xx)
+		cpu=i386
+		vendor=bull
+		;;
+	dpx20)
+		cpu=rs6000
+		vendor=bull
+		;;
+	encore | umax | mmax)
+		cpu=ns32k
+		vendor=encore
+		;;
+	elxsi)
+		cpu=elxsi
+		vendor=elxsi
+		basic_os=${basic_os:-bsd}
+		;;
+	fx2800)
+		cpu=i860
+		vendor=alliant
+		;;
+	genix)
+		cpu=ns32k
+		vendor=ns
+		;;
+	h3050r* | hiux*)
+		cpu=hppa1.1
+		vendor=hitachi
+		basic_os=hiuxwe2
+		;;
+	hp3k9[0-9][0-9] | hp9[0-9][0-9])
+		cpu=hppa1.0
+		vendor=hp
+		;;
+	hp9k2[0-9][0-9] | hp9k31[0-9])
+		cpu=m68000
+		vendor=hp
+		;;
+	hp9k3[2-9][0-9])
+		cpu=m68k
+		vendor=hp
+		;;
+	hp9k6[0-9][0-9] | hp6[0-9][0-9])
+		cpu=hppa1.0
+		vendor=hp
+		;;
+	hp9k7[0-79][0-9] | hp7[0-79][0-9])
+		cpu=hppa1.1
+		vendor=hp
+		;;
+	hp9k78[0-9] | hp78[0-9])
+		# FIXME: really hppa2.0-hp
+		cpu=hppa1.1
+		vendor=hp
+		;;
+	hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+		# FIXME: really hppa2.0-hp
+		cpu=hppa1.1
+		vendor=hp
+		;;
+	hp9k8[0-9][13679] | hp8[0-9][13679])
+		cpu=hppa1.1
+		vendor=hp
+		;;
+	hp9k8[0-9][0-9] | hp8[0-9][0-9])
+		cpu=hppa1.0
+		vendor=hp
+		;;
+	i*86v32)
+		cpu=`echo "$1" | sed -e 's/86.*/86/'`
+		vendor=pc
+		basic_os=sysv32
+		;;
+	i*86v4*)
+		cpu=`echo "$1" | sed -e 's/86.*/86/'`
+		vendor=pc
+		basic_os=sysv4
+		;;
+	i*86v)
+		cpu=`echo "$1" | sed -e 's/86.*/86/'`
+		vendor=pc
+		basic_os=sysv
+		;;
+	i*86sol2)
+		cpu=`echo "$1" | sed -e 's/86.*/86/'`
+		vendor=pc
+		basic_os=solaris2
+		;;
+	j90 | j90-cray)
+		cpu=j90
+		vendor=cray
+		basic_os=${basic_os:-unicos}
+		;;
+	iris | iris4d)
+		cpu=mips
+		vendor=sgi
+		case $basic_os in
+		    irix*)
+			;;
+		    *)
+			basic_os=irix4
+			;;
+		esac
+		;;
+	miniframe)
+		cpu=m68000
+		vendor=convergent
+		;;
+	*mint | mint[0-9]* | *MiNT | *MiNT[0-9]*)
+		cpu=m68k
+		vendor=atari
+		basic_os=mint
+		;;
+	news-3600 | risc-news)
+		cpu=mips
+		vendor=sony
+		basic_os=newsos
+		;;
+	next | m*-next)
+		cpu=m68k
+		vendor=next
+		;;
+	np1)
+		cpu=np1
+		vendor=gould
+		;;
+	op50n-* | op60c-*)
+		cpu=hppa1.1
+		vendor=oki
+		basic_os=proelf
+		;;
+	pa-hitachi)
+		cpu=hppa1.1
+		vendor=hitachi
+		basic_os=hiuxwe2
+		;;
+	pbd)
+		cpu=sparc
+		vendor=tti
+		;;
+	pbb)
+		cpu=m68k
+		vendor=tti
+		;;
+	pc532)
+		cpu=ns32k
+		vendor=pc532
+		;;
+	pn)
+		cpu=pn
+		vendor=gould
+		;;
+	power)
+		cpu=power
+		vendor=ibm
+		;;
+	ps2)
+		cpu=i386
+		vendor=ibm
+		;;
+	rm[46]00)
+		cpu=mips
+		vendor=siemens
+		;;
+	rtpc | rtpc-*)
+		cpu=romp
+		vendor=ibm
+		;;
+	sde)
+		cpu=mipsisa32
+		vendor=sde
+		basic_os=${basic_os:-elf}
+		;;
+	simso-wrs)
+		cpu=sparclite
+		vendor=wrs
+		basic_os=vxworks
+		;;
+	tower | tower-32)
+		cpu=m68k
+		vendor=ncr
+		;;
+	vpp*|vx|vx-*)
+		cpu=f301
+		vendor=fujitsu
+		;;
+	w65)
+		cpu=w65
+		vendor=wdc
+		;;
+	w89k-*)
+		cpu=hppa1.1
+		vendor=winbond
+		basic_os=proelf
+		;;
+	none)
+		cpu=none
+		vendor=none
+		;;
+	leon|leon[3-9])
+		cpu=sparc
+		vendor=$basic_machine
+		;;
+	leon-*|leon[3-9]-*)
+		cpu=sparc
+		vendor=`echo "$basic_machine" | sed 's/-.*//'`
+		;;
+
+	*-*)
+		saved_IFS=$IFS
+		IFS="-" read cpu vendor <<EOF
+$basic_machine
+EOF
+		IFS=$saved_IFS
+		;;
+	# We use 'pc' rather than 'unknown'
+	# because (1) that's what they normally are, and
+	# (2) the word "unknown" tends to confuse beginning users.
+	i*86 | x86_64)
+		cpu=$basic_machine
+		vendor=pc
+		;;
+	# These rules are duplicated from below for sake of the special case above;
+	# i.e. things that normalized to x86 arches should also default to "pc"
+	pc98)
+		cpu=i386
+		vendor=pc
+		;;
+	x64 | amd64)
+		cpu=x86_64
+		vendor=pc
+		;;
+	# Recognize the basic CPU types without company name.
+	*)
+		cpu=$basic_machine
+		vendor=unknown
+		;;
+esac
+
+unset -v basic_machine
+
+# Decode basic machines in the full and proper CPU-Company form.
+case $cpu-$vendor in
+	# Here we handle the default manufacturer of certain CPU types in canonical form.
+	# It is in some cases the only manufacturer, in others, it is the most popular.
+	c[12]-convex | c[12]-unknown | c3[248]-convex | c3[248]-unknown)
+		vendor=convex
+		basic_os=${basic_os:-bsd}
+		;;
+	craynv-unknown)
+		vendor=cray
+		basic_os=${basic_os:-unicosmp}
+		;;
+	c90-unknown | c90-cray)
+		vendor=cray
+		basic_os=${basic_os:-unicos}
+		;;
+	fx80-unknown)
+		vendor=alliant
+		;;
+	romp-unknown)
+		vendor=ibm
+		;;
+	mmix-unknown)
+		vendor=knuth
+		;;
+	microblaze-unknown | microblazeel-unknown)
+		vendor=xilinx
+		;;
+	rs6000-unknown)
+		vendor=ibm
+		;;
+	vax-unknown)
+		vendor=dec
+		;;
+	pdp11-unknown)
+		vendor=dec
+		;;
+	we32k-unknown)
+		vendor=att
+		;;
+	cydra-unknown)
+		vendor=cydrome
+		;;
+	i370-ibm*)
+		vendor=ibm
+		;;
+	orion-unknown)
+		vendor=highlevel
+		;;
+	xps-unknown | xps100-unknown)
+		cpu=xps100
+		vendor=honeywell
+		;;
+
+	# Here we normalize CPU types with a missing or matching vendor
+	armh-unknown | armh-alt)
+		cpu=armv7l
+		vendor=alt
+		basic_os=${basic_os:-linux-gnueabihf}
+		;;
+
+	# Normalized CPU+vendor pairs that imply an OS, if not otherwise specified
+	m68k-isi)
+		basic_os=${basic_os:-sysv}
+		;;
+	m68k-sony)
+		basic_os=${basic_os:-newsos}
+		;;
+	m68k-tektronix)
+		basic_os=${basic_os:-bsd}
+		;;
+	m88k-harris)
+		basic_os=${basic_os:-sysv3}
+		;;
+	i386-bull | m68k-bull)
+		basic_os=${basic_os:-sysv3}
+		;;
+	rs6000-bull)
+		basic_os=${basic_os:-bosx}
+		;;
+	mips-sni)
+		basic_os=${basic_os:-sysv4}
+		;;
+
+	# Here we normalize CPU types irrespective of the vendor
+	amd64-*)
+		cpu=x86_64
+		;;
+	blackfin-*)
+		cpu=bfin
+		basic_os=${basic_os:-linux}
+		;;
+	c54x-*)
+		cpu=tic54x
+		;;
+	c55x-*)
+		cpu=tic55x
+		;;
+	c6x-*)
+		cpu=tic6x
+		;;
+	e500v[12]-*)
+		cpu=powerpc
+		basic_os=${basic_os}"spe"
+		;;
+	mips3*-*)
+		cpu=mips64
+		;;
+	ms1-*)
+		cpu=mt
+		;;
+	m68knommu-*)
+		cpu=m68k
+		basic_os=${basic_os:-linux}
+		;;
+	m9s12z-* | m68hcs12z-* | hcs12z-* | s12z-*)
+		cpu=s12z
+		;;
+	openrisc-*)
+		cpu=or32
+		;;
+	parisc-*)
+		cpu=hppa
+		basic_os=${basic_os:-linux}
+		;;
+	pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
+		cpu=i586
+		;;
+	pentiumpro-* | p6-* | 6x86-* | athlon-* | athlon_*-*)
+		cpu=i686
+		;;
+	pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
+		cpu=i686
+		;;
+	pentium4-*)
+		cpu=i786
+		;;
+	ppc-* | ppcbe-*)
+		cpu=powerpc
+		;;
+	ppcle-* | powerpclittle-*)
+		cpu=powerpcle
+		;;
+	ppc64-*)
+		cpu=powerpc64
+		;;
+	ppc64le-* | powerpc64little-*)
+		cpu=powerpc64le
+		;;
+	sb1-*)
+		cpu=mipsisa64sb1
+		;;
+	sb1el-*)
+		cpu=mipsisa64sb1el
+		;;
+	sh5e[lb]-*)
+		cpu=`echo "$cpu" | sed 's/^\(sh.\)e\(.\)$/\1\2e/'`
+		;;
+	spur-*)
+		cpu=spur
+		;;
+	strongarm-* | thumb-*)
+		cpu=arm
+		;;
+	tx39-*)
+		cpu=mipstx39
+		;;
+	tx39el-*)
+		cpu=mipstx39el
+		;;
+	xscale-* | xscalee[bl]-*)
+		cpu=`echo "$cpu" | sed 's/^xscale/arm/'`
+		;;
+	arm64-* | aarch64le-*)
+		cpu=aarch64
+		;;
+
+	# Recognize the canonical CPU Types that limit and/or modify the
+	# company names they are paired with.
+	cr16-*)
+		basic_os=${basic_os:-elf}
+		;;
+	crisv32-* | etraxfs*-*)
+		cpu=crisv32
+		vendor=axis
+		;;
+	cris-* | etrax*-*)
+		cpu=cris
+		vendor=axis
+		;;
+	crx-*)
+		basic_os=${basic_os:-elf}
+		;;
+	neo-tandem)
+		cpu=neo
+		vendor=tandem
+		;;
+	nse-tandem)
+		cpu=nse
+		vendor=tandem
+		;;
+	nsr-tandem)
+		cpu=nsr
+		vendor=tandem
+		;;
+	nsv-tandem)
+		cpu=nsv
+		vendor=tandem
+		;;
+	nsx-tandem)
+		cpu=nsx
+		vendor=tandem
+		;;
+	mipsallegrexel-sony)
+		cpu=mipsallegrexel
+		vendor=sony
+		;;
+	tile*-*)
+		basic_os=${basic_os:-linux-gnu}
+		;;
+
+	*)
+		# Recognize the canonical CPU types that are allowed with any
+		# company name.
+		case $cpu in
+			  1750a \
+			| 580 \
+			| [cjt]90 \
+			| a29k \
+			| aarch64 \
+			| aarch64_be \
+			| aarch64c \
+			| abacus \
+			| alpha \
+			| alpha64 \
+			| alpha64ev56 \
+			| alpha64ev6[78] \
+			| alpha64ev[4-8] \
+			| alpha64pca5[67] \
+			| alphaev56 \
+			| alphaev6[78] \
+			| alphaev[4-8] \
+			| alphapca5[67] \
+			| am33_2.0 \
+			| amdgcn \
+			| arc \
+			| arc32 \
+			| arc64 \
+			| arceb \
+			| arm \
+			| arm64e \
+			| arm64ec \
+			| arm[lb]e \
+			| arme[lb] \
+			| armv* \
+			| asmjs \
+			| avr \
+			| avr32 \
+			| ba \
+			| be32 \
+			| be64 \
+			| bfin \
+			| bpf \
+			| bs2000 \
+			| c30 \
+			| c4x \
+			| c8051 \
+			| c[123]* \
+			| clipper \
+			| craynv \
+			| csky \
+			| cydra \
+			| d10v \
+			| d30v \
+			| dlx \
+			| dsp16xx \
+			| e2k \
+			| elxsi \
+			| epiphany \
+			| f30[01] \
+			| f700 \
+			| fido \
+			| fr30 \
+			| frv \
+			| ft32 \
+			| fx80 \
+			| h8300 \
+			| h8500 \
+			| hexagon \
+			| hppa \
+			| hppa1.[01] \
+			| hppa2.0 \
+			| hppa2.0[nw] \
+			| hppa64 \
+			| i*86 \
+			| i370 \
+			| i860 \
+			| i960 \
+			| ia16 \
+			| ia64 \
+			| ip2k \
+			| iq2000 \
+			| javascript \
+			| k1om \
+			| kvx \
+			| le32 \
+			| le64 \
+			| lm32 \
+			| loongarch32 \
+			| loongarch64 \
+			| m32c \
+			| m32r \
+			| m32rle \
+			| m5200 \
+			| m68000 \
+			| m680[012346]0 \
+			| m6811 \
+			| m6812 \
+			| m68360 \
+			| m683?2 \
+			| m68hc11 \
+			| m68hc12 \
+			| m68hcs12x \
+			| m68k \
+			| m88110 \
+			| m88k \
+			| maxq \
+			| mb \
+			| mcore \
+			| mep \
+			| metag \
+			| microblaze \
+			| microblazeel \
+			| mips* \
+			| mmix \
+			| mn10200 \
+			| mn10300 \
+			| moxie \
+			| msp430 \
+			| mt \
+			| nanomips* \
+			| nds32 \
+			| nds32be \
+			| nds32le \
+			| nfp \
+			| nios \
+			| nios2 \
+			| nios2eb \
+			| nios2el \
+			| none \
+			| np1 \
+			| ns16k \
+			| ns32k \
+			| nvptx \
+			| open8 \
+			| or1k* \
+			| or32 \
+			| orion \
+			| pdp10 \
+			| pdp11 \
+			| picochip \
+			| pj \
+			| pjl \
+			| pn \
+			| power \
+			| powerpc \
+			| powerpc64 \
+			| powerpc64le \
+			| powerpcle \
+			| powerpcspe \
+			| pru \
+			| pyramid \
+			| riscv \
+			| riscv32 \
+			| riscv32be \
+			| riscv64 \
+			| riscv64be \
+			| rl78 \
+			| romp \
+			| rs6000 \
+			| rx \
+			| s390 \
+			| s390x \
+			| score \
+			| sh \
+			| sh64 \
+			| sh64le \
+			| sh[12345][lb]e \
+			| sh[1234] \
+			| sh[1234]e[lb] \
+			| sh[23]e \
+			| sh[23]ele \
+			| sh[24]a \
+			| sh[24]ae[lb] \
+			| sh[lb]e \
+			| she[lb] \
+			| shl \
+			| sparc \
+			| sparc64 \
+			| sparc64b \
+			| sparc64v \
+			| sparc86x \
+			| sparclet \
+			| sparclite \
+			| sparcv8 \
+			| sparcv9 \
+			| sparcv9b \
+			| sparcv9v \
+			| spu \
+			| sv1 \
+			| sx* \
+			| tahoe \
+			| thumbv7* \
+			| tic30 \
+			| tic4x \
+			| tic54x \
+			| tic55x \
+			| tic6x \
+			| tic80 \
+			| tron \
+			| ubicom32 \
+			| v70 \
+			| v810 \
+			| v850 \
+			| v850e \
+			| v850e1 \
+			| v850e2 \
+			| v850e2v3 \
+			| v850es \
+			| vax \
+			| vc4 \
+			| visium \
+			| w65 \
+			| wasm32 \
+			| wasm64 \
+			| we32k \
+			| x86 \
+			| x86_64 \
+			| xc16x \
+			| xgate \
+			| xps100 \
+			| xstormy16 \
+			| xtensa* \
+			| ymp \
+			| z80 \
+			| z8k)
+				;;
+
+			*)
+				echo "Invalid configuration '$1': machine '$cpu-$vendor' not recognized" 1>&2
+				exit 1
+				;;
+		esac
+		;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $vendor in
+	digital*)
+		vendor=dec
+		;;
+	commodore*)
+		vendor=cbm
+		;;
+	*)
+		;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if test x"$basic_os" != x
+then
+
+# First recognize some ad-hoc cases, or perhaps split kernel-os, or else just
+# set os.
+obj=
+case $basic_os in
+	gnu/linux*)
+		kernel=linux
+		os=`echo "$basic_os" | sed -e 's|gnu/linux|gnu|'`
+		;;
+	os2-emx)
+		kernel=os2
+		os=`echo "$basic_os" | sed -e 's|os2-emx|emx|'`
+		;;
+	nto-qnx*)
+		kernel=nto
+		os=`echo "$basic_os" | sed -e 's|nto-qnx|qnx|'`
+		;;
+	*-*)
+		saved_IFS=$IFS
+		IFS="-" read kernel os <<EOF
+$basic_os
+EOF
+		IFS=$saved_IFS
+		;;
+	# Default OS when just kernel was specified
+	nto*)
+		kernel=nto
+		os=`echo "$basic_os" | sed -e 's|nto|qnx|'`
+		;;
+	linux*)
+		kernel=linux
+		os=`echo "$basic_os" | sed -e 's|linux|gnu|'`
+		;;
+	managarm*)
+		kernel=managarm
+		os=`echo "$basic_os" | sed -e 's|managarm|mlibc|'`
+		;;
+	*)
+		kernel=
+		os=$basic_os
+		;;
+esac
+
+# Now, normalize the OS (knowing we just have one component, it's not a kernel,
+# etc.)
+case $os in
+	# First match some system type aliases that might get confused
+	# with valid system types.
+	# solaris* is a basic system type, with this one exception.
+	auroraux)
+		os=auroraux
+		;;
+	bluegene*)
+		os=cnk
+		;;
+	solaris1 | solaris1.*)
+		os=`echo "$os" | sed -e 's|solaris1|sunos4|'`
+		;;
+	solaris)
+		os=solaris2
+		;;
+	unixware*)
+		os=sysv4.2uw
+		;;
+	# The marketing names for NeXT's operating systems were
+	# NeXTSTEP, NeXTSTEP 2, OpenSTEP 3, OpenSTEP 4.  'openstep' is
+	# mapped to 'openstep3', but 'openstep1' and 'openstep2' are
+	# mapped to 'nextstep' and 'nextstep2', consistent with the
+	# treatment of SunOS/Solaris.
+	ns | ns1 | nextstep | nextstep1 | openstep1)
+		os=nextstep
+		;;
+	ns2 | nextstep2 | openstep2)
+		os=nextstep2
+		;;
+	ns3 | nextstep3 | openstep | openstep3)
+		os=openstep3
+		;;
+	ns4 | nextstep4 | openstep4)
+		os=openstep4
+		;;
+	# es1800 is here to avoid being matched by es* (a different OS)
+	es1800*)
+		os=ose
+		;;
+	# Some version numbers need modification
+	chorusos*)
+		os=chorusos
+		;;
+	isc)
+		os=isc2.2
+		;;
+	sco6)
+		os=sco5v6
+		;;
+	sco5)
+		os=sco3.2v5
+		;;
+	sco4)
+		os=sco3.2v4
+		;;
+	sco3.2.[4-9]*)
+		os=`echo "$os" | sed -e 's/sco3.2./sco3.2v/'`
+		;;
+	sco*v* | scout)
+		# Don't match below
+		;;
+	sco*)
+		os=sco3.2v2
+		;;
+	psos*)
+		os=psos
+		;;
+	qnx*)
+		os=qnx
+		;;
+	hiux*)
+		os=hiuxwe2
+		;;
+	lynx*178)
+		os=lynxos178
+		;;
+	lynx*5)
+		os=lynxos5
+		;;
+	lynxos*)
+		# don't get caught up in next wildcard
+		;;
+	lynx*)
+		os=lynxos
+		;;
+	mac[0-9]*)
+		os=`echo "$os" | sed -e 's|mac|macos|'`
+		;;
+	opened*)
+		os=openedition
+		;;
+	os400*)
+		os=os400
+		;;
+	sunos5*)
+		os=`echo "$os" | sed -e 's|sunos5|solaris2|'`
+		;;
+	sunos6*)
+		os=`echo "$os" | sed -e 's|sunos6|solaris3|'`
+		;;
+	wince*)
+		os=wince
+		;;
+	utek*)
+		os=bsd
+		vendor=`echo "$vendor" | sed -e 's|^unknown$|tektronix|'`
+		;;
+	dynix*)
+		os=bsd
+		;;
+	acis*)
+		os=aos
+		;;
+	atheos*)
+		os=atheos
+		;;
+	syllable*)
+		os=syllable
+		;;
+	386bsd)
+		os=bsd
+		;;
+	ctix*)
+		os=sysv
+		vendor=`echo "$vendor" | sed -e 's|^unknown$|convergent|'`
+		;;
+	uts*)
+		os=sysv
+		;;
+	nova*)
+		kernel=rtmk
+		os=nova
+		;;
+	# Preserve the version number of sinix5.
+	sinix5.*)
+		os=`echo "$os" | sed -e 's|sinix|sysv|'`
+		vendor=`echo "$vendor" | sed -e 's|^unknown$|sni|'`
+		;;
+	sinix*)
+		os=sysv4
+		vendor=`echo "$vendor" | sed -e 's|^unknown$|sni|'`
+		;;
+	tpf*)
+		os=tpf
+		;;
+	triton*)
+		os=sysv3
+		;;
+	oss*)
+		os=sysv3
+		;;
+	svr4*)
+		os=sysv4
+		;;
+	svr3)
+		os=sysv3
+		;;
+	sysvr4)
+		os=sysv4
+		;;
+	ose*)
+		os=ose
+		;;
+	*mint | mint[0-9]* | *MiNT | MiNT[0-9]*)
+		os=mint
+		;;
+	dicos*)
+		os=dicos
+		;;
+	pikeos*)
+		# Until real need of OS specific support for
+		# particular features comes up, bare metal
+		# configurations are quite functional.
+		case $cpu in
+		    arm*)
+			os=eabi
+			;;
+		    *)
+			os=
+			obj=elf
+			;;
+		esac
+		;;
+	aout* | coff* | elf* | pe*)
+		# These are machine code file formats, not OSes
+		obj=$os
+		os=
+		;;
+	*)
+		# No normalization, but not necessarily accepted, that comes below.
+		;;
+esac
+
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system.  Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+kernel=
+obj=
+case $cpu-$vendor in
+	score-*)
+		os=
+		obj=elf
+		;;
+	spu-*)
+		os=
+		obj=elf
+		;;
+	*-acorn)
+		os=riscix1.2
+		;;
+	arm*-rebel)
+		kernel=linux
+		os=gnu
+		;;
+	arm*-semi)
+		os=
+		obj=aout
+		;;
+	c4x-* | tic4x-*)
+		os=
+		obj=coff
+		;;
+	c8051-*)
+		os=
+		obj=elf
+		;;
+	clipper-intergraph)
+		os=clix
+		;;
+	hexagon-*)
+		os=
+		obj=elf
+		;;
+	tic54x-*)
+		os=
+		obj=coff
+		;;
+	tic55x-*)
+		os=
+		obj=coff
+		;;
+	tic6x-*)
+		os=
+		obj=coff
+		;;
+	# This must come before the *-dec entry.
+	pdp10-*)
+		os=tops20
+		;;
+	pdp11-*)
+		os=none
+		;;
+	*-dec | vax-*)
+		os=ultrix4.2
+		;;
+	m68*-apollo)
+		os=domain
+		;;
+	i386-sun)
+		os=sunos4.0.2
+		;;
+	m68000-sun)
+		os=sunos3
+		;;
+	m68*-cisco)
+		os=
+		obj=aout
+		;;
+	mep-*)
+		os=
+		obj=elf
+		;;
+	# The -sgi and -siemens entries must be before the mips- entry
+	# or we get the wrong os.
+	*-sgi)
+		os=irix
+		;;
+	*-siemens)
+		os=sysv4
+		;;
+	mips*-cisco)
+		os=
+		obj=elf
+		;;
+	mips*-*|nanomips*-*)
+		os=
+		obj=elf
+		;;
+	or32-*)
+		os=
+		obj=coff
+		;;
+	# This must be before the sparc-* entry or we get the wrong os.
+	*-tti)
+		os=sysv3
+		;;
+	sparc-* | *-sun)
+		os=sunos4.1.1
+		;;
+	pru-*)
+		os=
+		obj=elf
+		;;
+	*-be)
+		os=beos
+		;;
+	*-ibm)
+		os=aix
+		;;
+	*-knuth)
+		os=mmixware
+		;;
+	*-wec)
+		os=proelf
+		;;
+	*-winbond)
+		os=proelf
+		;;
+	*-oki)
+		os=proelf
+		;;
+	*-hp)
+		os=hpux
+		;;
+	*-hitachi)
+		os=hiuxwe2
+		;;
+	i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+		os=sysv
+		;;
+	*-cbm)
+		os=amigaos
+		;;
+	*-dg)
+		os=dgux
+		;;
+	*-dolphin)
+		os=sysv3
+		;;
+	m68k-ccur)
+		os=rtu
+		;;
+	m88k-omron*)
+		os=luna
+		;;
+	*-next)
+		os=nextstep
+		;;
+	*-sequent)
+		os=ptx
+		;;
+	*-crds)
+		os=unos
+		;;
+	*-ns)
+		os=genix
+		;;
+	i370-*)
+		os=mvs
+		;;
+	*-gould)
+		os=sysv
+		;;
+	*-highlevel)
+		os=bsd
+		;;
+	*-encore)
+		os=bsd
+		;;
+	*-masscomp)
+		os=rtu
+		;;
+	f30[01]-fujitsu | f700-fujitsu)
+		os=uxpv
+		;;
+	*-rom68k)
+		os=
+		obj=coff
+		;;
+	*-*bug)
+		os=
+		obj=coff
+		;;
+	*-apple)
+		os=macos
+		;;
+	*-atari*)
+		os=mint
+		;;
+	*-wrs)
+		os=vxworks
+		;;
+	*)
+		os=none
+		;;
+esac
+
+fi
+
+# Now, validate our (potentially fixed-up) individual pieces (OS, OBJ).
+
+case $os in
+	# Sometimes we do "kernel-libc", so those need to count as OSes.
+	llvm* | musl* | newlib* | relibc* | uclibc*)
+		;;
+	# Likewise for "kernel-abi"
+	eabi* | gnueabi*)
+		;;
+	# VxWorks passes extra cpu info in the 4th filed.
+	simlinux | simwindows | spe)
+		;;
+	# See `case $cpu-$os` validation below
+	ghcjs)
+		;;
+	# Now accept the basic system types.
+	# Each alternative MUST end in a * to match a version number.
+	  abug \
+	| aix* \
+	| amdhsa* \
+	| amigados* \
+	| amigaos* \
+	| android* \
+	| aof* \
+	| aos* \
+	| aros* \
+	| atheos* \
+	| auroraux* \
+	| aux* \
+	| beos* \
+	| bitrig* \
+	| bme* \
+	| bosx* \
+	| bsd* \
+	| cegcc* \
+	| chorusos* \
+	| chorusrdb* \
+	| clix* \
+	| cloudabi* \
+	| cnk* \
+	| conix* \
+	| cos* \
+	| cxux* \
+	| cygwin* \
+	| darwin* \
+	| dgux* \
+	| dicos* \
+	| dnix* \
+	| domain* \
+	| dragonfly* \
+	| drops* \
+	| ebmon* \
+	| ecoff* \
+	| ekkobsd* \
+	| emscripten* \
+	| emx* \
+	| es* \
+	| fiwix* \
+	| freebsd* \
+	| fuchsia* \
+	| genix* \
+	| genode* \
+	| glidix* \
+	| gnu* \
+	| go32* \
+	| haiku* \
+	| hcos* \
+	| hiux* \
+	| hms* \
+	| hpux* \
+	| ieee* \
+	| interix* \
+	| ios* \
+	| iris* \
+	| irix* \
+	| ironclad* \
+	| isc* \
+	| its* \
+	| l4re* \
+	| libertybsd* \
+	| lites* \
+	| lnews* \
+	| luna* \
+	| lynxos* \
+	| mach* \
+	| macos* \
+	| magic* \
+	| mbr* \
+	| midipix* \
+	| midnightbsd* \
+	| mingw32* \
+	| mingw64* \
+	| minix* \
+	| mint* \
+	| mirbsd* \
+	| mks* \
+	| mlibc* \
+	| mmixware* \
+	| mon960* \
+	| morphos* \
+	| moss* \
+	| moxiebox* \
+	| mpeix* \
+	| mpw* \
+	| msdos* \
+	| msys* \
+	| mvs* \
+	| nacl* \
+	| netbsd* \
+	| netware* \
+	| newsos* \
+	| nextstep* \
+	| nindy* \
+	| nonstopux* \
+	| nova* \
+	| nsk* \
+	| nucleus* \
+	| nx6 \
+	| nx7 \
+	| oabi* \
+	| ohos* \
+	| onefs* \
+	| openbsd* \
+	| openedition* \
+	| openstep* \
+	| os108* \
+	| os2* \
+	| os400* \
+	| os68k* \
+	| os9* \
+	| ose* \
+	| osf* \
+	| oskit* \
+	| osx* \
+	| palmos* \
+	| phoenix* \
+	| plan9* \
+	| powermax* \
+	| powerunix* \
+	| proelf* \
+	| psos* \
+	| psp* \
+	| ptx* \
+	| pw32* \
+	| qnx* \
+	| rdos* \
+	| redox* \
+	| rhapsody* \
+	| riscix* \
+	| riscos* \
+	| rtems* \
+	| rtmk* \
+	| rtu* \
+	| scout* \
+	| secbsd* \
+	| sei* \
+	| serenity* \
+	| sim* \
+	| skyos* \
+	| solaris* \
+	| solidbsd* \
+	| sortix* \
+	| storm-chaos* \
+	| sunos \
+	| sunos[34]* \
+	| superux* \
+	| syllable* \
+	| sym* \
+	| sysv* \
+	| tenex* \
+	| tirtos* \
+	| toppers* \
+	| tops10* \
+	| tops20* \
+	| tpf* \
+	| tvos* \
+	| twizzler* \
+	| uclinux* \
+	| udi* \
+	| udk* \
+	| ultrix* \
+	| unicos* \
+	| uniplus* \
+	| unleashed* \
+	| unos* \
+	| uwin* \
+	| uxpv* \
+	| v88r* \
+	|*vms* \
+	| vos* \
+	| vsta* \
+	| vxsim* \
+	| vxworks* \
+	| wasi* \
+	| watchos* \
+	| wince* \
+	| windiss* \
+	| windows* \
+	| winnt* \
+	| xenix* \
+	| xray* \
+	| zephyr* \
+	| zvmoe* )
+		;;
+	# This one is extra strict with allowed versions
+	sco3.2v2 | sco3.2v[4-9]* | sco5v6*)
+		# Don't forget version if it is 3.2v4 or newer.
+		;;
+	# This refers to builds using the UEFI calling convention
+	# (which depends on the architecture) and PE file format.
+	# Note that this is both a different calling convention and
+	# different file format than that of GNU-EFI
+	# (x86_64-w64-mingw32).
+	uefi)
+		;;
+	none)
+		;;
+	kernel* | msvc* )
+		# Restricted further below
+		;;
+	'')
+		if test x"$obj" = x
+		then
+			echo "Invalid configuration '$1': Blank OS only allowed with explicit machine code file format" 1>&2
+		fi
+		;;
+	*)
+		echo "Invalid configuration '$1': OS '$os' not recognized" 1>&2
+		exit 1
+		;;
+esac
+
+case $obj in
+	aout* | coff* | elf* | pe*)
+		;;
+	'')
+		# empty is fine
+		;;
+	*)
+		echo "Invalid configuration '$1': Machine code format '$obj' not recognized" 1>&2
+		exit 1
+		;;
+esac
+
+# Here we handle the constraint that a (synthetic) cpu and os are
+# valid only in combination with each other and nowhere else.
+case $cpu-$os in
+	# The "javascript-unknown-ghcjs" triple is used by GHC; we
+	# accept it here in order to tolerate that, but reject any
+	# variations.
+	javascript-ghcjs)
+		;;
+	javascript-* | *-ghcjs)
+		echo "Invalid configuration '$1': cpu '$cpu' is not valid with os '$os$obj'" 1>&2
+		exit 1
+		;;
+esac
+
+# As a final step for OS-related things, validate the OS-kernel combination
+# (given a valid OS), if there is a kernel.
+case $kernel-$os-$obj in
+	linux-gnu*- | linux-android*- | linux-dietlibc*- | linux-llvm*- \
+		    | linux-mlibc*- | linux-musl*- | linux-newlib*- \
+		    | linux-relibc*- | linux-uclibc*- | linux-ohos*- )
+		;;
+	uclinux-uclibc*- | uclinux-gnu*- )
+		;;
+	managarm-mlibc*- | managarm-kernel*- )
+		;;
+	windows*-msvc*-)
+		;;
+	-dietlibc*- | -llvm*- | -mlibc*- | -musl*- | -newlib*- | -relibc*- \
+		    | -uclibc*- )
+		# These are just libc implementations, not actual OSes, and thus
+		# require a kernel.
+		echo "Invalid configuration '$1': libc '$os' needs explicit kernel." 1>&2
+		exit 1
+		;;
+	-kernel*- )
+		echo "Invalid configuration '$1': '$os' needs explicit kernel." 1>&2
+		exit 1
+		;;
+	*-kernel*- )
+		echo "Invalid configuration '$1': '$kernel' does not support '$os'." 1>&2
+		exit 1
+		;;
+	*-msvc*- )
+		echo "Invalid configuration '$1': '$os' needs 'windows'." 1>&2
+		exit 1
+		;;
+	kfreebsd*-gnu*- | knetbsd*-gnu*- | netbsd*-gnu*- | kopensolaris*-gnu*-)
+		;;
+	vxworks-simlinux- | vxworks-simwindows- | vxworks-spe-)
+		;;
+	nto-qnx*-)
+		;;
+	os2-emx-)
+		;;
+	rtmk-nova-)
+		;;
+	*-eabi*- | *-gnueabi*-)
+		;;
+	none--*)
+		# None (no kernel, i.e. freestanding / bare metal),
+		# can be paired with an machine code file format
+		;;
+	-*-)
+		# Blank kernel with real OS is always fine.
+		;;
+	--*)
+		# Blank kernel and OS with real machine code file format is always fine.
+		;;
+	*-*-*)
+		echo "Invalid configuration '$1': Kernel '$kernel' not known to work with OS '$os'." 1>&2
+		exit 1
+		;;
+esac
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer.  We pick the logical manufacturer.
+case $vendor in
+	unknown)
+		case $cpu-$os in
+			*-riscix*)
+				vendor=acorn
+				;;
+			*-sunos* | *-solaris*)
+				vendor=sun
+				;;
+			*-cnk* | *-aix*)
+				vendor=ibm
+				;;
+			*-beos*)
+				vendor=be
+				;;
+			*-hpux*)
+				vendor=hp
+				;;
+			*-mpeix*)
+				vendor=hp
+				;;
+			*-hiux*)
+				vendor=hitachi
+				;;
+			*-unos*)
+				vendor=crds
+				;;
+			*-dgux*)
+				vendor=dg
+				;;
+			*-luna*)
+				vendor=omron
+				;;
+			*-genix*)
+				vendor=ns
+				;;
+			*-clix*)
+				vendor=intergraph
+				;;
+			*-mvs* | *-opened*)
+				vendor=ibm
+				;;
+			*-os400*)
+				vendor=ibm
+				;;
+			s390-* | s390x-*)
+				vendor=ibm
+				;;
+			*-ptx*)
+				vendor=sequent
+				;;
+			*-tpf*)
+				vendor=ibm
+				;;
+			*-vxsim* | *-vxworks* | *-windiss*)
+				vendor=wrs
+				;;
+			*-aux*)
+				vendor=apple
+				;;
+			*-hms*)
+				vendor=hitachi
+				;;
+			*-mpw* | *-macos*)
+				vendor=apple
+				;;
+			*-*mint | *-mint[0-9]* | *-*MiNT | *-MiNT[0-9]*)
+				vendor=atari
+				;;
+			*-vos*)
+				vendor=stratus
+				;;
+		esac
+		;;
+esac
+
+echo "$cpu-$vendor${kernel:+-$kernel}${os:+-$os}${obj:+-$obj}"
+exit
+
+# Local variables:
+# eval: (add-hook 'before-save-hook 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/thirdparty/patches/libhdfs3-add-loongarch-support.patch b/thirdparty/patches/libhdfs3-add-loongarch-support.patch
new file mode 100644
index 0000000000..1c6c0cf17e
--- /dev/null
+++ b/thirdparty/patches/libhdfs3-add-loongarch-support.patch
@@ -0,0 +1,61 @@
+From 596eb924f95a6e28abbbdbb9b9c212a0ed418250 Mon Sep 17 00:00:00 2001
+From: huchangqi <huchangqi@loongson.cn>
+Date: Mon, 19 May 2025 15:42:30 +0800
+Subject: [PATCH] libhdfs3 add loongarch support
+
+---
+ src/common/HWCrc32c.cpp | 33 ++++++++++++++++++++++++++++++++-
+ 1 file changed, 32 insertions(+), 1 deletion(-)
+
+diff --git a/src/common/HWCrc32c.cpp b/src/common/HWCrc32c.cpp
+index f61b4b6e10f..62dbb421fe8 100644
+--- a/src/common/HWCrc32c.cpp
++++ b/src/common/HWCrc32c.cpp
+@@ -66,6 +66,35 @@ static inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) {
+ #endif
+ #elif ((defined(__arm__) || defined(__aarch64__))) 
+ #include "sse2neon.h"
++#elif defined(__loongarch_lp64)
++#include <larchintrin.h>
++
++namespace Hdfs {
++namespace Internal {
++
++static inline uint64_t _mm_crc32_u64(uint64_t crc, uint64_t value) {
++    crc = __crc_w_d_w(value, crc);
++    return crc;
++}
++
++static inline uint32_t _mm_crc32_u32(uint32_t crc, uint64_t value) {
++    crc = __crc_w_w_w(value, crc);
++    return crc;
++}
++
++static inline uint32_t _mm_crc32_u16(uint32_t crc, uint16_t value) {
++    crc = __crc_w_h_w(value, crc);
++    return crc;
++}
++
++static inline uint32_t _mm_crc32_u8(uint32_t crc, uint8_t value) {
++    crc = __crc_w_b_w(value, crc);
++    return crc;
++}
++
++}
++}
++
+ #endif
+ 
+ namespace Hdfs {
+@@ -82,6 +111,8 @@ bool HWCrc32c::available() {
+     return (ecx & (1 << 20)) != 0;
+ #elif ((defined(__arm__) || defined(__aarch64__)))
+     return true;
++#elif defined(__loongarch_lp64)
++    return true;
+ #else
+     return false;
+ #endif
+-- 
+2.46.0
+
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index 7bc66c4677..6644b0324e 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -198,10 +198,10 @@ LEVELDB_SOURCE=leveldb-1.23
 LEVELDB_MD5SUM="afbde776fb8760312009963f09a586c7"
 
 # brpc
-BRPC_DOWNLOAD="https://github.com/apache/brpc/archive/refs/tags/1.4.0.tar.gz"
-BRPC_NAME="brpc-1.4.0.tar.gz"
-BRPC_SOURCE="brpc-1.4.0"
-BRPC_MD5SUM="6af9d50822c33a3abc56a1ec0af0e0bc"
+BRPC_DOWNLOAD="https://github.com/apache/brpc/archive/refs/tags/1.12.1.tar.gz"
+BRPC_NAME="brpc-1.12.1.tar.gz"
+BRPC_SOURCE="brpc-1.12.1"
+BRPC_MD5SUM="8bc704bafadc3752edb61eb531fd951c"
 
 # rocksdb
 ROCKSDB_DOWNLOAD="https://github.com/facebook/rocksdb/archive/v5.14.2.tar.gz"
@@ -315,10 +315,11 @@ JEMALLOC_DORIS_SOURCE="jemalloc-5.3.0"
 JEMALLOC_DORIS_MD5SUM="09a8328574dab22a7df848eae6dbbf53"
 
 # libunwind
+# libunwind need use loongarch version
 LIBUNWIND_DOWNLOAD="https://github.com/libunwind/libunwind/releases/download/v1.6.2/libunwind-1.6.2.tar.gz"
 LIBUNWIND_NAME="libunwind-1.6.2.tar.gz"
 LIBUNWIND_SOURCE="libunwind-1.6.2"
-LIBUNWIND_MD5SUM="f625b6a98ac1976116c71708a73dc44a"
+LIBUNWIND_MD5SUM="dfa959c84479e4a10da44c7eda3b83ff"
 
 # cctz
 CCTZ_DOWNLOAD="https://github.com/google/cctz/archive/v2.3.tar.gz"